diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/added_tokens.json b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/config.json b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/config.json new file mode 100644 index 0000000000000000000000000000000000000000..802edf45ec5e5c90ae4d5a897fdeb345ec741ebb --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/config.json @@ -0,0 +1,192 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.005, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 8192, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev6", + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.001, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/generation_config.json b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00001-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6b828a815f6067044296a76393d1559a149bf14 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd12fdbb339d14901cf408ecc51623f5808b7271754cff45dd0c77e5a4f1d950 +size 4972489328 diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00002-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3477cbc15bad01fd0b6095a177e41df9435810e6 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c8e85d4b5fecf332a4efd43ce28054375403c57b0d468875eb01e577fa47665 +size 4985976068 diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00003-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..67c6703cc8c3dc7e0cfb8e02dc4bfc7b3d74722d --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268e231b2aac44c592319cdf3b10f364af0d5a822c69a2b1a1197a13501b82dd +size 248943552 diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model.safetensors.index.json b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7cdc5da041253f30bfca8dad5f6a64a31333d1b4 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/model.safetensors.index.json @@ -0,0 +1,1033 @@ +{ + "metadata": { + "total_size": 10207261884 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/special_tokens_map.json b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/tokenizer.model b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/tokenizer_config.json b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/trainer_state.json b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..815f8a9ee4e2c0e2e3e432cf8844cbd8cacd0789 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/trainer_state.json @@ -0,0 +1,249523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05029151, + "auxiliary_loss_mlp": 0.02214953, + "balance_loss_clip": 2.43632078, + "balance_loss_mlp": 1.76977742, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.35830299524466, + "language_loss": 2.85073018, + "learning_rate": 0.0, + "loss": 1.94530988, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 17.318878173828125 + }, + { + "auxiliary_loss_clip": 0.03379918, + "auxiliary_loss_mlp": 0.01458506, + "balance_loss_clip": 1.62802148, + "balance_loss_mlp": 1.18918824, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 37.8575400949719, + "language_loss": 1.82785273, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87623692, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.7479476928710938 + }, + { + "auxiliary_loss_clip": 0.03319938, + "auxiliary_loss_mlp": 0.01441325, + "balance_loss_clip": 1.62546372, + "balance_loss_mlp": 1.18898261, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 32.96402497703952, + "language_loss": 1.57782006, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62543273, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.5365407466888428 + }, + { + "auxiliary_loss_clip": 0.03362514, + "auxiliary_loss_mlp": 0.01452359, + "balance_loss_clip": 1.62441599, + "balance_loss_mlp": 1.15595698, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.25751765909487, + "language_loss": 1.6771524, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72530115, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.486661434173584 + }, + { + "auxiliary_loss_clip": 0.03401355, + "auxiliary_loss_mlp": 0.0150483, + "balance_loss_clip": 1.6251328, + "balance_loss_mlp": 1.21720243, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.44189025618091, + "language_loss": 1.91194272, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.9610045, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.7184627056121826 + }, + { + "auxiliary_loss_clip": 0.03370972, + "auxiliary_loss_mlp": 0.01517902, + "balance_loss_clip": 1.61567426, + "balance_loss_mlp": 1.2228353, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.8091583547165, + "language_loss": 1.60754454, + "learning_rate": 1.153628246576487e-06, + "loss": 1.65643334, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.7071356773376465 + }, + { + "auxiliary_loss_clip": 0.03354134, + "auxiliary_loss_mlp": 0.0148747, + "balance_loss_clip": 1.61573601, + "balance_loss_mlp": 1.20403814, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 24.64043686571471, + "language_loss": 1.53235507, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58077121, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 2.780254602432251 + }, + { + "auxiliary_loss_clip": 0.03321873, + "auxiliary_loss_mlp": 0.01443716, + "balance_loss_clip": 1.61235714, + "balance_loss_mlp": 1.16600585, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.87516652485001, + "language_loss": 1.43822336, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48587918, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 2.919626474380493 + }, + { + "auxiliary_loss_clip": 0.03370006, + "auxiliary_loss_mlp": 0.01496321, + "balance_loss_clip": 1.61173606, + "balance_loss_mlp": 1.21250725, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 33.57067072578005, + "language_loss": 1.49794102, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54660439, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 2.853858470916748 + }, + { + "auxiliary_loss_clip": 0.03310582, + "auxiliary_loss_mlp": 0.01476964, + "balance_loss_clip": 1.61591125, + "balance_loss_mlp": 1.20783734, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 25.390598353918637, + "language_loss": 1.44701624, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49489164, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.841477632522583 + }, + { + "auxiliary_loss_clip": 0.0336678, + "auxiliary_loss_mlp": 0.01493314, + "balance_loss_clip": 1.62095714, + "balance_loss_mlp": 1.21903729, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.375907864368557, + "language_loss": 1.45379663, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.50239766, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 2.8475635051727295 + }, + { + "auxiliary_loss_clip": 0.03295074, + "auxiliary_loss_mlp": 0.01449957, + "balance_loss_clip": 1.60801077, + "balance_loss_mlp": 1.17320049, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.63687972438663, + "language_loss": 1.45120096, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49865127, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.7507269382476807 + }, + { + "auxiliary_loss_clip": 0.03322717, + "auxiliary_loss_mlp": 0.01405066, + "balance_loss_clip": 1.61711848, + "balance_loss_mlp": 1.14642906, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 12.862075846423894, + "language_loss": 1.24319816, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.29047585, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 2.829045534133911 + }, + { + "auxiliary_loss_clip": 0.03289876, + "auxiliary_loss_mlp": 0.01470217, + "balance_loss_clip": 1.61317921, + "balance_loss_mlp": 1.2016623, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.990546033869611, + "language_loss": 1.2084738, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25607479, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 2.7510697841644287 + }, + { + "auxiliary_loss_clip": 0.03277076, + "auxiliary_loss_mlp": 0.01431361, + "balance_loss_clip": 1.61832547, + "balance_loss_mlp": 1.16814661, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 7.0225559343988495, + "language_loss": 1.12868559, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.17576993, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 2.929739475250244 + }, + { + "auxiliary_loss_clip": 0.03241542, + "auxiliary_loss_mlp": 0.01413533, + "balance_loss_clip": 1.60285878, + "balance_loss_mlp": 1.16328871, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 6.229118444474942, + "language_loss": 1.11279762, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15934837, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 2.8228559494018555 + }, + { + "auxiliary_loss_clip": 0.03229268, + "auxiliary_loss_mlp": 0.01418813, + "balance_loss_clip": 1.60921359, + "balance_loss_mlp": 1.1779151, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.445706508689182, + "language_loss": 1.12924266, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17572343, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 5.9547436237335205 + }, + { + "auxiliary_loss_clip": 0.03164279, + "auxiliary_loss_mlp": 0.01378826, + "balance_loss_clip": 1.60703158, + "balance_loss_mlp": 1.1472739, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.816996781256858, + "language_loss": 1.08032155, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12575257, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 4.357151985168457 + }, + { + "auxiliary_loss_clip": 0.0319249, + "auxiliary_loss_mlp": 0.0140173, + "balance_loss_clip": 1.60626125, + "balance_loss_mlp": 1.13641798, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.657107151162177, + "language_loss": 1.02274203, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06868422, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 2.836827516555786 + }, + { + "auxiliary_loss_clip": 0.03134765, + "auxiliary_loss_mlp": 0.01343792, + "balance_loss_clip": 1.60767603, + "balance_loss_mlp": 1.12263465, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 3.938576481912362, + "language_loss": 1.16706395, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21184945, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 2.771637201309204 + }, + { + "auxiliary_loss_clip": 0.03121388, + "auxiliary_loss_mlp": 0.01380831, + "balance_loss_clip": 1.58878088, + "balance_loss_mlp": 1.13154101, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 4.51715016456875, + "language_loss": 1.0606215, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.10564375, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.7825608253479004 + }, + { + "auxiliary_loss_clip": 0.030142, + "auxiliary_loss_mlp": 0.01379599, + "balance_loss_clip": 1.56970656, + "balance_loss_mlp": 1.14652145, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.4726477422507758, + "language_loss": 1.05827451, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10221255, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.8826630115509033 + }, + { + "auxiliary_loss_clip": 0.02966446, + "auxiliary_loss_mlp": 0.01336422, + "balance_loss_clip": 1.5716157, + "balance_loss_mlp": 1.12613702, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 3.480086580110654, + "language_loss": 0.9183234, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96135211, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 2.882730007171631 + }, + { + "auxiliary_loss_clip": 0.0293564, + "auxiliary_loss_mlp": 0.01361274, + "balance_loss_clip": 1.56393147, + "balance_loss_mlp": 1.14116585, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.5881239274134145, + "language_loss": 1.08182538, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12479448, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 2.791328191757202 + }, + { + "auxiliary_loss_clip": 0.02825628, + "auxiliary_loss_mlp": 0.01329383, + "balance_loss_clip": 1.55683672, + "balance_loss_mlp": 1.11890733, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.7152624569182207, + "language_loss": 1.01416683, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05571699, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 2.831124782562256 + }, + { + "auxiliary_loss_clip": 0.02820202, + "auxiliary_loss_mlp": 0.01310653, + "balance_loss_clip": 1.55958223, + "balance_loss_mlp": 1.10179877, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.833388712087326, + "language_loss": 1.0669049, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10821366, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 2.8187031745910645 + }, + { + "auxiliary_loss_clip": 0.02762055, + "auxiliary_loss_mlp": 0.0132583, + "balance_loss_clip": 1.54996896, + "balance_loss_mlp": 1.12593949, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 2.6763653273195525, + "language_loss": 0.95519817, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99607706, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 2.961591958999634 + }, + { + "auxiliary_loss_clip": 0.02736317, + "auxiliary_loss_mlp": 0.01314185, + "balance_loss_clip": 1.55380476, + "balance_loss_mlp": 1.13346386, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 5.1417577944278126, + "language_loss": 1.06413603, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.10464108, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.7764153480529785 + }, + { + "auxiliary_loss_clip": 0.02701686, + "auxiliary_loss_mlp": 0.01317949, + "balance_loss_clip": 1.5384115, + "balance_loss_mlp": 1.13198209, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.3601334899343116, + "language_loss": 1.02524543, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06544185, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 2.792874574661255 + }, + { + "auxiliary_loss_clip": 0.02694699, + "auxiliary_loss_mlp": 0.01311301, + "balance_loss_clip": 1.53506279, + "balance_loss_mlp": 1.12571609, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.5888872418529694, + "language_loss": 1.19167686, + "learning_rate": 2.189868360711334e-06, + "loss": 1.23173678, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 2.7243568897247314 + }, + { + "auxiliary_loss_clip": 0.02612929, + "auxiliary_loss_mlp": 0.01339652, + "balance_loss_clip": 1.5211128, + "balance_loss_mlp": 1.15902638, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 2.81038150639946, + "language_loss": 1.02331305, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06283891, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 2.826751470565796 + }, + { + "auxiliary_loss_clip": 0.02583269, + "auxiliary_loss_mlp": 0.01331079, + "balance_loss_clip": 1.52231908, + "balance_loss_mlp": 1.15236008, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 5.003146756066283, + "language_loss": 0.95635653, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99549997, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.824202537536621 + }, + { + "auxiliary_loss_clip": 0.02565834, + "auxiliary_loss_mlp": 0.01302931, + "balance_loss_clip": 1.51770532, + "balance_loss_mlp": 1.1364193, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 5.143217920774505, + "language_loss": 0.95262307, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99131072, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.749882459640503 + }, + { + "auxiliary_loss_clip": 0.02421034, + "auxiliary_loss_mlp": 0.01303886, + "balance_loss_clip": 1.48436666, + "balance_loss_mlp": 1.14700663, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 3.5008728347011386, + "language_loss": 0.91540098, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95265019, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.7577292919158936 + }, + { + "auxiliary_loss_clip": 0.02376771, + "auxiliary_loss_mlp": 0.01273165, + "balance_loss_clip": 1.44979906, + "balance_loss_mlp": 1.11847961, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 5.374203161391714, + "language_loss": 0.76655591, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80305529, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 3.0075721740722656 + }, + { + "auxiliary_loss_clip": 0.02350138, + "auxiliary_loss_mlp": 0.01274356, + "balance_loss_clip": 1.46302319, + "balance_loss_mlp": 1.13035107, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.7461012272146412, + "language_loss": 0.88938475, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92562973, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.7921204566955566 + }, + { + "auxiliary_loss_clip": 0.02293693, + "auxiliary_loss_mlp": 0.01338804, + "balance_loss_clip": 1.44974256, + "balance_loss_mlp": 1.1916517, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.437902392408772, + "language_loss": 0.93115032, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96747524, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 2.806931972503662 + }, + { + "auxiliary_loss_clip": 0.02255013, + "auxiliary_loss_mlp": 0.01278764, + "balance_loss_clip": 1.44485974, + "balance_loss_mlp": 1.15650284, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.133804757880331, + "language_loss": 1.04124117, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07657886, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.7816731929779053 + }, + { + "auxiliary_loss_clip": 0.02224816, + "auxiliary_loss_mlp": 0.01258644, + "balance_loss_clip": 1.43830991, + "balance_loss_mlp": 1.13447535, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.1230188099401683, + "language_loss": 0.85691607, + "learning_rate": 2.358792165262154e-06, + "loss": 0.89175069, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 2.833686590194702 + }, + { + "auxiliary_loss_clip": 0.02198963, + "auxiliary_loss_mlp": 0.01249981, + "balance_loss_clip": 1.42813993, + "balance_loss_mlp": 1.12113929, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.814492040176496, + "language_loss": 0.90620697, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.94069642, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 2.878185987472534 + }, + { + "auxiliary_loss_clip": 0.02150038, + "auxiliary_loss_mlp": 0.01273896, + "balance_loss_clip": 1.41863322, + "balance_loss_mlp": 1.16002703, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 2.526063021970412, + "language_loss": 0.93445015, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.9686895, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.7372782230377197 + }, + { + "auxiliary_loss_clip": 0.0211377, + "auxiliary_loss_mlp": 0.01255344, + "balance_loss_clip": 1.41025496, + "balance_loss_mlp": 1.15110755, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 2.2578143603098177, + "language_loss": 0.97613764, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00982869, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.78126859664917 + }, + { + "auxiliary_loss_clip": 0.02073232, + "auxiliary_loss_mlp": 0.01299466, + "balance_loss_clip": 1.41097188, + "balance_loss_mlp": 1.192559, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.6062066393781342, + "language_loss": 0.97619784, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00992477, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.8118743896484375 + }, + { + "auxiliary_loss_clip": 0.02092642, + "auxiliary_loss_mlp": 0.01313159, + "balance_loss_clip": 1.4091506, + "balance_loss_mlp": 1.20062578, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.009350759841868, + "language_loss": 0.93719983, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97125781, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.7457404136657715 + }, + { + "auxiliary_loss_clip": 0.02052956, + "auxiliary_loss_mlp": 0.01268471, + "balance_loss_clip": 1.40605867, + "balance_loss_mlp": 1.17048049, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 4.745099487992204, + "language_loss": 0.98813123, + "learning_rate": 2.450927955901469e-06, + "loss": 1.0213455, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.8648271560668945 + }, + { + "auxiliary_loss_clip": 0.02027839, + "auxiliary_loss_mlp": 0.01224291, + "balance_loss_clip": 1.3918829, + "balance_loss_mlp": 1.13736379, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8592952073963076, + "language_loss": 1.0271697, + "learning_rate": 2.465079122983384e-06, + "loss": 1.05969095, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.7965316772460938 + }, + { + "auxiliary_loss_clip": 0.01991679, + "auxiliary_loss_mlp": 0.01267476, + "balance_loss_clip": 1.38235199, + "balance_loss_mlp": 1.17711568, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.0888853801783718, + "language_loss": 0.88167959, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91427118, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 2.916123628616333 + }, + { + "auxiliary_loss_clip": 0.01952947, + "auxiliary_loss_mlp": 0.0125035, + "balance_loss_clip": 1.3718102, + "balance_loss_mlp": 1.16432869, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 13.527485854715882, + "language_loss": 0.87921274, + "learning_rate": 2.492481223656015e-06, + "loss": 0.9112457, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 2.731175422668457 + }, + { + "auxiliary_loss_clip": 0.01953242, + "auxiliary_loss_mlp": 0.01236666, + "balance_loss_clip": 1.36205912, + "balance_loss_mlp": 1.14563787, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.4069952643885095, + "language_loss": 0.897385, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.92928416, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.839404344558716 + }, + { + "auxiliary_loss_clip": 0.01944412, + "auxiliary_loss_mlp": 0.01228465, + "balance_loss_clip": 1.35555327, + "balance_loss_mlp": 1.14420795, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.7471441431967607, + "language_loss": 0.90910065, + "learning_rate": 2.51876455396287e-06, + "loss": 0.9408294, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.7316267490386963 + }, + { + "auxiliary_loss_clip": 0.01942248, + "auxiliary_loss_mlp": 0.01193139, + "balance_loss_clip": 1.36072338, + "balance_loss_mlp": 1.11231554, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 4.998125092279102, + "language_loss": 0.86902332, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90037721, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.8773951530456543 + }, + { + "auxiliary_loss_clip": 0.01898729, + "auxiliary_loss_mlp": 0.01204098, + "balance_loss_clip": 1.35063279, + "balance_loss_mlp": 1.12551486, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.2186000966199746, + "language_loss": 0.95343286, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98446119, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 2.8712332248687744 + }, + { + "auxiliary_loss_clip": 0.0189583, + "auxiliary_loss_mlp": 0.01239401, + "balance_loss_clip": 1.34721875, + "balance_loss_mlp": 1.16024601, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 2.0134530248051297, + "language_loss": 0.92222869, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95358098, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 2.8396379947662354 + }, + { + "auxiliary_loss_clip": 0.0188482, + "auxiliary_loss_mlp": 0.01193634, + "balance_loss_clip": 1.35095167, + "balance_loss_mlp": 1.1131916, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.999423200292947, + "language_loss": 0.82660162, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85738611, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 2.7012131214141846 + }, + { + "auxiliary_loss_clip": 0.01880563, + "auxiliary_loss_mlp": 0.01206078, + "balance_loss_clip": 1.33883548, + "balance_loss_mlp": 1.12759078, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.3464389832324266, + "language_loss": 0.81291401, + "learning_rate": 2.580130221340046e-06, + "loss": 0.8437804, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 2.8101465702056885 + }, + { + "auxiliary_loss_clip": 0.01870613, + "auxiliary_loss_mlp": 0.01200149, + "balance_loss_clip": 1.33343279, + "balance_loss_mlp": 1.12142289, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 3.1001875504813765, + "language_loss": 0.87078232, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90148997, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 2.6986641883850098 + }, + { + "auxiliary_loss_clip": 0.01870118, + "auxiliary_loss_mlp": 0.01160931, + "balance_loss_clip": 1.32570791, + "balance_loss_mlp": 1.08749855, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 1.7918407341121303, + "language_loss": 0.92737663, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95768714, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 2.751530408859253 + }, + { + "auxiliary_loss_clip": 0.01833976, + "auxiliary_loss_mlp": 0.01212311, + "balance_loss_clip": 1.32999766, + "balance_loss_mlp": 1.14021301, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.5114799105236054, + "language_loss": 0.99806839, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02853119, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 5.795925855636597 + }, + { + "auxiliary_loss_clip": 0.01816034, + "auxiliary_loss_mlp": 0.01193748, + "balance_loss_clip": 1.31632841, + "balance_loss_mlp": 1.12322342, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 3.330168206116289, + "language_loss": 0.88325584, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91335362, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.7023532390594482 + }, + { + "auxiliary_loss_clip": 0.01837535, + "auxiliary_loss_mlp": 0.01159742, + "balance_loss_clip": 1.32512736, + "balance_loss_mlp": 1.0874536, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.0502100469384095, + "language_loss": 0.9357177, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96569049, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 4.214938163757324 + }, + { + "auxiliary_loss_clip": 0.01816243, + "auxiliary_loss_mlp": 0.0117186, + "balance_loss_clip": 1.30902767, + "balance_loss_mlp": 1.10395825, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 2.642031834529037, + "language_loss": 0.89858866, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.92846972, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.722295045852661 + }, + { + "auxiliary_loss_clip": 0.01799619, + "auxiliary_loss_mlp": 0.01140989, + "balance_loss_clip": 1.30610287, + "balance_loss_mlp": 1.07413614, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 2.081395699913946, + "language_loss": 0.88535726, + "learning_rate": 2.657264485425803e-06, + "loss": 0.91476333, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.7226181030273438 + }, + { + "auxiliary_loss_clip": 0.01781994, + "auxiliary_loss_mlp": 0.0116088, + "balance_loss_clip": 1.29724872, + "balance_loss_mlp": 1.09145296, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.2459413343808934, + "language_loss": 0.96047366, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.98990244, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.7053561210632324 + }, + { + "auxiliary_loss_clip": 0.01790318, + "auxiliary_loss_mlp": 0.01168733, + "balance_loss_clip": 1.30483007, + "balance_loss_mlp": 1.10211849, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 3.401537171152515, + "language_loss": 0.98793828, + "learning_rate": 2.677705954159056e-06, + "loss": 1.01752877, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.7738535404205322 + }, + { + "auxiliary_loss_clip": 0.0179659, + "auxiliary_loss_mlp": 0.01147214, + "balance_loss_clip": 1.30360353, + "balance_loss_mlp": 1.07959807, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.0594332535833106, + "language_loss": 0.852714, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88215202, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.7213141918182373 + }, + { + "auxiliary_loss_clip": 0.0177372, + "auxiliary_loss_mlp": 0.01153027, + "balance_loss_clip": 1.29018199, + "balance_loss_mlp": 1.08574557, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.4224756166061985, + "language_loss": 0.8527143, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88198179, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.7229156494140625 + }, + { + "auxiliary_loss_clip": 0.01775394, + "auxiliary_loss_mlp": 0.01153474, + "balance_loss_clip": 1.28792357, + "balance_loss_mlp": 1.07837248, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.4272459018382535, + "language_loss": 0.9665969, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99588561, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.6782984733581543 + }, + { + "auxiliary_loss_clip": 0.01748788, + "auxiliary_loss_mlp": 0.01153559, + "balance_loss_clip": 1.28228796, + "balance_loss_mlp": 1.08122325, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.161970615365575, + "language_loss": 0.9459132, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97493672, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.703951358795166 + }, + { + "auxiliary_loss_clip": 0.01743292, + "auxiliary_loss_mlp": 0.01154696, + "balance_loss_clip": 1.27981842, + "balance_loss_mlp": 1.08512545, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.058925622216738, + "language_loss": 0.95731151, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98629141, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.758229970932007 + }, + { + "auxiliary_loss_clip": 0.01736847, + "auxiliary_loss_mlp": 0.01159389, + "balance_loss_clip": 1.28298688, + "balance_loss_mlp": 1.09434795, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.3428812712459783, + "language_loss": 0.97972, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00868237, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.7233221530914307 + }, + { + "auxiliary_loss_clip": 0.01741202, + "auxiliary_loss_mlp": 0.011453, + "balance_loss_clip": 1.2718792, + "balance_loss_mlp": 1.07620609, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.69329757076113, + "language_loss": 0.94060946, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96947438, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.7573745250701904 + }, + { + "auxiliary_loss_clip": 0.01813114, + "auxiliary_loss_mlp": 0.01291869, + "balance_loss_clip": 1.42705297, + "balance_loss_mlp": 1.25295913, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.412992742264751, + "language_loss": 0.65748668, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68853652, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.252178192138672 + }, + { + "auxiliary_loss_clip": 0.01795852, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 1.41905904, + "balance_loss_mlp": 1.23148084, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.325182915911008, + "language_loss": 0.63762957, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66829389, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.2374298572540283 + }, + { + "auxiliary_loss_clip": 0.0171552, + "auxiliary_loss_mlp": 0.01144793, + "balance_loss_clip": 1.2630229, + "balance_loss_mlp": 1.07670021, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 4.460895577171822, + "language_loss": 0.85925388, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88785696, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.6862800121307373 + }, + { + "auxiliary_loss_clip": 0.01718989, + "auxiliary_loss_mlp": 0.01161148, + "balance_loss_clip": 1.26288867, + "balance_loss_mlp": 1.09200704, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 1.9901922048226213, + "language_loss": 0.97006446, + "learning_rate": 2.779824149153005e-06, + "loss": 0.99886584, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.7084622383117676 + }, + { + "auxiliary_loss_clip": 0.01696683, + "auxiliary_loss_mlp": 0.01142517, + "balance_loss_clip": 1.25842071, + "balance_loss_mlp": 1.07614136, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.046634102206223, + "language_loss": 0.87649709, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90488911, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.711571455001831 + }, + { + "auxiliary_loss_clip": 0.01698434, + "auxiliary_loss_mlp": 0.0114696, + "balance_loss_clip": 1.25674641, + "balance_loss_mlp": 1.07729411, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 2.830877280673488, + "language_loss": 0.91853619, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94699013, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 2.7681658267974854 + }, + { + "auxiliary_loss_clip": 0.01689025, + "auxiliary_loss_mlp": 0.01165414, + "balance_loss_clip": 1.25836182, + "balance_loss_mlp": 1.09422207, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 4.278964236322828, + "language_loss": 0.92313254, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95167696, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.7235851287841797 + }, + { + "auxiliary_loss_clip": 0.01685377, + "auxiliary_loss_mlp": 0.01154824, + "balance_loss_clip": 1.25289679, + "balance_loss_mlp": 1.08577776, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.746404034487634, + "language_loss": 0.8263917, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85479367, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 2.7845823764801025 + }, + { + "auxiliary_loss_clip": 0.01698993, + "auxiliary_loss_mlp": 0.01132172, + "balance_loss_clip": 1.25614524, + "balance_loss_mlp": 1.06236327, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 21.59185654838779, + "language_loss": 0.91175926, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94007087, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.720216751098633 + }, + { + "auxiliary_loss_clip": 0.0166845, + "auxiliary_loss_mlp": 0.01141794, + "balance_loss_clip": 1.24923432, + "balance_loss_mlp": 1.07107949, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.2221943037679384, + "language_loss": 0.95205116, + "learning_rate": 2.829375683533245e-06, + "loss": 0.98015368, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.7619292736053467 + }, + { + "auxiliary_loss_clip": 0.01684091, + "auxiliary_loss_mlp": 0.01148394, + "balance_loss_clip": 1.25388551, + "balance_loss_mlp": 1.08144593, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.6472649544958227, + "language_loss": 0.96208733, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99041224, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.7066733837127686 + }, + { + "auxiliary_loss_clip": 0.01667539, + "auxiliary_loss_mlp": 0.01156692, + "balance_loss_clip": 1.24211359, + "balance_loss_mlp": 1.0865016, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.1236126154448693, + "language_loss": 0.86649197, + "learning_rate": 2.84508017388607e-06, + "loss": 0.89473426, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.759330987930298 + }, + { + "auxiliary_loss_clip": 0.01661905, + "auxiliary_loss_mlp": 0.01156352, + "balance_loss_clip": 1.24492204, + "balance_loss_mlp": 1.08592331, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 3.1901761590455084, + "language_loss": 0.91703284, + "learning_rate": 2.852791070641559e-06, + "loss": 0.9452154, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.7494819164276123 + }, + { + "auxiliary_loss_clip": 0.016342, + "auxiliary_loss_mlp": 0.01180875, + "balance_loss_clip": 1.33693552, + "balance_loss_mlp": 1.14329994, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.3990810846797548, + "language_loss": 0.62527388, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65342462, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.2106058597564697 + }, + { + "auxiliary_loss_clip": 0.01649679, + "auxiliary_loss_mlp": 0.01127969, + "balance_loss_clip": 1.23494554, + "balance_loss_mlp": 1.05692053, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.6661269260869276, + "language_loss": 0.90890115, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93667763, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.734661340713501 + }, + { + "auxiliary_loss_clip": 0.01653501, + "auxiliary_loss_mlp": 0.01160662, + "balance_loss_clip": 1.24101019, + "balance_loss_mlp": 1.08880234, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.715989047232065, + "language_loss": 0.81896996, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84711164, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.694636344909668 + }, + { + "auxiliary_loss_clip": 0.01643799, + "auxiliary_loss_mlp": 0.01152261, + "balance_loss_clip": 1.23962426, + "balance_loss_mlp": 1.08321524, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.9634116472198258, + "language_loss": 0.9575094, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98546994, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.6497671604156494 + }, + { + "auxiliary_loss_clip": 0.01659575, + "auxiliary_loss_mlp": 0.01147354, + "balance_loss_clip": 1.23907709, + "balance_loss_mlp": 1.0797385, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 2.623902539868128, + "language_loss": 0.86201751, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.89008677, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 2.7299280166625977 + }, + { + "auxiliary_loss_clip": 0.01647836, + "auxiliary_loss_mlp": 0.01133685, + "balance_loss_clip": 1.23347902, + "balance_loss_mlp": 1.06683195, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 2.255733383159924, + "language_loss": 0.9160555, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94387072, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.7227258682250977 + }, + { + "auxiliary_loss_clip": 0.01631598, + "auxiliary_loss_mlp": 0.01136127, + "balance_loss_clip": 1.23060417, + "balance_loss_mlp": 1.06793904, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 3.9943340398826463, + "language_loss": 0.86038035, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88805759, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 2.6612589359283447 + }, + { + "auxiliary_loss_clip": 0.01629738, + "auxiliary_loss_mlp": 0.01139658, + "balance_loss_clip": 1.22618914, + "balance_loss_mlp": 1.07304406, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.556840977199713, + "language_loss": 0.86904109, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89673507, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 2.6819419860839844 + }, + { + "auxiliary_loss_clip": 0.01621179, + "auxiliary_loss_mlp": 0.01178554, + "balance_loss_clip": 1.21747684, + "balance_loss_mlp": 1.11050963, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 3.6090454266214205, + "language_loss": 0.92056775, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94856501, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 2.643653392791748 + }, + { + "auxiliary_loss_clip": 0.01632485, + "auxiliary_loss_mlp": 0.01152046, + "balance_loss_clip": 1.22344351, + "balance_loss_mlp": 1.08061588, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 12.78199249124389, + "language_loss": 0.87436771, + "learning_rate": 2.925210265866963e-06, + "loss": 0.9022131, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 2.677804708480835 + }, + { + "auxiliary_loss_clip": 0.01553501, + "auxiliary_loss_mlp": 0.01051774, + "balance_loss_clip": 1.29532385, + "balance_loss_mlp": 1.0147717, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3595182911439596, + "language_loss": 0.68065202, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70670474, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 3.0859627723693848 + }, + { + "auxiliary_loss_clip": 0.01610259, + "auxiliary_loss_mlp": 0.01149794, + "balance_loss_clip": 1.21177101, + "balance_loss_mlp": 1.08222628, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 5.866142265576208, + "language_loss": 0.90316153, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.93076205, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 2.6953649520874023 + }, + { + "auxiliary_loss_clip": 0.01604774, + "auxiliary_loss_mlp": 0.01140033, + "balance_loss_clip": 1.21553731, + "balance_loss_mlp": 1.07732856, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.25411866278016, + "language_loss": 0.89859742, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92604548, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 2.661208152770996 + }, + { + "auxiliary_loss_clip": 0.01591815, + "auxiliary_loss_mlp": 0.01135464, + "balance_loss_clip": 1.20797372, + "balance_loss_mlp": 1.06694269, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 3.1682876711951193, + "language_loss": 0.76548707, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79275984, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 2.760876178741455 + }, + { + "auxiliary_loss_clip": 0.01527626, + "auxiliary_loss_mlp": 0.01045127, + "balance_loss_clip": 1.27998877, + "balance_loss_mlp": 1.00945973, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0407993943850893, + "language_loss": 0.65428632, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68001384, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 6.29551362991333 + }, + { + "auxiliary_loss_clip": 0.0159449, + "auxiliary_loss_mlp": 0.01142421, + "balance_loss_clip": 1.20757198, + "balance_loss_mlp": 1.07137251, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 3.5895426581303376, + "language_loss": 0.90948039, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93684953, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 4.30577826499939 + }, + { + "auxiliary_loss_clip": 0.01604951, + "auxiliary_loss_mlp": 0.0114569, + "balance_loss_clip": 1.21036506, + "balance_loss_mlp": 1.07893324, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 2.3228143675755915, + "language_loss": 0.91046202, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93796843, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.7139055728912354 + }, + { + "auxiliary_loss_clip": 0.01592015, + "auxiliary_loss_mlp": 0.01151351, + "balance_loss_clip": 1.20972264, + "balance_loss_mlp": 1.08058786, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 3.8167389608096496, + "language_loss": 0.91050112, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93793476, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.757636785507202 + }, + { + "auxiliary_loss_clip": 0.01586875, + "auxiliary_loss_mlp": 0.01133002, + "balance_loss_clip": 1.20695567, + "balance_loss_mlp": 1.068295, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 7.110607765216997, + "language_loss": 0.88062668, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90782547, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.7207190990448 + }, + { + "auxiliary_loss_clip": 0.01587423, + "auxiliary_loss_mlp": 0.01134066, + "balance_loss_clip": 1.208812, + "balance_loss_mlp": 1.06711793, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9832919337016384, + "language_loss": 0.93880153, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96601641, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.6858506202697754 + }, + { + "auxiliary_loss_clip": 0.01579808, + "auxiliary_loss_mlp": 0.01144983, + "balance_loss_clip": 1.19952738, + "balance_loss_mlp": 1.08099151, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 3.7504757558722743, + "language_loss": 0.96695012, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99419796, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.822537660598755 + }, + { + "auxiliary_loss_clip": 0.01578534, + "auxiliary_loss_mlp": 0.01157566, + "balance_loss_clip": 1.20003963, + "balance_loss_mlp": 1.08575404, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 3.54528072277259, + "language_loss": 0.87085938, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89822042, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.73551607131958 + }, + { + "auxiliary_loss_clip": 0.01562028, + "auxiliary_loss_mlp": 0.01141166, + "balance_loss_clip": 1.18635273, + "balance_loss_mlp": 1.07307386, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.0507299624547843, + "language_loss": 0.83385372, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86088562, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.729881525039673 + }, + { + "auxiliary_loss_clip": 0.01463456, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.24146938, + "balance_loss_mlp": 1.00620961, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9820437908719741, + "language_loss": 0.64794934, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67297977, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.232052803039551 + }, + { + "auxiliary_loss_clip": 0.01556149, + "auxiliary_loss_mlp": 0.01133848, + "balance_loss_clip": 1.18940067, + "balance_loss_mlp": 1.06451559, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 9.872700304782038, + "language_loss": 0.97843218, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00533211, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.6572000980377197 + }, + { + "auxiliary_loss_clip": 0.01554066, + "auxiliary_loss_mlp": 0.01141322, + "balance_loss_clip": 1.19069302, + "balance_loss_mlp": 1.07456446, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.8705153713862521, + "language_loss": 0.84372711, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.87068099, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 2.726294994354248 + }, + { + "auxiliary_loss_clip": 0.0154825, + "auxiliary_loss_mlp": 0.0115304, + "balance_loss_clip": 1.18389344, + "balance_loss_mlp": 1.08623493, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.116324656612536, + "language_loss": 0.83194518, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85895807, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.7055106163024902 + }, + { + "auxiliary_loss_clip": 0.0155026, + "auxiliary_loss_mlp": 0.01134232, + "balance_loss_clip": 1.1881237, + "balance_loss_mlp": 1.07000208, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 2.96897358918407, + "language_loss": 0.94129241, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96813738, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.737684965133667 + }, + { + "auxiliary_loss_clip": 0.01552824, + "auxiliary_loss_mlp": 0.01139832, + "balance_loss_clip": 1.18405807, + "balance_loss_mlp": 1.07507741, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4088244469556197, + "language_loss": 0.79635513, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.82328165, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.63466477394104 + }, + { + "auxiliary_loss_clip": 0.01541394, + "auxiliary_loss_mlp": 0.01134356, + "balance_loss_clip": 1.18076265, + "balance_loss_mlp": 1.06783676, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.0204000712795227, + "language_loss": 0.93667525, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.96343267, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.686927080154419 + }, + { + "auxiliary_loss_clip": 0.01547233, + "auxiliary_loss_mlp": 0.01135388, + "balance_loss_clip": 1.18005466, + "balance_loss_mlp": 1.07535422, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 1.9035563840758372, + "language_loss": 0.94747281, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97429907, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 2.6715986728668213 + }, + { + "auxiliary_loss_clip": 0.01543058, + "auxiliary_loss_mlp": 0.01119711, + "balance_loss_clip": 1.17755747, + "balance_loss_mlp": 1.05266714, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 7.708472724002857, + "language_loss": 0.8181901, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84481776, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.7534308433532715 + }, + { + "auxiliary_loss_clip": 0.01534575, + "auxiliary_loss_mlp": 0.01126016, + "balance_loss_clip": 1.1786685, + "balance_loss_mlp": 1.06116652, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 6.066092803574981, + "language_loss": 0.88273799, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90934384, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.7525694370269775 + }, + { + "auxiliary_loss_clip": 0.01533119, + "auxiliary_loss_mlp": 0.01149742, + "balance_loss_clip": 1.17586756, + "balance_loss_mlp": 1.08384299, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 3.2810424323342033, + "language_loss": 0.85000515, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87683368, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.7554128170013428 + }, + { + "auxiliary_loss_clip": 0.01542549, + "auxiliary_loss_mlp": 0.011619, + "balance_loss_clip": 1.17723155, + "balance_loss_mlp": 1.09647775, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.7311622176699157, + "language_loss": 0.99291664, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.01996112, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.6890408992767334 + }, + { + "auxiliary_loss_clip": 0.01545718, + "auxiliary_loss_mlp": 0.01121202, + "balance_loss_clip": 1.17390323, + "balance_loss_mlp": 1.05802131, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 4.023308715509795, + "language_loss": 0.89773571, + "learning_rate": 3.082437012097686e-06, + "loss": 0.92440486, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.661738872528076 + }, + { + "auxiliary_loss_clip": 0.01531206, + "auxiliary_loss_mlp": 0.01130051, + "balance_loss_clip": 1.17431331, + "balance_loss_mlp": 1.06539226, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.7488036667223381, + "language_loss": 0.93461585, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96122843, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.748983144760132 + }, + { + "auxiliary_loss_clip": 0.0153162, + "auxiliary_loss_mlp": 0.01145912, + "balance_loss_clip": 1.17511559, + "balance_loss_mlp": 1.08211112, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 2.1674761798503597, + "language_loss": 0.90223503, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92901039, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.6344873905181885 + }, + { + "auxiliary_loss_clip": 0.01526982, + "auxiliary_loss_mlp": 0.01145916, + "balance_loss_clip": 1.1694051, + "balance_loss_mlp": 1.07763267, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.570087643424709, + "language_loss": 0.92590976, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.9526388, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.7490177154541016 + }, + { + "auxiliary_loss_clip": 0.01523442, + "auxiliary_loss_mlp": 0.01130267, + "balance_loss_clip": 1.1632936, + "balance_loss_mlp": 1.0678494, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 2.9317659056514063, + "language_loss": 0.7110498, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73758686, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.7409894466400146 + }, + { + "auxiliary_loss_clip": 0.01513271, + "auxiliary_loss_mlp": 0.01121704, + "balance_loss_clip": 1.16572666, + "balance_loss_mlp": 1.05785561, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.3098262234557234, + "language_loss": 0.88608146, + "learning_rate": 3.108720342404542e-06, + "loss": 0.91243118, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 2.7234652042388916 + }, + { + "auxiliary_loss_clip": 0.015266, + "auxiliary_loss_mlp": 0.01139191, + "balance_loss_clip": 1.16515195, + "balance_loss_mlp": 1.07548523, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.6029705872887425, + "language_loss": 0.82317984, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84983778, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 2.6285464763641357 + }, + { + "auxiliary_loss_clip": 0.01520251, + "auxiliary_loss_mlp": 0.01143008, + "balance_loss_clip": 1.16394949, + "balance_loss_mlp": 1.08054233, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 2.5879860440273807, + "language_loss": 0.67603779, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.70267034, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 2.6036179065704346 + }, + { + "auxiliary_loss_clip": 0.01509706, + "auxiliary_loss_mlp": 0.01130807, + "balance_loss_clip": 1.16669798, + "balance_loss_mlp": 1.06652963, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 2.0024589941844995, + "language_loss": 0.88302016, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90942532, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 2.6726181507110596 + }, + { + "auxiliary_loss_clip": 0.01511034, + "auxiliary_loss_mlp": 0.01142485, + "balance_loss_clip": 1.16118026, + "balance_loss_mlp": 1.078017, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 2.887590025630805, + "language_loss": 0.84746134, + "learning_rate": 3.129000827968184e-06, + "loss": 0.8739965, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 2.5992074012756348 + }, + { + "auxiliary_loss_clip": 0.01504826, + "auxiliary_loss_mlp": 0.0113096, + "balance_loss_clip": 1.16009474, + "balance_loss_mlp": 1.06696892, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.464251114441402, + "language_loss": 0.97495961, + "learning_rate": 3.133972684206866e-06, + "loss": 1.0013175, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 2.6703367233276367 + }, + { + "auxiliary_loss_clip": 0.01498134, + "auxiliary_loss_mlp": 0.01134151, + "balance_loss_clip": 1.15685821, + "balance_loss_mlp": 1.06930161, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.7878179184713576, + "language_loss": 0.82869339, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85501623, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 2.7163424491882324 + }, + { + "auxiliary_loss_clip": 0.01507881, + "auxiliary_loss_mlp": 0.01128266, + "balance_loss_clip": 1.15946639, + "balance_loss_mlp": 1.06618178, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.8803531687007986, + "language_loss": 0.82730567, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85366726, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 2.701465606689453 + }, + { + "auxiliary_loss_clip": 0.01497946, + "auxiliary_loss_mlp": 0.01130299, + "balance_loss_clip": 1.15389729, + "balance_loss_mlp": 1.06764233, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.1526861187450113, + "language_loss": 0.95866299, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98494542, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 2.6026113033294678 + }, + { + "auxiliary_loss_clip": 0.01495069, + "auxiliary_loss_mlp": 0.0112602, + "balance_loss_clip": 1.16309452, + "balance_loss_mlp": 1.0657481, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.9508460692581875, + "language_loss": 0.7354719, + "learning_rate": 3.153484849651286e-06, + "loss": 0.76168281, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 2.7648215293884277 + }, + { + "auxiliary_loss_clip": 0.01491149, + "auxiliary_loss_mlp": 0.01131447, + "balance_loss_clip": 1.15136409, + "balance_loss_mlp": 1.06616807, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 4.201119334581421, + "language_loss": 0.88930738, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91553342, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.5957541465759277 + }, + { + "auxiliary_loss_clip": 0.01494667, + "auxiliary_loss_mlp": 0.01133953, + "balance_loss_clip": 1.15596747, + "balance_loss_mlp": 1.06838787, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.210231195623175, + "language_loss": 0.89505237, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.92133862, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 2.5728707313537598 + }, + { + "auxiliary_loss_clip": 0.01491804, + "auxiliary_loss_mlp": 0.01109698, + "balance_loss_clip": 1.15059924, + "balance_loss_mlp": 1.04952133, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.1714467325040157, + "language_loss": 0.84340513, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86942017, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 2.603395938873291 + }, + { + "auxiliary_loss_clip": 0.01487325, + "auxiliary_loss_mlp": 0.01117945, + "balance_loss_clip": 1.14968061, + "balance_loss_mlp": 1.05767322, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 2.353670378837216, + "language_loss": 0.90320814, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92926085, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 2.6727101802825928 + }, + { + "auxiliary_loss_clip": 0.01476805, + "auxiliary_loss_mlp": 0.01121468, + "balance_loss_clip": 1.14883113, + "balance_loss_mlp": 1.05642712, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.6133429520109663, + "language_loss": 0.91604447, + "learning_rate": 3.177071816289865e-06, + "loss": 0.94202721, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 4.230572462081909 + }, + { + "auxiliary_loss_clip": 0.01494248, + "auxiliary_loss_mlp": 0.01127172, + "balance_loss_clip": 1.15585947, + "balance_loss_mlp": 1.06384778, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.8466834607793134, + "language_loss": 0.85776305, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88397729, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 4.342786073684692 + }, + { + "auxiliary_loss_clip": 0.01479776, + "auxiliary_loss_mlp": 0.01122723, + "balance_loss_clip": 1.14891601, + "balance_loss_mlp": 1.06078196, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 3.5255370767929213, + "language_loss": 0.84629786, + "learning_rate": 3.186269861057098e-06, + "loss": 0.87232292, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 5.600740671157837 + }, + { + "auxiliary_loss_clip": 0.01485435, + "auxiliary_loss_mlp": 0.01136441, + "balance_loss_clip": 1.14748764, + "balance_loss_mlp": 1.07373691, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.370617278904379, + "language_loss": 0.81319523, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.839414, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.59100079536438 + }, + { + "auxiliary_loss_clip": 0.01383991, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.1855669, + "balance_loss_mlp": 1.00909221, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0462853645527281, + "language_loss": 0.66891098, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69314504, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.2658181190490723 + }, + { + "auxiliary_loss_clip": 0.01476071, + "auxiliary_loss_mlp": 0.01125257, + "balance_loss_clip": 1.14696026, + "balance_loss_mlp": 1.0635066, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 3.875038831149435, + "language_loss": 0.84385717, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86987048, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.6014492511749268 + }, + { + "auxiliary_loss_clip": 0.01463068, + "auxiliary_loss_mlp": 0.01115577, + "balance_loss_clip": 1.14117146, + "balance_loss_mlp": 1.0506314, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 4.59093744579834, + "language_loss": 0.88678539, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91257185, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.6591742038726807 + }, + { + "auxiliary_loss_clip": 0.01480088, + "auxiliary_loss_mlp": 0.01127442, + "balance_loss_clip": 1.14311039, + "balance_loss_mlp": 1.0632118, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 2.2565974723989473, + "language_loss": 0.86105263, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88712788, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.625270366668701 + }, + { + "auxiliary_loss_clip": 0.01363776, + "auxiliary_loss_mlp": 0.01038593, + "balance_loss_clip": 1.17515695, + "balance_loss_mlp": 1.00998259, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8611156576665863, + "language_loss": 0.60016787, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62419152, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.192308187484741 + }, + { + "auxiliary_loss_clip": 0.01471431, + "auxiliary_loss_mlp": 0.01127236, + "balance_loss_clip": 1.14709139, + "balance_loss_mlp": 1.06829894, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 2.033567671807296, + "language_loss": 0.84869176, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87467843, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.633937358856201 + }, + { + "auxiliary_loss_clip": 0.01476861, + "auxiliary_loss_mlp": 0.01150074, + "balance_loss_clip": 1.14905071, + "balance_loss_mlp": 1.08388877, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.151047001118478, + "language_loss": 0.88861454, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91488385, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.5979673862457275 + }, + { + "auxiliary_loss_clip": 0.01472307, + "auxiliary_loss_mlp": 0.01113228, + "balance_loss_clip": 1.14372325, + "balance_loss_mlp": 1.0539577, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.403483658710035, + "language_loss": 0.93211913, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95797443, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.7256174087524414 + }, + { + "auxiliary_loss_clip": 0.01460103, + "auxiliary_loss_mlp": 0.01113949, + "balance_loss_clip": 1.13830423, + "balance_loss_mlp": 1.05634761, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 2.212126848722709, + "language_loss": 0.74426126, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.77000177, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.711724042892456 + }, + { + "auxiliary_loss_clip": 0.01472699, + "auxiliary_loss_mlp": 0.01133506, + "balance_loss_clip": 1.1440562, + "balance_loss_mlp": 1.07347202, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.803832401759754, + "language_loss": 0.88487482, + "learning_rate": 3.234636443010188e-06, + "loss": 0.91093689, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.6300430297851562 + }, + { + "auxiliary_loss_clip": 0.01471598, + "auxiliary_loss_mlp": 0.01124674, + "balance_loss_clip": 1.14823687, + "balance_loss_mlp": 1.06373429, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.7645925952864756, + "language_loss": 0.84312946, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86909223, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.6207783222198486 + }, + { + "auxiliary_loss_clip": 0.01464099, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_clip": 1.14128387, + "balance_loss_mlp": 1.07685483, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9300731393520965, + "language_loss": 0.90006936, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92609972, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.648571252822876 + }, + { + "auxiliary_loss_clip": 0.01465944, + "auxiliary_loss_mlp": 0.01155046, + "balance_loss_clip": 1.1432879, + "balance_loss_mlp": 1.09522653, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.200811269119682, + "language_loss": 0.89893824, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92514819, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.647969961166382 + }, + { + "auxiliary_loss_clip": 0.01469847, + "auxiliary_loss_mlp": 0.0111956, + "balance_loss_clip": 1.14000535, + "balance_loss_mlp": 1.06057501, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 2.8542898988479326, + "language_loss": 0.86644053, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89233458, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.620116949081421 + }, + { + "auxiliary_loss_clip": 0.01468414, + "auxiliary_loss_mlp": 0.0112358, + "balance_loss_clip": 1.14530158, + "balance_loss_mlp": 1.06435668, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.5438100529120535, + "language_loss": 0.9976933, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02361321, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.5593183040618896 + }, + { + "auxiliary_loss_clip": 0.01452363, + "auxiliary_loss_mlp": 0.01143127, + "balance_loss_clip": 1.1392014, + "balance_loss_mlp": 1.08333218, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.261015750052864, + "language_loss": 0.88574237, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91169727, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.659562587738037 + }, + { + "auxiliary_loss_clip": 0.01459443, + "auxiliary_loss_mlp": 0.01123623, + "balance_loss_clip": 1.13909388, + "balance_loss_mlp": 1.06358898, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.7664154106318493, + "language_loss": 0.86681962, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89265037, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.5552921295166016 + }, + { + "auxiliary_loss_clip": 0.01447731, + "auxiliary_loss_mlp": 0.01138308, + "balance_loss_clip": 1.13326049, + "balance_loss_mlp": 1.07736874, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.938197447769381, + "language_loss": 0.86929882, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89515913, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.638495683670044 + }, + { + "auxiliary_loss_clip": 0.01455997, + "auxiliary_loss_mlp": 0.0112286, + "balance_loss_clip": 1.13960755, + "balance_loss_mlp": 1.06659293, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.2939086296391373, + "language_loss": 0.91707361, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94286215, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.6033341884613037 + }, + { + "auxiliary_loss_clip": 0.01460061, + "auxiliary_loss_mlp": 0.01111663, + "balance_loss_clip": 1.13877141, + "balance_loss_mlp": 1.05477667, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 1.9078789351518843, + "language_loss": 0.91724592, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94296312, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.6500167846679688 + }, + { + "auxiliary_loss_clip": 0.01320473, + "auxiliary_loss_mlp": 0.01068193, + "balance_loss_clip": 1.14984798, + "balance_loss_mlp": 1.04416037, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.2888977435641602, + "language_loss": 0.72321737, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74710405, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.119203567504883 + }, + { + "auxiliary_loss_clip": 0.01446062, + "auxiliary_loss_mlp": 0.0111835, + "balance_loss_clip": 1.13728356, + "balance_loss_mlp": 1.06108165, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.5800867782856556, + "language_loss": 0.84361261, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86925673, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.707993745803833 + }, + { + "auxiliary_loss_clip": 0.01435882, + "auxiliary_loss_mlp": 0.01103906, + "balance_loss_clip": 1.12802362, + "balance_loss_mlp": 1.04754412, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.010312107982353, + "language_loss": 0.89248973, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91788769, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.622227191925049 + }, + { + "auxiliary_loss_clip": 0.01439973, + "auxiliary_loss_mlp": 0.01134763, + "balance_loss_clip": 1.12958741, + "balance_loss_mlp": 1.07377529, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 1.9998240016473217, + "language_loss": 0.7994417, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82518905, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 2.669440746307373 + }, + { + "auxiliary_loss_clip": 0.01443634, + "auxiliary_loss_mlp": 0.01131106, + "balance_loss_clip": 1.13278174, + "balance_loss_mlp": 1.07078648, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.558841003776382, + "language_loss": 0.91863072, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94437808, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 2.6845414638519287 + }, + { + "auxiliary_loss_clip": 0.01434565, + "auxiliary_loss_mlp": 0.01138137, + "balance_loss_clip": 1.13042879, + "balance_loss_mlp": 1.08110726, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 2.396983118655951, + "language_loss": 0.90739059, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93311763, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 2.5508968830108643 + }, + { + "auxiliary_loss_clip": 0.01429544, + "auxiliary_loss_mlp": 0.01110375, + "balance_loss_clip": 1.12684011, + "balance_loss_mlp": 1.05129492, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.5992335973274163, + "language_loss": 0.87126637, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89666557, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 2.7013816833496094 + }, + { + "auxiliary_loss_clip": 0.01429335, + "auxiliary_loss_mlp": 0.01117555, + "balance_loss_clip": 1.12762427, + "balance_loss_mlp": 1.05947649, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 1.7019638915171318, + "language_loss": 0.84749556, + "learning_rate": 3.306695037731344e-06, + "loss": 0.8729645, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 2.560199499130249 + }, + { + "auxiliary_loss_clip": 0.01439166, + "auxiliary_loss_mlp": 0.01138434, + "balance_loss_clip": 1.12822008, + "balance_loss_mlp": 1.07973528, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.1240756620175114, + "language_loss": 0.90026176, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92603773, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 2.619224786758423 + }, + { + "auxiliary_loss_clip": 0.01433249, + "auxiliary_loss_mlp": 0.01113937, + "balance_loss_clip": 1.1311717, + "balance_loss_mlp": 1.05850494, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 2.4173915813384768, + "language_loss": 0.8911109, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91658282, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 2.5637710094451904 + }, + { + "auxiliary_loss_clip": 0.01421072, + "auxiliary_loss_mlp": 0.01125495, + "balance_loss_clip": 1.12345695, + "balance_loss_mlp": 1.06894243, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 2.332717372230567, + "language_loss": 0.81096029, + "learning_rate": 3.317958045350308e-06, + "loss": 0.8364259, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 2.661184549331665 + }, + { + "auxiliary_loss_clip": 0.01434098, + "auxiliary_loss_mlp": 0.01109973, + "balance_loss_clip": 1.12797141, + "balance_loss_mlp": 1.05644822, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 1.9168930942366074, + "language_loss": 0.82726431, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.852705, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 2.6062891483306885 + }, + { + "auxiliary_loss_clip": 0.01426884, + "auxiliary_loss_mlp": 0.01127354, + "balance_loss_clip": 1.12474203, + "balance_loss_mlp": 1.0710398, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 3.503063709876092, + "language_loss": 0.72888958, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75443196, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 2.6294448375701904 + }, + { + "auxiliary_loss_clip": 0.01429289, + "auxiliary_loss_mlp": 0.01136691, + "balance_loss_clip": 1.12561464, + "balance_loss_mlp": 1.07813525, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 3.2539969892441083, + "language_loss": 0.97954404, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00520384, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.7316343784332275 + }, + { + "auxiliary_loss_clip": 0.01413809, + "auxiliary_loss_mlp": 0.01135053, + "balance_loss_clip": 1.1221453, + "balance_loss_mlp": 1.08052635, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 4.287220674996523, + "language_loss": 0.76933682, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79482543, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 2.6373603343963623 + }, + { + "auxiliary_loss_clip": 0.01425691, + "auxiliary_loss_mlp": 0.01109348, + "balance_loss_clip": 1.12181044, + "balance_loss_mlp": 1.05348647, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.613741253452931, + "language_loss": 0.77066743, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79601783, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 2.6013035774230957 + }, + { + "auxiliary_loss_clip": 0.01431902, + "auxiliary_loss_mlp": 0.01112532, + "balance_loss_clip": 1.12589598, + "balance_loss_mlp": 1.05454874, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.7593671818771037, + "language_loss": 0.8416329, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86707723, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.5888009071350098 + }, + { + "auxiliary_loss_clip": 0.01421623, + "auxiliary_loss_mlp": 0.01115626, + "balance_loss_clip": 1.11971736, + "balance_loss_mlp": 1.05754757, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 2.2907131341148292, + "language_loss": 0.83691031, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86228281, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 4.188909530639648 + }, + { + "auxiliary_loss_clip": 0.0142316, + "auxiliary_loss_mlp": 0.01121644, + "balance_loss_clip": 1.12319314, + "balance_loss_mlp": 1.06513882, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 6.559188474758526, + "language_loss": 0.77869451, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80414253, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 4.119690656661987 + }, + { + "auxiliary_loss_clip": 0.01417074, + "auxiliary_loss_mlp": 0.01133775, + "balance_loss_clip": 1.11847425, + "balance_loss_mlp": 1.07777083, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.0741868824761367, + "language_loss": 0.76426423, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78977269, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 4.102848529815674 + }, + { + "auxiliary_loss_clip": 0.01414093, + "auxiliary_loss_mlp": 0.01126399, + "balance_loss_clip": 1.11817455, + "balance_loss_mlp": 1.06956029, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.6910282366366385, + "language_loss": 0.87865138, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.90405631, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.633543014526367 + }, + { + "auxiliary_loss_clip": 0.01417985, + "auxiliary_loss_mlp": 0.01115679, + "balance_loss_clip": 1.12153053, + "balance_loss_mlp": 1.06389403, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 3.4820427999638652, + "language_loss": 0.86996645, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89530301, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.628582715988159 + }, + { + "auxiliary_loss_clip": 0.01413465, + "auxiliary_loss_mlp": 0.01110453, + "balance_loss_clip": 1.12198806, + "balance_loss_mlp": 1.05382895, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8156714795061841, + "language_loss": 0.83834517, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.8635844, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.597715377807617 + }, + { + "auxiliary_loss_clip": 0.01422181, + "auxiliary_loss_mlp": 0.01112041, + "balance_loss_clip": 1.12016952, + "balance_loss_mlp": 1.05305576, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 3.1583171499426035, + "language_loss": 0.71168923, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73703146, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.6721343994140625 + }, + { + "auxiliary_loss_clip": 0.01417402, + "auxiliary_loss_mlp": 0.01108583, + "balance_loss_clip": 1.1187005, + "balance_loss_mlp": 1.05586886, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 2.194803892017248, + "language_loss": 1.02214789, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04740787, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.6501662731170654 + }, + { + "auxiliary_loss_clip": 0.01407875, + "auxiliary_loss_mlp": 0.01113765, + "balance_loss_clip": 1.11890554, + "balance_loss_mlp": 1.05718803, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.7938224824757587, + "language_loss": 0.75044274, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77565914, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.8282363414764404 + }, + { + "auxiliary_loss_clip": 0.01298211, + "auxiliary_loss_mlp": 0.01064778, + "balance_loss_clip": 1.13792992, + "balance_loss_mlp": 1.0422709, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7641539259587415, + "language_loss": 0.56191349, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58554339, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.2304632663726807 + }, + { + "auxiliary_loss_clip": 0.01404456, + "auxiliary_loss_mlp": 0.01126584, + "balance_loss_clip": 1.11421943, + "balance_loss_mlp": 1.07026911, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.7283952044264885, + "language_loss": 0.95063663, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97594702, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.6158199310302734 + }, + { + "auxiliary_loss_clip": 0.01405017, + "auxiliary_loss_mlp": 0.01104642, + "balance_loss_clip": 1.11530209, + "balance_loss_mlp": 1.0512358, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 3.6514844924959573, + "language_loss": 0.84682012, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.87191671, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.5993337631225586 + }, + { + "auxiliary_loss_clip": 0.01403192, + "auxiliary_loss_mlp": 0.01120394, + "balance_loss_clip": 1.11161685, + "balance_loss_mlp": 1.06725025, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 7.779959893027988, + "language_loss": 0.91878557, + "learning_rate": 3.385049875042367e-06, + "loss": 0.9440214, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.664801597595215 + }, + { + "auxiliary_loss_clip": 0.01398422, + "auxiliary_loss_mlp": 0.01124881, + "balance_loss_clip": 1.11203647, + "balance_loss_mlp": 1.06680262, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 6.0356000806828165, + "language_loss": 0.86986542, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89509845, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.6294500827789307 + }, + { + "auxiliary_loss_clip": 0.0140218, + "auxiliary_loss_mlp": 0.01103686, + "balance_loss_clip": 1.1106894, + "balance_loss_mlp": 1.05194902, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.4902688513882927, + "language_loss": 0.92014492, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94520354, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.626101016998291 + }, + { + "auxiliary_loss_clip": 0.01406929, + "auxiliary_loss_mlp": 0.01115018, + "balance_loss_clip": 1.11512506, + "balance_loss_mlp": 1.06197011, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 2.8071618490887515, + "language_loss": 0.90197563, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92719513, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.636544704437256 + }, + { + "auxiliary_loss_clip": 0.01400456, + "auxiliary_loss_mlp": 0.01116743, + "balance_loss_clip": 1.11426735, + "balance_loss_mlp": 1.06143034, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 4.713758286264593, + "language_loss": 0.86052138, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88569343, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.5651981830596924 + }, + { + "auxiliary_loss_clip": 0.01397792, + "auxiliary_loss_mlp": 0.0111085, + "balance_loss_clip": 1.11070657, + "balance_loss_mlp": 1.05620432, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.5855041156376943, + "language_loss": 0.93568641, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.96077287, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.582828998565674 + }, + { + "auxiliary_loss_clip": 0.01396688, + "auxiliary_loss_mlp": 0.01117811, + "balance_loss_clip": 1.11225033, + "balance_loss_mlp": 1.06364214, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.24646501336351, + "language_loss": 0.79067314, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81581807, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.667124032974243 + }, + { + "auxiliary_loss_clip": 0.01394439, + "auxiliary_loss_mlp": 0.01124909, + "balance_loss_clip": 1.11421347, + "balance_loss_mlp": 1.07126462, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 1.9373105360840475, + "language_loss": 0.88283324, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90802675, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.5799190998077393 + }, + { + "auxiliary_loss_clip": 0.01405383, + "auxiliary_loss_mlp": 0.01132812, + "balance_loss_clip": 1.11634719, + "balance_loss_mlp": 1.07482886, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.9679314438802176, + "language_loss": 0.81448138, + "learning_rate": 3.411333205349222e-06, + "loss": 0.8398633, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.6355738639831543 + }, + { + "auxiliary_loss_clip": 0.01402798, + "auxiliary_loss_mlp": 0.01108793, + "balance_loss_clip": 1.11269498, + "balance_loss_mlp": 1.05452871, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.129359209318031, + "language_loss": 0.87683272, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90194863, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.540936231613159 + }, + { + "auxiliary_loss_clip": 0.01401426, + "auxiliary_loss_mlp": 0.0111474, + "balance_loss_clip": 1.11486673, + "balance_loss_mlp": 1.0591172, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 1.9171102818700223, + "language_loss": 0.84193122, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86709291, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.585872173309326 + }, + { + "auxiliary_loss_clip": 0.01388679, + "auxiliary_loss_mlp": 0.01114149, + "balance_loss_clip": 1.10616279, + "balance_loss_mlp": 1.06002784, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.840411412398398, + "language_loss": 0.89861548, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92364377, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.6183383464813232 + }, + { + "auxiliary_loss_clip": 0.01270686, + "auxiliary_loss_mlp": 0.01081932, + "balance_loss_clip": 1.11604619, + "balance_loss_mlp": 1.06190443, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0214253213516586, + "language_loss": 0.61186433, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63539052, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 3.0356922149658203 + }, + { + "auxiliary_loss_clip": 0.01399062, + "auxiliary_loss_mlp": 0.01111482, + "balance_loss_clip": 1.11034465, + "balance_loss_mlp": 1.05779064, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.9314138273840062, + "language_loss": 0.91336095, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93846637, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 2.54370379447937 + }, + { + "auxiliary_loss_clip": 0.01406672, + "auxiliary_loss_mlp": 0.01124015, + "balance_loss_clip": 1.11548996, + "balance_loss_mlp": 1.06870198, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.7483632749528537, + "language_loss": 0.89549762, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.9208045, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 2.6122395992279053 + }, + { + "auxiliary_loss_clip": 0.01397252, + "auxiliary_loss_mlp": 0.01108245, + "balance_loss_clip": 1.10900736, + "balance_loss_mlp": 1.05526805, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.9849436744817677, + "language_loss": 0.95620656, + "learning_rate": 3.43348263905683e-06, + "loss": 0.98126155, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 2.6059184074401855 + }, + { + "auxiliary_loss_clip": 0.01396642, + "auxiliary_loss_mlp": 0.01123095, + "balance_loss_clip": 1.11442471, + "balance_loss_mlp": 1.06830621, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 1.7643366434280674, + "language_loss": 0.75801396, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78321135, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 2.5859811305999756 + }, + { + "auxiliary_loss_clip": 0.01385936, + "auxiliary_loss_mlp": 0.0111779, + "balance_loss_clip": 1.10945344, + "balance_loss_mlp": 1.06285834, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 15.64706863612978, + "language_loss": 0.98765773, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01269495, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.663562536239624 + }, + { + "auxiliary_loss_clip": 0.01391174, + "auxiliary_loss_mlp": 0.01123304, + "balance_loss_clip": 1.10961556, + "balance_loss_mlp": 1.06820548, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 3.162943234489302, + "language_loss": 0.85276121, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87790596, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.767760753631592 + }, + { + "auxiliary_loss_clip": 0.01389208, + "auxiliary_loss_mlp": 0.0113322, + "balance_loss_clip": 1.10861814, + "balance_loss_mlp": 1.0809114, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.9757518144907693, + "language_loss": 0.97162986, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99685419, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 2.633619785308838 + }, + { + "auxiliary_loss_clip": 0.01397386, + "auxiliary_loss_mlp": 0.01128989, + "balance_loss_clip": 1.11328447, + "balance_loss_mlp": 1.07334161, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 8.077546353765417, + "language_loss": 0.95337689, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97864068, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 2.7045111656188965 + }, + { + "auxiliary_loss_clip": 0.01386541, + "auxiliary_loss_mlp": 0.01123377, + "balance_loss_clip": 1.1111635, + "balance_loss_mlp": 1.06916094, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 1.739279046381434, + "language_loss": 0.76314104, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78824025, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 2.838383674621582 + }, + { + "auxiliary_loss_clip": 0.01390658, + "auxiliary_loss_mlp": 0.01118115, + "balance_loss_clip": 1.1084013, + "balance_loss_mlp": 1.06473315, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 2.8311013673807572, + "language_loss": 0.86190534, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.88699311, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 2.553703546524048 + }, + { + "auxiliary_loss_clip": 0.01389652, + "auxiliary_loss_mlp": 0.0113763, + "balance_loss_clip": 1.11582017, + "balance_loss_mlp": 1.08074331, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 5.925791793177649, + "language_loss": 0.77647197, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80174482, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 2.6762819290161133 + }, + { + "auxiliary_loss_clip": 0.01399409, + "auxiliary_loss_mlp": 0.01122695, + "balance_loss_clip": 1.11475897, + "balance_loss_mlp": 1.06950355, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.44027950900775, + "language_loss": 0.90577227, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93099332, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.654634714126587 + }, + { + "auxiliary_loss_clip": 0.01390677, + "auxiliary_loss_mlp": 0.01120027, + "balance_loss_clip": 1.1080699, + "balance_loss_mlp": 1.0660733, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 4.439602930878437, + "language_loss": 0.93738383, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96249086, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 2.567896842956543 + }, + { + "auxiliary_loss_clip": 0.01384187, + "auxiliary_loss_mlp": 0.01112167, + "balance_loss_clip": 1.10714638, + "balance_loss_mlp": 1.05601978, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 4.2422538310591005, + "language_loss": 0.9395777, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.9645412, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 2.606668710708618 + }, + { + "auxiliary_loss_clip": 0.01379747, + "auxiliary_loss_mlp": 0.01108747, + "balance_loss_clip": 1.10587597, + "balance_loss_mlp": 1.05581772, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 2.6264537379510124, + "language_loss": 0.8624112, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88729614, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 2.7235164642333984 + }, + { + "auxiliary_loss_clip": 0.01386677, + "auxiliary_loss_mlp": 0.01108185, + "balance_loss_clip": 1.11005116, + "balance_loss_mlp": 1.05539966, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 2.200208270394203, + "language_loss": 0.87726569, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90221435, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 2.649165391921997 + }, + { + "auxiliary_loss_clip": 0.01377299, + "auxiliary_loss_mlp": 0.01117426, + "balance_loss_clip": 1.1038661, + "balance_loss_mlp": 1.06785846, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 2.1033248571806102, + "language_loss": 0.86475182, + "learning_rate": 3.475618842282164e-06, + "loss": 0.8896991, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 4.038396120071411 + }, + { + "auxiliary_loss_clip": 0.01381584, + "auxiliary_loss_mlp": 0.01118106, + "balance_loss_clip": 1.10302341, + "balance_loss_mlp": 1.06481969, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.0776353711003908, + "language_loss": 0.92467713, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94967407, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 7.1508119106292725 + }, + { + "auxiliary_loss_clip": 0.01376938, + "auxiliary_loss_mlp": 0.011107, + "balance_loss_clip": 1.10540926, + "balance_loss_mlp": 1.05471897, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.786010801195951, + "language_loss": 0.96146542, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98634183, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 2.583733081817627 + }, + { + "auxiliary_loss_clip": 0.01379123, + "auxiliary_loss_mlp": 0.01111214, + "balance_loss_clip": 1.10529304, + "balance_loss_mlp": 1.05957305, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.6470703844374923, + "language_loss": 0.88312805, + "learning_rate": 3.484300126837776e-06, + "loss": 0.9080314, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.5626776218414307 + }, + { + "auxiliary_loss_clip": 0.01378557, + "auxiliary_loss_mlp": 0.0111046, + "balance_loss_clip": 1.10443246, + "balance_loss_mlp": 1.05414557, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 3.1637239518689264, + "language_loss": 0.89344442, + "learning_rate": 3.487168070036317e-06, + "loss": 0.91833454, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.6065399646759033 + }, + { + "auxiliary_loss_clip": 0.01374853, + "auxiliary_loss_mlp": 0.01126944, + "balance_loss_clip": 1.10426176, + "balance_loss_mlp": 1.07210803, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 5.345883404545831, + "language_loss": 0.99090821, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01592612, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.562645435333252 + }, + { + "auxiliary_loss_clip": 0.01378538, + "auxiliary_loss_mlp": 0.01117547, + "balance_loss_clip": 1.10550284, + "balance_loss_mlp": 1.0621388, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 2.363585422033777, + "language_loss": 0.91278428, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93774515, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.597832202911377 + }, + { + "auxiliary_loss_clip": 0.01251314, + "auxiliary_loss_mlp": 0.01057802, + "balance_loss_clip": 1.10567904, + "balance_loss_mlp": 1.03825176, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9394670113477557, + "language_loss": 0.57613301, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59922415, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.192260265350342 + }, + { + "auxiliary_loss_clip": 0.01369665, + "auxiliary_loss_mlp": 0.0111844, + "balance_loss_clip": 1.1026752, + "balance_loss_mlp": 1.0679667, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.3412242728662136, + "language_loss": 0.87611294, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90099406, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.6750972270965576 + }, + { + "auxiliary_loss_clip": 0.01377196, + "auxiliary_loss_mlp": 0.01105499, + "balance_loss_clip": 1.10274243, + "balance_loss_mlp": 1.05402493, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.442050094749775, + "language_loss": 0.84105831, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86588526, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.578756093978882 + }, + { + "auxiliary_loss_clip": 0.01376466, + "auxiliary_loss_mlp": 0.01125996, + "balance_loss_clip": 1.10436201, + "balance_loss_mlp": 1.07554662, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.8582416290966708, + "language_loss": 0.90494239, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92996705, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.5776290893554688 + }, + { + "auxiliary_loss_clip": 0.01379261, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_clip": 1.10734487, + "balance_loss_mlp": 1.06493735, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.3504231347296094, + "language_loss": 0.83633339, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86128402, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.5836117267608643 + }, + { + "auxiliary_loss_clip": 0.01378874, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_clip": 1.09834552, + "balance_loss_mlp": 1.04401135, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 3.6560023344578365, + "language_loss": 0.73967606, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76442087, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.6398606300354004 + }, + { + "auxiliary_loss_clip": 0.01384089, + "auxiliary_loss_mlp": 0.01131112, + "balance_loss_clip": 1.10586643, + "balance_loss_mlp": 1.07799232, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.7793892400527755, + "language_loss": 0.85797465, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88312662, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.5683116912841797 + }, + { + "auxiliary_loss_clip": 0.0137665, + "auxiliary_loss_mlp": 0.01118638, + "balance_loss_clip": 1.10586143, + "balance_loss_mlp": 1.06873715, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.362422278460194, + "language_loss": 0.89162028, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91657317, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.641057252883911 + }, + { + "auxiliary_loss_clip": 0.01376412, + "auxiliary_loss_mlp": 0.01135853, + "balance_loss_clip": 1.1076076, + "balance_loss_mlp": 1.08521259, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 2.3507609450307396, + "language_loss": 0.85532123, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88044387, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.6380436420440674 + }, + { + "auxiliary_loss_clip": 0.01371028, + "auxiliary_loss_mlp": 0.01108503, + "balance_loss_clip": 1.09947443, + "balance_loss_mlp": 1.05705214, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.156750816958058, + "language_loss": 0.82477492, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84957021, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.734506368637085 + }, + { + "auxiliary_loss_clip": 0.01370199, + "auxiliary_loss_mlp": 0.01126106, + "balance_loss_clip": 1.10197115, + "balance_loss_mlp": 1.07322454, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 5.582611305255762, + "language_loss": 0.77283251, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79779559, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.567065715789795 + }, + { + "auxiliary_loss_clip": 0.01367108, + "auxiliary_loss_mlp": 0.01116885, + "balance_loss_clip": 1.10474992, + "balance_loss_mlp": 1.0696547, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 2.3306341134791397, + "language_loss": 0.87207961, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89691949, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.6649396419525146 + }, + { + "auxiliary_loss_clip": 0.0135271, + "auxiliary_loss_mlp": 0.01112882, + "balance_loss_clip": 1.09611249, + "balance_loss_mlp": 1.06450713, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 6.170950107999519, + "language_loss": 0.92942977, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95408571, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.6169345378875732 + }, + { + "auxiliary_loss_clip": 0.01361693, + "auxiliary_loss_mlp": 0.01110152, + "balance_loss_clip": 1.10032916, + "balance_loss_mlp": 1.06244457, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 3.4093690745305887, + "language_loss": 0.85043412, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87515259, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.7204782962799072 + }, + { + "auxiliary_loss_clip": 0.0136025, + "auxiliary_loss_mlp": 0.01126092, + "balance_loss_clip": 1.10579205, + "balance_loss_mlp": 1.0754993, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 2.283639164842516, + "language_loss": 0.88811612, + "learning_rate": 3.534064540103573e-06, + "loss": 0.91297948, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.5893006324768066 + }, + { + "auxiliary_loss_clip": 0.0136018, + "auxiliary_loss_mlp": 0.01111295, + "balance_loss_clip": 1.09967124, + "balance_loss_mlp": 1.05991602, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 3.7461014512517594, + "language_loss": 0.87054825, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89526296, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 2.6158931255340576 + }, + { + "auxiliary_loss_clip": 0.01366995, + "auxiliary_loss_mlp": 0.01110255, + "balance_loss_clip": 1.10307074, + "balance_loss_mlp": 1.05904305, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.7913694359252337, + "language_loss": 0.84294534, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86771786, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 2.605095386505127 + }, + { + "auxiliary_loss_clip": 0.01372834, + "auxiliary_loss_mlp": 0.01123644, + "balance_loss_clip": 1.10365927, + "balance_loss_mlp": 1.0710485, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.415558764474647, + "language_loss": 0.78814387, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81310868, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 2.630140542984009 + }, + { + "auxiliary_loss_clip": 0.0137053, + "auxiliary_loss_mlp": 0.01105825, + "balance_loss_clip": 1.09878969, + "balance_loss_mlp": 1.0507983, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.1172033256481657, + "language_loss": 0.84198213, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86674571, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 2.6559367179870605 + }, + { + "auxiliary_loss_clip": 0.01365943, + "auxiliary_loss_mlp": 0.01102142, + "balance_loss_clip": 1.09773421, + "balance_loss_mlp": 1.05076337, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 2.118801866891168, + "language_loss": 0.90381753, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92849833, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 2.636168956756592 + }, + { + "auxiliary_loss_clip": 0.01363975, + "auxiliary_loss_mlp": 0.01102458, + "balance_loss_clip": 1.09335184, + "balance_loss_mlp": 1.05224657, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.2133062221627107, + "language_loss": 0.78150022, + "learning_rate": 3.549833136812155e-06, + "loss": 0.8061645, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.664808988571167 + }, + { + "auxiliary_loss_clip": 0.01366414, + "auxiliary_loss_mlp": 0.01108616, + "balance_loss_clip": 1.1037842, + "balance_loss_mlp": 1.05735588, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.4543687891283033, + "language_loss": 0.83892894, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86367923, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.5926084518432617 + }, + { + "auxiliary_loss_clip": 0.01358279, + "auxiliary_loss_mlp": 0.01112647, + "balance_loss_clip": 1.09532726, + "balance_loss_mlp": 1.06269813, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.0958071992586835, + "language_loss": 0.93616521, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.9608745, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.5766170024871826 + }, + { + "auxiliary_loss_clip": 0.01366013, + "auxiliary_loss_mlp": 0.01120519, + "balance_loss_clip": 1.10062683, + "balance_loss_mlp": 1.06818557, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.9909836991097096, + "language_loss": 0.96928835, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99415374, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.627459764480591 + }, + { + "auxiliary_loss_clip": 0.01362309, + "auxiliary_loss_mlp": 0.01107337, + "balance_loss_clip": 1.09611356, + "balance_loss_mlp": 1.05734098, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 1.9094580447651823, + "language_loss": 0.84286404, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86756057, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 2.6129603385925293 + }, + { + "auxiliary_loss_clip": 0.01355859, + "auxiliary_loss_mlp": 0.01121026, + "balance_loss_clip": 1.09743071, + "balance_loss_mlp": 1.07009983, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.4037219890607706, + "language_loss": 0.98530233, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.01007128, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.657147169113159 + }, + { + "auxiliary_loss_clip": 0.01238289, + "auxiliary_loss_mlp": 0.01072513, + "balance_loss_clip": 1.09880519, + "balance_loss_mlp": 1.05382144, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8621881721548964, + "language_loss": 0.55617005, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57927805, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.122217893600464 + }, + { + "auxiliary_loss_clip": 0.0136176, + "auxiliary_loss_mlp": 0.01123305, + "balance_loss_clip": 1.09342158, + "balance_loss_mlp": 1.0716157, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.638985031825443, + "language_loss": 0.90249729, + "learning_rate": 3.567754632921479e-06, + "loss": 0.92734796, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 2.6765778064727783 + }, + { + "auxiliary_loss_clip": 0.01357094, + "auxiliary_loss_mlp": 0.0113548, + "balance_loss_clip": 1.0950017, + "balance_loss_mlp": 1.08424413, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.3304864867975184, + "language_loss": 0.85491097, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.87983674, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 2.597442150115967 + }, + { + "auxiliary_loss_clip": 0.01364438, + "auxiliary_loss_mlp": 0.0112159, + "balance_loss_clip": 1.09550941, + "balance_loss_mlp": 1.06980538, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.4203114402156594, + "language_loss": 0.71241826, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73727852, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 2.6201045513153076 + }, + { + "auxiliary_loss_clip": 0.01356333, + "auxiliary_loss_mlp": 0.01109978, + "balance_loss_clip": 1.09450269, + "balance_loss_mlp": 1.05912352, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 2.1823219400058895, + "language_loss": 0.9481234, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97278649, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 2.62046217918396 + }, + { + "auxiliary_loss_clip": 0.01357576, + "auxiliary_loss_mlp": 0.01111623, + "balance_loss_clip": 1.09427011, + "balance_loss_mlp": 1.06141198, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 4.6759891537061, + "language_loss": 0.93120158, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95589364, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.735224723815918 + }, + { + "auxiliary_loss_clip": 0.013506, + "auxiliary_loss_mlp": 0.01108002, + "balance_loss_clip": 1.09636354, + "balance_loss_mlp": 1.05953145, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.077536796322883, + "language_loss": 0.97432274, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99890882, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 2.593290090560913 + }, + { + "auxiliary_loss_clip": 0.01364679, + "auxiliary_loss_mlp": 0.01126504, + "balance_loss_clip": 1.09667873, + "balance_loss_mlp": 1.07655501, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.2431328568455857, + "language_loss": 0.87930107, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90421289, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.650317907333374 + }, + { + "auxiliary_loss_clip": 0.01355189, + "auxiliary_loss_mlp": 0.01121796, + "balance_loss_clip": 1.09207344, + "balance_loss_mlp": 1.07132244, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 9.506985959530498, + "language_loss": 0.67314005, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69790995, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 4.088837146759033 + }, + { + "auxiliary_loss_clip": 0.0135462, + "auxiliary_loss_mlp": 0.01139252, + "balance_loss_clip": 1.09504032, + "balance_loss_mlp": 1.08725321, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.6500237720867434, + "language_loss": 0.68098879, + "learning_rate": 3.587643540438383e-06, + "loss": 0.70592749, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 5.5947980880737305 + }, + { + "auxiliary_loss_clip": 0.01354664, + "auxiliary_loss_mlp": 0.01118729, + "balance_loss_clip": 1.09114087, + "balance_loss_mlp": 1.06773162, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 2.516322684897006, + "language_loss": 0.85553622, + "learning_rate": 3.590087005168037e-06, + "loss": 0.88027018, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.5757393836975098 + }, + { + "auxiliary_loss_clip": 0.01361523, + "auxiliary_loss_mlp": 0.01103879, + "balance_loss_clip": 1.09610975, + "balance_loss_mlp": 1.05505061, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 2.3531835380940693, + "language_loss": 1.04187012, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06652403, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.541177272796631 + }, + { + "auxiliary_loss_clip": 0.01364654, + "auxiliary_loss_mlp": 0.01123619, + "balance_loss_clip": 1.09932196, + "balance_loss_mlp": 1.06990314, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 3.1292494511946884, + "language_loss": 0.75093418, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77581692, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.5964596271514893 + }, + { + "auxiliary_loss_clip": 0.01349271, + "auxiliary_loss_mlp": 0.01113377, + "balance_loss_clip": 1.09457386, + "balance_loss_mlp": 1.06280875, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 2.1588565019948733, + "language_loss": 0.91038173, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93500817, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.6323041915893555 + }, + { + "auxiliary_loss_clip": 0.01359139, + "auxiliary_loss_mlp": 0.0111809, + "balance_loss_clip": 1.09634149, + "balance_loss_mlp": 1.06919086, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.899591615677194, + "language_loss": 0.86226118, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88703334, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 2.5905344486236572 + }, + { + "auxiliary_loss_clip": 0.01349767, + "auxiliary_loss_mlp": 0.0110673, + "balance_loss_clip": 1.09635448, + "balance_loss_mlp": 1.05833066, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 3.42191954437053, + "language_loss": 0.88386548, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90843046, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 2.5828511714935303 + }, + { + "auxiliary_loss_clip": 0.0135537, + "auxiliary_loss_mlp": 0.0110607, + "balance_loss_clip": 1.09293318, + "balance_loss_mlp": 1.0524019, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.2861472907085187, + "language_loss": 0.97392982, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99854416, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.5340802669525146 + }, + { + "auxiliary_loss_clip": 0.01356916, + "auxiliary_loss_mlp": 0.01119507, + "balance_loss_clip": 1.09910107, + "balance_loss_mlp": 1.06984425, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.2514730400851186, + "language_loss": 0.86133248, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88609672, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 2.662799835205078 + }, + { + "auxiliary_loss_clip": 0.0135281, + "auxiliary_loss_mlp": 0.01109548, + "balance_loss_clip": 1.09010243, + "balance_loss_mlp": 1.05976653, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.7046411526838487, + "language_loss": 0.81583732, + "learning_rate": 3.609307900676025e-06, + "loss": 0.8404609, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.6917741298675537 + }, + { + "auxiliary_loss_clip": 0.01347216, + "auxiliary_loss_mlp": 0.01121627, + "balance_loss_clip": 1.09210777, + "balance_loss_mlp": 1.07351375, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 8.429660597367329, + "language_loss": 0.81359744, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83828592, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.6141159534454346 + }, + { + "auxiliary_loss_clip": 0.01345607, + "auxiliary_loss_mlp": 0.01105936, + "balance_loss_clip": 1.0874207, + "balance_loss_mlp": 1.05655956, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.078847886788903, + "language_loss": 0.91433835, + "learning_rate": 3.614024787585744e-06, + "loss": 0.93885374, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.5416715145111084 + }, + { + "auxiliary_loss_clip": 0.01342059, + "auxiliary_loss_mlp": 0.011133, + "balance_loss_clip": 1.08925748, + "balance_loss_mlp": 1.06261206, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 2.112971518549705, + "language_loss": 0.88174909, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90630269, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.6443099975585938 + }, + { + "auxiliary_loss_clip": 0.01344794, + "auxiliary_loss_mlp": 0.01108031, + "balance_loss_clip": 1.0896287, + "balance_loss_mlp": 1.05622244, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 13.29719722111766, + "language_loss": 0.80876631, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83329451, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 2.5852925777435303 + }, + { + "auxiliary_loss_clip": 0.01338712, + "auxiliary_loss_mlp": 0.01104872, + "balance_loss_clip": 1.09051967, + "balance_loss_mlp": 1.05835617, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 2.5464341776882713, + "language_loss": 0.81099504, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83543086, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.6670706272125244 + }, + { + "auxiliary_loss_clip": 0.01335903, + "auxiliary_loss_mlp": 0.01096772, + "balance_loss_clip": 1.08295393, + "balance_loss_mlp": 1.04806292, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.2485931923727733, + "language_loss": 0.8061161, + "learning_rate": 3.623356141983041e-06, + "loss": 0.83044285, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 2.534116506576538 + }, + { + "auxiliary_loss_clip": 0.0134017, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_clip": 1.08817434, + "balance_loss_mlp": 1.0551132, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 2.107544923952264, + "language_loss": 0.90981221, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.93423903, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.6438167095184326 + }, + { + "auxiliary_loss_clip": 0.01345127, + "auxiliary_loss_mlp": 0.01123133, + "balance_loss_clip": 1.08875346, + "balance_loss_mlp": 1.07332683, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.130568761082479, + "language_loss": 0.93957174, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96425426, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.571112632751465 + }, + { + "auxiliary_loss_clip": 0.01347138, + "auxiliary_loss_mlp": 0.01111575, + "balance_loss_clip": 1.08835173, + "balance_loss_mlp": 1.06155455, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 2.070363318647908, + "language_loss": 0.74232405, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76691115, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.6324145793914795 + }, + { + "auxiliary_loss_clip": 0.01341436, + "auxiliary_loss_mlp": 0.01120982, + "balance_loss_clip": 1.09006798, + "balance_loss_mlp": 1.073704, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.4670810570921238, + "language_loss": 0.80099833, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82562256, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 2.514763355255127 + }, + { + "auxiliary_loss_clip": 0.01348793, + "auxiliary_loss_mlp": 0.01124101, + "balance_loss_clip": 1.09303415, + "balance_loss_mlp": 1.07491565, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.526400604807274, + "language_loss": 0.77749509, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80222404, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 2.592761993408203 + }, + { + "auxiliary_loss_clip": 0.0134774, + "auxiliary_loss_mlp": 0.01098333, + "balance_loss_clip": 1.09347713, + "balance_loss_mlp": 1.0524137, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.3394441172841645, + "language_loss": 0.84466124, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86912191, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 2.696824550628662 + }, + { + "auxiliary_loss_clip": 0.01338732, + "auxiliary_loss_mlp": 0.01106289, + "balance_loss_clip": 1.08796656, + "balance_loss_mlp": 1.05679333, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 3.8235097214601073, + "language_loss": 0.97264862, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99709874, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 2.5856339931488037 + }, + { + "auxiliary_loss_clip": 0.01340076, + "auxiliary_loss_mlp": 0.01100768, + "balance_loss_clip": 1.08982348, + "balance_loss_mlp": 1.05518198, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.642032898220433, + "language_loss": 0.93878645, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96319485, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.6298766136169434 + }, + { + "auxiliary_loss_clip": 0.01333338, + "auxiliary_loss_mlp": 0.0109359, + "balance_loss_clip": 1.0854739, + "balance_loss_mlp": 1.04631162, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 2.326492891551433, + "language_loss": 0.92498344, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94925272, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.6070456504821777 + }, + { + "auxiliary_loss_clip": 0.01338878, + "auxiliary_loss_mlp": 0.01092969, + "balance_loss_clip": 1.08585727, + "balance_loss_mlp": 1.04528546, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.1603081624441516, + "language_loss": 1.02199376, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04631233, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.552755117416382 + }, + { + "auxiliary_loss_clip": 0.01220086, + "auxiliary_loss_mlp": 0.01134191, + "balance_loss_clip": 1.08524704, + "balance_loss_mlp": 1.11750221, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9238609143134718, + "language_loss": 0.63871729, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66226006, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.235379934310913 + }, + { + "auxiliary_loss_clip": 0.01339972, + "auxiliary_loss_mlp": 0.01109628, + "balance_loss_clip": 1.09096408, + "balance_loss_mlp": 1.06437635, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.9099880386159325, + "language_loss": 0.88352311, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90801913, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 2.5943217277526855 + }, + { + "auxiliary_loss_clip": 0.01338091, + "auxiliary_loss_mlp": 0.01098718, + "balance_loss_clip": 1.08893502, + "balance_loss_mlp": 1.05170131, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 1.8470274722650404, + "language_loss": 0.849455, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87382317, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.6045758724212646 + }, + { + "auxiliary_loss_clip": 0.01344465, + "auxiliary_loss_mlp": 0.01116236, + "balance_loss_clip": 1.09658742, + "balance_loss_mlp": 1.06628704, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 2.293185622217882, + "language_loss": 0.72761422, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75222123, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.609170436859131 + }, + { + "auxiliary_loss_clip": 0.01333154, + "auxiliary_loss_mlp": 0.01105733, + "balance_loss_clip": 1.08824134, + "balance_loss_mlp": 1.05831134, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.479495450976274, + "language_loss": 0.87334973, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89773858, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.5752971172332764 + }, + { + "auxiliary_loss_clip": 0.01335286, + "auxiliary_loss_mlp": 0.01111435, + "balance_loss_clip": 1.08835673, + "balance_loss_mlp": 1.06484807, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 4.123594788989893, + "language_loss": 0.81058669, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83505386, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 2.5755398273468018 + }, + { + "auxiliary_loss_clip": 0.01332795, + "auxiliary_loss_mlp": 0.01116179, + "balance_loss_clip": 1.08430278, + "balance_loss_mlp": 1.06878138, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.1757667732425987, + "language_loss": 0.83883768, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86332744, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 2.606102705001831 + }, + { + "auxiliary_loss_clip": 0.01340287, + "auxiliary_loss_mlp": 0.01123645, + "balance_loss_clip": 1.094859, + "balance_loss_mlp": 1.07636678, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.1976409286863348, + "language_loss": 0.8473711, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87201035, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 2.5671563148498535 + }, + { + "auxiliary_loss_clip": 0.01340096, + "auxiliary_loss_mlp": 0.01107578, + "balance_loss_clip": 1.09070444, + "balance_loss_mlp": 1.06149173, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.2913747651727103, + "language_loss": 0.87860382, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90308058, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 2.562626838684082 + }, + { + "auxiliary_loss_clip": 0.01338804, + "auxiliary_loss_mlp": 0.0110606, + "balance_loss_clip": 1.08917046, + "balance_loss_mlp": 1.06064093, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 3.759416259853715, + "language_loss": 0.88681203, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.9112606, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 2.5411319732666016 + }, + { + "auxiliary_loss_clip": 0.0133397, + "auxiliary_loss_mlp": 0.01127892, + "balance_loss_clip": 1.09104073, + "balance_loss_mlp": 1.08035147, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 2.0027991202299833, + "language_loss": 0.88532209, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.90994072, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 2.5524613857269287 + }, + { + "auxiliary_loss_clip": 0.01339346, + "auxiliary_loss_mlp": 0.01116328, + "balance_loss_clip": 1.08930635, + "balance_loss_mlp": 1.06828654, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 2.9699076253617744, + "language_loss": 0.64987189, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67442864, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.639860153198242 + }, + { + "auxiliary_loss_clip": 0.01339353, + "auxiliary_loss_mlp": 0.01112391, + "balance_loss_clip": 1.09307003, + "balance_loss_mlp": 1.06451643, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.5377369787020077, + "language_loss": 0.88549519, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.91001272, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.5334768295288086 + }, + { + "auxiliary_loss_clip": 0.01211838, + "auxiliary_loss_mlp": 0.01094096, + "balance_loss_clip": 1.077402, + "balance_loss_mlp": 1.07874203, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.9200056868633223, + "language_loss": 0.62149227, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64455163, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.304966449737549 + }, + { + "auxiliary_loss_clip": 0.01330255, + "auxiliary_loss_mlp": 0.01115121, + "balance_loss_clip": 1.08574653, + "balance_loss_mlp": 1.06693673, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.3864595301341183, + "language_loss": 0.89782798, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92228174, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 4.217484712600708 + }, + { + "auxiliary_loss_clip": 0.01337125, + "auxiliary_loss_mlp": 0.01124267, + "balance_loss_clip": 1.09061384, + "balance_loss_mlp": 1.07486701, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 2.04047947578029, + "language_loss": 0.80379725, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82841116, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.59952974319458 + }, + { + "auxiliary_loss_clip": 0.01331664, + "auxiliary_loss_mlp": 0.01097833, + "balance_loss_clip": 1.09335756, + "balance_loss_mlp": 1.05274808, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 4.083591023188011, + "language_loss": 0.82913792, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85343289, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 5.610285758972168 + }, + { + "auxiliary_loss_clip": 0.01332048, + "auxiliary_loss_mlp": 0.01095875, + "balance_loss_clip": 1.08554113, + "balance_loss_mlp": 1.05114722, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 2.0058664077617845, + "language_loss": 0.90962374, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93390298, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.613525152206421 + }, + { + "auxiliary_loss_clip": 0.01326869, + "auxiliary_loss_mlp": 0.01094491, + "balance_loss_clip": 1.08336496, + "balance_loss_mlp": 1.04902482, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.143561921213879, + "language_loss": 0.86611283, + "learning_rate": 3.687243426879095e-06, + "loss": 0.89032638, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.6491758823394775 + }, + { + "auxiliary_loss_clip": 0.01328126, + "auxiliary_loss_mlp": 0.01115442, + "balance_loss_clip": 1.08872521, + "balance_loss_mlp": 1.06535006, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.3057925507211543, + "language_loss": 0.71987092, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74430662, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.6738011837005615 + }, + { + "auxiliary_loss_clip": 0.01332031, + "auxiliary_loss_mlp": 0.01112389, + "balance_loss_clip": 1.08430886, + "balance_loss_mlp": 1.06565905, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.2011547788302, + "language_loss": 0.91847789, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94292212, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.584021806716919 + }, + { + "auxiliary_loss_clip": 0.01343225, + "auxiliary_loss_mlp": 0.01110409, + "balance_loss_clip": 1.08736229, + "balance_loss_mlp": 1.06141353, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 5.3979629259857935, + "language_loss": 0.72726762, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75180399, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.6430583000183105 + }, + { + "auxiliary_loss_clip": 0.01330738, + "auxiliary_loss_mlp": 0.01124187, + "balance_loss_clip": 1.09043324, + "balance_loss_mlp": 1.07800519, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 2.0752657398697107, + "language_loss": 0.73916519, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76371443, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.8001863956451416 + }, + { + "auxiliary_loss_clip": 0.01341775, + "auxiliary_loss_mlp": 0.01118424, + "balance_loss_clip": 1.08933663, + "balance_loss_mlp": 1.07219517, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.183527284063509, + "language_loss": 0.9151957, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93979776, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.5418813228607178 + }, + { + "auxiliary_loss_clip": 0.01339209, + "auxiliary_loss_mlp": 0.01129292, + "balance_loss_clip": 1.08973718, + "balance_loss_mlp": 1.08063102, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 3.960654108189894, + "language_loss": 0.90036845, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92505354, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.540820837020874 + }, + { + "auxiliary_loss_clip": 0.01342825, + "auxiliary_loss_mlp": 0.01107931, + "balance_loss_clip": 1.08816135, + "balance_loss_mlp": 1.05893636, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 3.9649423834273514, + "language_loss": 0.73544014, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75994772, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.572291612625122 + }, + { + "auxiliary_loss_clip": 0.01328865, + "auxiliary_loss_mlp": 0.0111119, + "balance_loss_clip": 1.08541834, + "balance_loss_mlp": 1.06546128, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.7397735114281625, + "language_loss": 0.90132272, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.9257232, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.6474173069000244 + }, + { + "auxiliary_loss_clip": 0.01332145, + "auxiliary_loss_mlp": 0.0110458, + "balance_loss_clip": 1.08777785, + "balance_loss_mlp": 1.05801702, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 4.484231233956078, + "language_loss": 0.80825233, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83261955, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 2.5553271770477295 + }, + { + "auxiliary_loss_clip": 0.01326262, + "auxiliary_loss_mlp": 0.01102058, + "balance_loss_clip": 1.08497369, + "balance_loss_mlp": 1.05532765, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.2941642182458786, + "language_loss": 0.90599275, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.93027592, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.585423707962036 + }, + { + "auxiliary_loss_clip": 0.0132257, + "auxiliary_loss_mlp": 0.01101593, + "balance_loss_clip": 1.08421206, + "balance_loss_mlp": 1.05455303, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 3.1515421672648007, + "language_loss": 0.90965378, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93389547, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.6502814292907715 + }, + { + "auxiliary_loss_clip": 0.01325564, + "auxiliary_loss_mlp": 0.01094478, + "balance_loss_clip": 1.08415604, + "balance_loss_mlp": 1.04972649, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.4571733341342257, + "language_loss": 0.94069076, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.9648912, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 2.605076313018799 + }, + { + "auxiliary_loss_clip": 0.01196746, + "auxiliary_loss_mlp": 0.01117988, + "balance_loss_clip": 1.06839895, + "balance_loss_mlp": 1.10234761, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9481933854033598, + "language_loss": 0.59835535, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.6215027, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 3.010685682296753 + }, + { + "auxiliary_loss_clip": 0.01323766, + "auxiliary_loss_mlp": 0.01106553, + "balance_loss_clip": 1.08408141, + "balance_loss_mlp": 1.06139612, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 2.7622622350863986, + "language_loss": 0.90012425, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92442745, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 2.585076332092285 + }, + { + "auxiliary_loss_clip": 0.01332775, + "auxiliary_loss_mlp": 0.01121253, + "balance_loss_clip": 1.08713865, + "balance_loss_mlp": 1.0744276, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 3.9703512256420366, + "language_loss": 0.82694173, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85148203, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 2.640824556350708 + }, + { + "auxiliary_loss_clip": 0.01334117, + "auxiliary_loss_mlp": 0.01095145, + "balance_loss_clip": 1.08718085, + "balance_loss_mlp": 1.05129993, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.1375357300463977, + "language_loss": 0.72943115, + "learning_rate": 3.719954063833981e-06, + "loss": 0.7537238, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 2.5834240913391113 + }, + { + "auxiliary_loss_clip": 0.01320952, + "auxiliary_loss_mlp": 0.01094794, + "balance_loss_clip": 1.08034682, + "balance_loss_mlp": 1.04982829, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 1.934000134608797, + "language_loss": 0.92452192, + "learning_rate": 3.721944334919596e-06, + "loss": 0.94867945, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.5921099185943604 + }, + { + "auxiliary_loss_clip": 0.01328999, + "auxiliary_loss_mlp": 0.01099662, + "balance_loss_clip": 1.08728337, + "balance_loss_mlp": 1.05653191, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 3.78657043034749, + "language_loss": 0.65318108, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.6774677, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.5729846954345703 + }, + { + "auxiliary_loss_clip": 0.01328172, + "auxiliary_loss_mlp": 0.01107525, + "balance_loss_clip": 1.09234107, + "balance_loss_mlp": 1.06255877, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.7820202242315182, + "language_loss": 0.76666152, + "learning_rate": 3.72590651470665e-06, + "loss": 0.79101849, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.5723721981048584 + }, + { + "auxiliary_loss_clip": 0.01321292, + "auxiliary_loss_mlp": 0.01112958, + "balance_loss_clip": 1.08704424, + "balance_loss_mlp": 1.06756282, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.1427474507716378, + "language_loss": 0.79765677, + "learning_rate": 3.727878498433505e-06, + "loss": 0.82199931, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.631901979446411 + }, + { + "auxiliary_loss_clip": 0.01329135, + "auxiliary_loss_mlp": 0.01117667, + "balance_loss_clip": 1.0887897, + "balance_loss_mlp": 1.07282019, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 4.1388287316631684, + "language_loss": 0.80882108, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83328903, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.591996431350708 + }, + { + "auxiliary_loss_clip": 0.01325507, + "auxiliary_loss_mlp": 0.01100592, + "balance_loss_clip": 1.08154404, + "balance_loss_mlp": 1.05491078, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.956409783347945, + "language_loss": 0.93720269, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96146369, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.5343573093414307 + }, + { + "auxiliary_loss_clip": 0.01335147, + "auxiliary_loss_mlp": 0.01113557, + "balance_loss_clip": 1.08800101, + "balance_loss_mlp": 1.06842482, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 5.475739856679843, + "language_loss": 0.74814296, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.77262998, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 2.618504762649536 + }, + { + "auxiliary_loss_clip": 0.01330951, + "auxiliary_loss_mlp": 0.01133418, + "balance_loss_clip": 1.08651161, + "balance_loss_mlp": 1.08754611, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.6938260387229445, + "language_loss": 0.93683672, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96148044, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 2.599533796310425 + }, + { + "auxiliary_loss_clip": 0.01317141, + "auxiliary_loss_mlp": 0.01098732, + "balance_loss_clip": 1.08324385, + "balance_loss_mlp": 1.05522096, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.8336239159486696, + "language_loss": 0.93006259, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95422125, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.6043856143951416 + }, + { + "auxiliary_loss_clip": 0.0132696, + "auxiliary_loss_mlp": 0.0111191, + "balance_loss_clip": 1.08914447, + "balance_loss_mlp": 1.06565738, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 3.3402048105620095, + "language_loss": 0.75779498, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78218371, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 2.6174867153167725 + }, + { + "auxiliary_loss_clip": 0.01326397, + "auxiliary_loss_mlp": 0.01098006, + "balance_loss_clip": 1.08597434, + "balance_loss_mlp": 1.05351758, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 3.1828099630792526, + "language_loss": 0.78993165, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81417572, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.6967601776123047 + }, + { + "auxiliary_loss_clip": 0.01325439, + "auxiliary_loss_mlp": 0.01110013, + "balance_loss_clip": 1.07995796, + "balance_loss_mlp": 1.06154251, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.078269604338335, + "language_loss": 0.83178926, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85614383, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.563120126724243 + }, + { + "auxiliary_loss_clip": 0.01322158, + "auxiliary_loss_mlp": 0.01098808, + "balance_loss_clip": 1.08350408, + "balance_loss_mlp": 1.05362797, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 3.6819141035296763, + "language_loss": 0.92480171, + "learning_rate": 3.745359722027911e-06, + "loss": 0.94901139, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 2.6222574710845947 + }, + { + "auxiliary_loss_clip": 0.01321339, + "auxiliary_loss_mlp": 0.01089904, + "balance_loss_clip": 1.08052564, + "balance_loss_mlp": 1.04517627, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.6569146372862362, + "language_loss": 0.88536835, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90948075, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 2.540034770965576 + }, + { + "auxiliary_loss_clip": 0.01311404, + "auxiliary_loss_mlp": 0.01114283, + "balance_loss_clip": 1.07761228, + "balance_loss_mlp": 1.0677197, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.6262481937264441, + "language_loss": 0.90027416, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.9245311, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 2.619494676589966 + }, + { + "auxiliary_loss_clip": 0.01322539, + "auxiliary_loss_mlp": 0.01111616, + "balance_loss_clip": 1.08061647, + "balance_loss_mlp": 1.06545854, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.0246581329568163, + "language_loss": 0.85144478, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.8757863, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.550361394882202 + }, + { + "auxiliary_loss_clip": 0.01329337, + "auxiliary_loss_mlp": 0.01106676, + "balance_loss_clip": 1.08598232, + "balance_loss_mlp": 1.0598743, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.8486774760335136, + "language_loss": 0.88798106, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91234112, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 2.566288709640503 + }, + { + "auxiliary_loss_clip": 0.01322939, + "auxiliary_loss_mlp": 0.01094754, + "balance_loss_clip": 1.08057272, + "balance_loss_mlp": 1.04599762, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.11969813765734, + "language_loss": 0.88106531, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.9052422, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.545900344848633 + }, + { + "auxiliary_loss_clip": 0.01325239, + "auxiliary_loss_mlp": 0.01108736, + "balance_loss_clip": 1.08017516, + "balance_loss_mlp": 1.06262565, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 2.943307157765013, + "language_loss": 0.8080349, + "learning_rate": 3.756755633390458e-06, + "loss": 0.83237463, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.5976927280426025 + }, + { + "auxiliary_loss_clip": 0.0131293, + "auxiliary_loss_mlp": 0.01104632, + "balance_loss_clip": 1.07882881, + "balance_loss_mlp": 1.05508876, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 1.7530732422713116, + "language_loss": 0.89430946, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91848505, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.5945515632629395 + }, + { + "auxiliary_loss_clip": 0.01323545, + "auxiliary_loss_mlp": 0.011011, + "balance_loss_clip": 1.08431137, + "balance_loss_mlp": 1.05642033, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.3315855807452817, + "language_loss": 0.78018081, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80442727, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 4.070865869522095 + }, + { + "auxiliary_loss_clip": 0.01310308, + "auxiliary_loss_mlp": 0.01107092, + "balance_loss_clip": 1.07841587, + "balance_loss_mlp": 1.05971813, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 2.371772374459496, + "language_loss": 0.74911904, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77329302, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 3.9967079162597656 + }, + { + "auxiliary_loss_clip": 0.01314241, + "auxiliary_loss_mlp": 0.01107769, + "balance_loss_clip": 1.08161187, + "balance_loss_mlp": 1.06149244, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.230164645737477, + "language_loss": 0.90408838, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92830849, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 3.9506664276123047 + }, + { + "auxiliary_loss_clip": 0.01311662, + "auxiliary_loss_mlp": 0.01099378, + "balance_loss_clip": 1.07845128, + "balance_loss_mlp": 1.05682039, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.030579091273755, + "language_loss": 0.79171646, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81582683, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 4.068249940872192 + }, + { + "auxiliary_loss_clip": 0.01315717, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_clip": 1.08288932, + "balance_loss_mlp": 1.06207037, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.203557483517225, + "language_loss": 0.71486187, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73910755, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.6448700428009033 + }, + { + "auxiliary_loss_clip": 0.01318392, + "auxiliary_loss_mlp": 0.01112793, + "balance_loss_clip": 1.07899404, + "balance_loss_mlp": 1.06637275, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 2.8998999619647163, + "language_loss": 0.77000272, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79431462, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.569064140319824 + }, + { + "auxiliary_loss_clip": 0.01301923, + "auxiliary_loss_mlp": 0.01103766, + "balance_loss_clip": 1.07614791, + "balance_loss_mlp": 1.05920577, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.6724176014021428, + "language_loss": 0.85349429, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87755114, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.596196174621582 + }, + { + "auxiliary_loss_clip": 0.01308646, + "auxiliary_loss_mlp": 0.01095397, + "balance_loss_clip": 1.08030438, + "balance_loss_mlp": 1.05317271, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.7565765159009863, + "language_loss": 0.80038178, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82442218, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.550508737564087 + }, + { + "auxiliary_loss_clip": 0.01316029, + "auxiliary_loss_mlp": 0.01114466, + "balance_loss_clip": 1.08076453, + "balance_loss_mlp": 1.06778359, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.8186413528070906, + "language_loss": 0.87573755, + "learning_rate": 3.775311735671078e-06, + "loss": 0.90004253, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.5387682914733887 + }, + { + "auxiliary_loss_clip": 0.01308603, + "auxiliary_loss_mlp": 0.0111263, + "balance_loss_clip": 1.07910538, + "balance_loss_mlp": 1.06632936, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 2.4591602578923424, + "language_loss": 0.82648319, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.85069549, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.6801443099975586 + }, + { + "auxiliary_loss_clip": 0.01305014, + "auxiliary_loss_mlp": 0.01103203, + "balance_loss_clip": 1.07886744, + "balance_loss_mlp": 1.05971575, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.298160228831151, + "language_loss": 0.81012154, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83420372, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.6514198780059814 + }, + { + "auxiliary_loss_clip": 0.0131045, + "auxiliary_loss_mlp": 0.01099139, + "balance_loss_clip": 1.07601011, + "balance_loss_mlp": 1.05317235, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.1572655967776844, + "language_loss": 0.8100096, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83410549, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.6612977981567383 + }, + { + "auxiliary_loss_clip": 0.01305074, + "auxiliary_loss_mlp": 0.01096853, + "balance_loss_clip": 1.07386041, + "balance_loss_mlp": 1.05265057, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2711358176078327, + "language_loss": 0.89675176, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.92077112, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.523118019104004 + }, + { + "auxiliary_loss_clip": 0.01302804, + "auxiliary_loss_mlp": 0.01091062, + "balance_loss_clip": 1.0763911, + "balance_loss_mlp": 1.04511881, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 3.3366964659409835, + "language_loss": 0.80304843, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82698715, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 2.64198637008667 + }, + { + "auxiliary_loss_clip": 0.01305023, + "auxiliary_loss_mlp": 0.01090038, + "balance_loss_clip": 1.07392657, + "balance_loss_mlp": 1.0481714, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.6163093920626608, + "language_loss": 0.76730055, + "learning_rate": 3.786194003461506e-06, + "loss": 0.79125118, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 2.524059534072876 + }, + { + "auxiliary_loss_clip": 0.01302003, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_clip": 1.07186866, + "balance_loss_mlp": 1.04683495, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.2226112053134557, + "language_loss": 0.88744521, + "learning_rate": 3.787989966086264e-06, + "loss": 0.91138858, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 2.5469186305999756 + }, + { + "auxiliary_loss_clip": 0.01310891, + "auxiliary_loss_mlp": 0.01093246, + "balance_loss_clip": 1.07669067, + "balance_loss_mlp": 1.05073619, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 5.766099243808527, + "language_loss": 0.76137787, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78541929, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 2.5574252605438232 + }, + { + "auxiliary_loss_clip": 0.01177726, + "auxiliary_loss_mlp": 0.01143716, + "balance_loss_clip": 1.05002916, + "balance_loss_mlp": 1.13093662, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.882516712906081, + "language_loss": 0.64858347, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67179793, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 3.2225046157836914 + }, + { + "auxiliary_loss_clip": 0.01298456, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_clip": 1.06926823, + "balance_loss_mlp": 1.0512687, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 4.212274162179111, + "language_loss": 0.78162509, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80555886, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.613403081893921 + }, + { + "auxiliary_loss_clip": 0.01303885, + "auxiliary_loss_mlp": 0.01100498, + "balance_loss_clip": 1.07292056, + "balance_loss_mlp": 1.05770183, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.2218699797441057, + "language_loss": 0.92603326, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.95007706, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.5685479640960693 + }, + { + "auxiliary_loss_clip": 0.01299271, + "auxiliary_loss_mlp": 0.01109947, + "balance_loss_clip": 1.07255626, + "balance_loss_mlp": 1.06839085, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.6137075035788393, + "language_loss": 0.89816004, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92225224, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 2.6062421798706055 + }, + { + "auxiliary_loss_clip": 0.01305743, + "auxiliary_loss_mlp": 0.01110899, + "balance_loss_clip": 1.07734752, + "balance_loss_mlp": 1.06633902, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.13359260351928, + "language_loss": 0.79697335, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82113981, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 2.5913569927215576 + }, + { + "auxiliary_loss_clip": 0.01301655, + "auxiliary_loss_mlp": 0.01103895, + "balance_loss_clip": 1.07481408, + "balance_loss_mlp": 1.05945349, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.1652601935043068, + "language_loss": 0.84598076, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.87003624, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.5447299480438232 + }, + { + "auxiliary_loss_clip": 0.01305771, + "auxiliary_loss_mlp": 0.0109612, + "balance_loss_clip": 1.07488108, + "balance_loss_mlp": 1.05535078, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 1.9452507253445621, + "language_loss": 0.87114727, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89516616, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 2.6162877082824707 + }, + { + "auxiliary_loss_clip": 0.01306405, + "auxiliary_loss_mlp": 0.01108201, + "balance_loss_clip": 1.07320225, + "balance_loss_mlp": 1.06469011, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 2.304438887151709, + "language_loss": 0.84949523, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87364125, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 2.623800277709961 + }, + { + "auxiliary_loss_clip": 0.01305809, + "auxiliary_loss_mlp": 0.01099414, + "balance_loss_clip": 1.07097769, + "balance_loss_mlp": 1.0581677, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 3.2408115060257767, + "language_loss": 0.75579512, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77984732, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 2.5531423091888428 + }, + { + "auxiliary_loss_clip": 0.01297501, + "auxiliary_loss_mlp": 0.01113173, + "balance_loss_clip": 1.07150018, + "balance_loss_mlp": 1.07116342, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.509072493794174, + "language_loss": 0.82968664, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85379338, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 2.619936227798462 + }, + { + "auxiliary_loss_clip": 0.01298912, + "auxiliary_loss_mlp": 0.01110118, + "balance_loss_clip": 1.07219243, + "balance_loss_mlp": 1.06825149, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.6311867772252184, + "language_loss": 0.81741494, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84150529, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.575906753540039 + }, + { + "auxiliary_loss_clip": 0.01303848, + "auxiliary_loss_mlp": 0.01102909, + "balance_loss_clip": 1.07716572, + "balance_loss_mlp": 1.05982721, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 7.341094511615535, + "language_loss": 0.83239055, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85645807, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.5618269443511963 + }, + { + "auxiliary_loss_clip": 0.01294617, + "auxiliary_loss_mlp": 0.01099842, + "balance_loss_clip": 1.07061386, + "balance_loss_mlp": 1.0566411, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 2.761016374413963, + "language_loss": 0.78671569, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81066024, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 2.549079179763794 + }, + { + "auxiliary_loss_clip": 0.01301054, + "auxiliary_loss_mlp": 0.0109677, + "balance_loss_clip": 1.07468009, + "balance_loss_mlp": 1.0520668, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 3.1130181928674348, + "language_loss": 0.77850789, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.80248612, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.5403122901916504 + }, + { + "auxiliary_loss_clip": 0.01289665, + "auxiliary_loss_mlp": 0.01082299, + "balance_loss_clip": 1.06450605, + "balance_loss_mlp": 1.03866792, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.7993331890994462, + "language_loss": 0.86258888, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88630855, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.597184658050537 + }, + { + "auxiliary_loss_clip": 0.01297789, + "auxiliary_loss_mlp": 0.01104359, + "balance_loss_clip": 1.07283854, + "balance_loss_mlp": 1.05951262, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 2.99503683120795, + "language_loss": 0.88937879, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91340029, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 2.5675439834594727 + }, + { + "auxiliary_loss_clip": 0.01296925, + "auxiliary_loss_mlp": 0.010899, + "balance_loss_clip": 1.06806004, + "balance_loss_mlp": 1.04870141, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 4.246307499842224, + "language_loss": 0.75008315, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77395141, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.486569881439209 + }, + { + "auxiliary_loss_clip": 0.0128956, + "auxiliary_loss_mlp": 0.01099737, + "balance_loss_clip": 1.0712285, + "balance_loss_mlp": 1.0570128, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.7116784865104178, + "language_loss": 0.99448496, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01837802, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 2.548515558242798 + }, + { + "auxiliary_loss_clip": 0.01174677, + "auxiliary_loss_mlp": 0.01176451, + "balance_loss_clip": 1.05213904, + "balance_loss_mlp": 1.163481, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 1.1396617014678025, + "language_loss": 0.75424945, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77776074, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 3.1680073738098145 + }, + { + "auxiliary_loss_clip": 0.01297813, + "auxiliary_loss_mlp": 0.0109207, + "balance_loss_clip": 1.06860888, + "balance_loss_mlp": 1.05223083, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 2.733750603193357, + "language_loss": 0.78445208, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80835092, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.7168521881103516 + }, + { + "auxiliary_loss_clip": 0.0129262, + "auxiliary_loss_mlp": 0.01089706, + "balance_loss_clip": 1.07094526, + "balance_loss_mlp": 1.04874563, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.4086042833413015, + "language_loss": 0.96650147, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99032474, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 2.5816357135772705 + }, + { + "auxiliary_loss_clip": 0.01300059, + "auxiliary_loss_mlp": 0.01095655, + "balance_loss_clip": 1.07136035, + "balance_loss_mlp": 1.05419385, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.118283947872922, + "language_loss": 0.88023454, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90419167, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.5600109100341797 + }, + { + "auxiliary_loss_clip": 0.01293639, + "auxiliary_loss_mlp": 0.01102269, + "balance_loss_clip": 1.07225657, + "balance_loss_mlp": 1.06123745, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.09302711177727, + "language_loss": 0.8489691, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87292814, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.5540506839752197 + }, + { + "auxiliary_loss_clip": 0.01292112, + "auxiliary_loss_mlp": 0.01113149, + "balance_loss_clip": 1.07259405, + "balance_loss_mlp": 1.07240319, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.411756104409381, + "language_loss": 0.83396024, + "learning_rate": 3.831334200735543e-06, + "loss": 0.8580128, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.554546356201172 + }, + { + "auxiliary_loss_clip": 0.01290506, + "auxiliary_loss_mlp": 0.01099318, + "balance_loss_clip": 1.07394063, + "balance_loss_mlp": 1.0607419, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.9533633746864967, + "language_loss": 0.89204276, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.915941, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 4.0537354946136475 + }, + { + "auxiliary_loss_clip": 0.01296964, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_clip": 1.07407403, + "balance_loss_mlp": 1.09077263, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.0839137784673176, + "language_loss": 0.70195889, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72624058, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.5095279216766357 + }, + { + "auxiliary_loss_clip": 0.01297274, + "auxiliary_loss_mlp": 0.0109685, + "balance_loss_clip": 1.07454908, + "balance_loss_mlp": 1.05627096, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 2.0721621190005624, + "language_loss": 0.87964106, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90358233, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 4.062540054321289 + }, + { + "auxiliary_loss_clip": 0.01297152, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_clip": 1.07233214, + "balance_loss_mlp": 1.04785728, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.580477373288989, + "language_loss": 0.8386755, + "learning_rate": 3.838006303795566e-06, + "loss": 0.86254537, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 4.133410692214966 + }, + { + "auxiliary_loss_clip": 0.01293235, + "auxiliary_loss_mlp": 0.01100757, + "balance_loss_clip": 1.07193112, + "balance_loss_mlp": 1.06191874, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 2.7063615035634876, + "language_loss": 0.93814361, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96208358, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.6133124828338623 + }, + { + "auxiliary_loss_clip": 0.01287681, + "auxiliary_loss_mlp": 0.0109306, + "balance_loss_clip": 1.07135296, + "balance_loss_mlp": 1.05124152, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.0736672570136436, + "language_loss": 0.88085055, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90465796, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.617316961288452 + }, + { + "auxiliary_loss_clip": 0.01293082, + "auxiliary_loss_mlp": 0.01098388, + "balance_loss_clip": 1.07299852, + "balance_loss_mlp": 1.05985963, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.221240922938079, + "language_loss": 0.89230138, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91621602, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.5528345108032227 + }, + { + "auxiliary_loss_clip": 0.01288681, + "auxiliary_loss_mlp": 0.01082179, + "balance_loss_clip": 1.0707016, + "balance_loss_mlp": 1.04272056, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.1766131027213875, + "language_loss": 0.86009467, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88380325, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.616178035736084 + }, + { + "auxiliary_loss_clip": 0.01286145, + "auxiliary_loss_mlp": 0.0109795, + "balance_loss_clip": 1.06983876, + "balance_loss_mlp": 1.05794358, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.9659582538003, + "language_loss": 0.89117289, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91501385, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.553722858428955 + }, + { + "auxiliary_loss_clip": 0.01298174, + "auxiliary_loss_mlp": 0.01117968, + "balance_loss_clip": 1.07610762, + "balance_loss_mlp": 1.07452846, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 1.939268704377502, + "language_loss": 0.81706989, + "learning_rate": 3.84788658233771e-06, + "loss": 0.84123135, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.52555513381958 + }, + { + "auxiliary_loss_clip": 0.01288177, + "auxiliary_loss_mlp": 0.01098076, + "balance_loss_clip": 1.06988811, + "balance_loss_mlp": 1.05678201, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.6325252742598786, + "language_loss": 0.8584758, + "learning_rate": 3.84951865465269e-06, + "loss": 0.8823384, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 2.5576722621917725 + }, + { + "auxiliary_loss_clip": 0.01156328, + "auxiliary_loss_mlp": 0.01071609, + "balance_loss_clip": 1.0415926, + "balance_loss_mlp": 1.06054616, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.936820124207531, + "language_loss": 0.63789123, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66017061, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 2.9486756324768066 + }, + { + "auxiliary_loss_clip": 0.01283572, + "auxiliary_loss_mlp": 0.0108104, + "balance_loss_clip": 1.06779253, + "balance_loss_mlp": 1.04179645, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6708159172435164, + "language_loss": 0.83555543, + "learning_rate": 3.852770440269372e-06, + "loss": 0.85920155, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 2.5300869941711426 + }, + { + "auxiliary_loss_clip": 0.01289044, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_clip": 1.07027328, + "balance_loss_mlp": 1.05418396, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.1206368945797127, + "language_loss": 0.84474528, + "learning_rate": 3.854390195044404e-06, + "loss": 0.8685872, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 2.5444695949554443 + }, + { + "auxiliary_loss_clip": 0.01290705, + "auxiliary_loss_mlp": 0.01091494, + "balance_loss_clip": 1.06793118, + "balance_loss_mlp": 1.04919887, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.514211722347479, + "language_loss": 0.85857749, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88239956, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 2.506898880004883 + }, + { + "auxiliary_loss_clip": 0.01285044, + "auxiliary_loss_mlp": 0.01102882, + "balance_loss_clip": 1.0695883, + "balance_loss_mlp": 1.06237471, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.1620313584624817, + "language_loss": 0.86443967, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88831896, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 2.5856680870056152 + }, + { + "auxiliary_loss_clip": 0.01293984, + "auxiliary_loss_mlp": 0.01091531, + "balance_loss_clip": 1.07461476, + "balance_loss_mlp": 1.05126202, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 2.2762334801976283, + "language_loss": 0.79477215, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81862736, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.567624092102051 + }, + { + "auxiliary_loss_clip": 0.01284535, + "auxiliary_loss_mlp": 0.01096955, + "balance_loss_clip": 1.06838131, + "balance_loss_mlp": 1.05706763, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.0398013608954537, + "language_loss": 0.78402162, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80783653, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.7652065753936768 + }, + { + "auxiliary_loss_clip": 0.0129061, + "auxiliary_loss_mlp": 0.01087882, + "balance_loss_clip": 1.06818652, + "balance_loss_mlp": 1.04541934, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.3285876020486, + "language_loss": 0.94761729, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97140217, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.5402824878692627 + }, + { + "auxiliary_loss_clip": 0.01293346, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_clip": 1.06794858, + "balance_loss_mlp": 1.05068588, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.953611961463431, + "language_loss": 0.99940896, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02327061, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.518383026123047 + }, + { + "auxiliary_loss_clip": 0.01293508, + "auxiliary_loss_mlp": 0.01101226, + "balance_loss_clip": 1.07133222, + "balance_loss_mlp": 1.06102848, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.855439675687524, + "language_loss": 0.8766979, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90064526, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.5337729454040527 + }, + { + "auxiliary_loss_clip": 0.01302485, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_clip": 1.07617819, + "balance_loss_mlp": 1.05600488, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.3308216213963986, + "language_loss": 0.93407226, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9580735, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.5317747592926025 + }, + { + "auxiliary_loss_clip": 0.01293145, + "auxiliary_loss_mlp": 0.01096379, + "balance_loss_clip": 1.07270718, + "balance_loss_mlp": 1.05372536, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.617981121710983, + "language_loss": 0.87233263, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89622784, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.5395843982696533 + }, + { + "auxiliary_loss_clip": 0.01290814, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_clip": 1.07310343, + "balance_loss_mlp": 1.04950178, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.560567169941996, + "language_loss": 0.74391699, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76773036, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 2.6071219444274902 + }, + { + "auxiliary_loss_clip": 0.01294803, + "auxiliary_loss_mlp": 0.01100586, + "balance_loss_clip": 1.07213211, + "balance_loss_mlp": 1.05805254, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 5.73280906848975, + "language_loss": 0.92671615, + "learning_rate": 3.871943634189376e-06, + "loss": 0.95067012, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 2.607236623764038 + }, + { + "auxiliary_loss_clip": 0.0129493, + "auxiliary_loss_mlp": 0.01082811, + "balance_loss_clip": 1.07492709, + "balance_loss_mlp": 1.04514039, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 3.266776223703825, + "language_loss": 0.82854092, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85231829, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 2.648296594619751 + }, + { + "auxiliary_loss_clip": 0.01294601, + "auxiliary_loss_mlp": 0.0109925, + "balance_loss_clip": 1.07295227, + "balance_loss_mlp": 1.05891001, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 3.0545373098280617, + "language_loss": 0.77805406, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80199254, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.5714011192321777 + }, + { + "auxiliary_loss_clip": 0.01293831, + "auxiliary_loss_mlp": 0.01110818, + "balance_loss_clip": 1.07010031, + "balance_loss_mlp": 1.06835556, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.388720871634143, + "language_loss": 0.86338478, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88743126, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 2.566649913787842 + }, + { + "auxiliary_loss_clip": 0.01155717, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.04233539, + "balance_loss_mlp": 1.02052438, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8430605056970429, + "language_loss": 0.58499187, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60686064, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 3.170842170715332 + }, + { + "auxiliary_loss_clip": 0.01286892, + "auxiliary_loss_mlp": 0.01101747, + "balance_loss_clip": 1.06861353, + "balance_loss_mlp": 1.05866444, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.9921021993493153, + "language_loss": 0.80517232, + "learning_rate": 3.879766964750006e-06, + "loss": 0.82905865, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.6551384925842285 + }, + { + "auxiliary_loss_clip": 0.01281527, + "auxiliary_loss_mlp": 0.01101186, + "balance_loss_clip": 1.06762242, + "balance_loss_mlp": 1.06129909, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.686344448096388, + "language_loss": 0.80307335, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82690048, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 2.5526487827301025 + }, + { + "auxiliary_loss_clip": 0.01301652, + "auxiliary_loss_mlp": 0.01089627, + "balance_loss_clip": 1.07557011, + "balance_loss_mlp": 1.04966807, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 3.247374588381748, + "language_loss": 0.96166837, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98558122, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 2.5244343280792236 + }, + { + "auxiliary_loss_clip": 0.01288561, + "auxiliary_loss_mlp": 0.01082691, + "balance_loss_clip": 1.0690738, + "balance_loss_mlp": 1.04053855, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 1.8865509115361674, + "language_loss": 0.77549553, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79920805, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 2.565366744995117 + }, + { + "auxiliary_loss_clip": 0.01284031, + "auxiliary_loss_mlp": 0.01093359, + "balance_loss_clip": 1.07163501, + "balance_loss_mlp": 1.05237532, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.5441526958400715, + "language_loss": 0.77059615, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79437006, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 2.6564736366271973 + }, + { + "auxiliary_loss_clip": 0.01293598, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_clip": 1.07336831, + "balance_loss_mlp": 1.04114699, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 5.18039174752567, + "language_loss": 0.81270468, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83643836, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.6773629188537598 + }, + { + "auxiliary_loss_clip": 0.01286847, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_clip": 1.0707593, + "balance_loss_mlp": 1.04833102, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9450280664928197, + "language_loss": 0.73568273, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.75946242, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.672666549682617 + }, + { + "auxiliary_loss_clip": 0.0128629, + "auxiliary_loss_mlp": 0.01099856, + "balance_loss_clip": 1.06754732, + "balance_loss_mlp": 1.0606842, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.927143403299602, + "language_loss": 0.79164326, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81550467, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.646073579788208 + }, + { + "auxiliary_loss_clip": 0.01287321, + "auxiliary_loss_mlp": 0.0109816, + "balance_loss_clip": 1.0724653, + "balance_loss_mlp": 1.05634117, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.561711042605271, + "language_loss": 0.81957674, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84343159, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.626814126968384 + }, + { + "auxiliary_loss_clip": 0.01287555, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.06770146, + "balance_loss_mlp": 1.04298735, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 2.1750932147582964, + "language_loss": 0.83558714, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85926306, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 2.5746910572052 + }, + { + "auxiliary_loss_clip": 0.0128145, + "auxiliary_loss_mlp": 0.01084462, + "balance_loss_clip": 1.06649852, + "balance_loss_mlp": 1.0456233, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 8.27522354029954, + "language_loss": 0.74331594, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76697505, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.6827125549316406 + }, + { + "auxiliary_loss_clip": 0.01292101, + "auxiliary_loss_mlp": 0.01100959, + "balance_loss_clip": 1.07212996, + "balance_loss_mlp": 1.06204939, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 3.210743302236996, + "language_loss": 0.83046168, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85439229, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 4.083936929702759 + }, + { + "auxiliary_loss_clip": 0.01287428, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_clip": 1.06519866, + "balance_loss_mlp": 1.05803442, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.395658201889729, + "language_loss": 0.85424078, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87811047, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.6128990650177 + }, + { + "auxiliary_loss_clip": 0.01159106, + "auxiliary_loss_mlp": 0.01058912, + "balance_loss_clip": 1.04712462, + "balance_loss_mlp": 1.04880321, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.9028051964954737, + "language_loss": 0.57188541, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59406561, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 4.631886005401611 + }, + { + "auxiliary_loss_clip": 0.01287499, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_clip": 1.07418776, + "balance_loss_mlp": 1.06520176, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.6285188647402853, + "language_loss": 0.88111973, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90502989, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 4.085601568222046 + }, + { + "auxiliary_loss_clip": 0.01278556, + "auxiliary_loss_mlp": 0.01088302, + "balance_loss_clip": 1.06656241, + "balance_loss_mlp": 1.04710376, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.7083196261005944, + "language_loss": 0.85818124, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88184983, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 4.075204849243164 + }, + { + "auxiliary_loss_clip": 0.01285333, + "auxiliary_loss_mlp": 0.01093308, + "balance_loss_clip": 1.06548214, + "balance_loss_mlp": 1.05239499, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.3457463156840666, + "language_loss": 0.88133526, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90512168, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.6290528774261475 + }, + { + "auxiliary_loss_clip": 0.01286846, + "auxiliary_loss_mlp": 0.01088207, + "balance_loss_clip": 1.07298231, + "balance_loss_mlp": 1.04977393, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 2.0961453803153423, + "language_loss": 0.84352684, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86727738, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.548088312149048 + }, + { + "auxiliary_loss_clip": 0.0128115, + "auxiliary_loss_mlp": 0.01085448, + "balance_loss_clip": 1.06748676, + "balance_loss_mlp": 1.04854047, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.953747181554095, + "language_loss": 0.8687371, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89240313, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.507948398590088 + }, + { + "auxiliary_loss_clip": 0.01283079, + "auxiliary_loss_mlp": 0.01081803, + "balance_loss_clip": 1.06836963, + "balance_loss_mlp": 1.04522967, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.726621162301442, + "language_loss": 0.75948763, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78313643, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.6152803897857666 + }, + { + "auxiliary_loss_clip": 0.01283976, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_clip": 1.06648946, + "balance_loss_mlp": 1.05767298, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 3.3092674039797534, + "language_loss": 0.90069103, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92453098, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 2.5231447219848633 + }, + { + "auxiliary_loss_clip": 0.01284605, + "auxiliary_loss_mlp": 0.01105978, + "balance_loss_clip": 1.07082868, + "balance_loss_mlp": 1.06556642, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 2.174660437135107, + "language_loss": 0.7986933, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82259917, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 2.508927822113037 + }, + { + "auxiliary_loss_clip": 0.01279699, + "auxiliary_loss_mlp": 0.01097982, + "balance_loss_clip": 1.0652678, + "balance_loss_mlp": 1.05785584, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 2.3287811458370298, + "language_loss": 0.86707664, + "learning_rate": 3.913103228936546e-06, + "loss": 0.89085346, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 2.5700480937957764 + }, + { + "auxiliary_loss_clip": 0.01283903, + "auxiliary_loss_mlp": 0.01105953, + "balance_loss_clip": 1.06975174, + "balance_loss_mlp": 1.06709075, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 5.007040931559613, + "language_loss": 0.74963772, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77353632, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 2.546663999557495 + }, + { + "auxiliary_loss_clip": 0.01278497, + "auxiliary_loss_mlp": 0.01107399, + "balance_loss_clip": 1.06791568, + "balance_loss_mlp": 1.0654614, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.382769648285649, + "language_loss": 0.9153254, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93918443, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 2.526486396789551 + }, + { + "auxiliary_loss_clip": 0.01146723, + "auxiliary_loss_mlp": 0.01062789, + "balance_loss_clip": 1.03865981, + "balance_loss_mlp": 1.0526799, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.874857852160406, + "language_loss": 0.62656558, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64866066, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.26533842086792 + }, + { + "auxiliary_loss_clip": 0.01290142, + "auxiliary_loss_mlp": 0.01102389, + "balance_loss_clip": 1.07271898, + "balance_loss_mlp": 1.0623107, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 1.8358891658418446, + "language_loss": 0.75876528, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78269064, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.6093342304229736 + }, + { + "auxiliary_loss_clip": 0.0128027, + "auxiliary_loss_mlp": 0.01080994, + "balance_loss_clip": 1.06848919, + "balance_loss_mlp": 1.0420599, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.8934603986381813, + "language_loss": 0.83243728, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85605001, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.53175950050354 + }, + { + "auxiliary_loss_clip": 0.012839, + "auxiliary_loss_mlp": 0.01086907, + "balance_loss_clip": 1.06750131, + "balance_loss_mlp": 1.04618549, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.3166792072759432, + "language_loss": 0.78313339, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80684137, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.588040828704834 + }, + { + "auxiliary_loss_clip": 0.01137547, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.03231645, + "balance_loss_mlp": 1.02604377, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9660635098735028, + "language_loss": 0.64454961, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66628134, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 2.9571967124938965 + }, + { + "auxiliary_loss_clip": 0.01284587, + "auxiliary_loss_mlp": 0.01092842, + "balance_loss_clip": 1.07184434, + "balance_loss_mlp": 1.05307364, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 1.8751908209728216, + "language_loss": 0.82358825, + "learning_rate": 3.924809954779425e-06, + "loss": 0.84736258, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.541508197784424 + }, + { + "auxiliary_loss_clip": 0.01287848, + "auxiliary_loss_mlp": 0.01089819, + "balance_loss_clip": 1.07099175, + "balance_loss_mlp": 1.04769015, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.2917286608880896, + "language_loss": 0.95703161, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.98080826, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.557766914367676 + }, + { + "auxiliary_loss_clip": 0.01281639, + "auxiliary_loss_mlp": 0.0109888, + "balance_loss_clip": 1.07008111, + "balance_loss_mlp": 1.05832529, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.3473719643626683, + "language_loss": 0.91987157, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94367683, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.5572149753570557 + }, + { + "auxiliary_loss_clip": 0.01281314, + "auxiliary_loss_mlp": 0.01092935, + "balance_loss_clip": 1.07117188, + "balance_loss_mlp": 1.05302334, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.507680973079564, + "language_loss": 0.79768538, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82142794, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.5331642627716064 + }, + { + "auxiliary_loss_clip": 0.01278461, + "auxiliary_loss_mlp": 0.01082046, + "balance_loss_clip": 1.06746125, + "balance_loss_mlp": 1.04528213, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.5490470760033457, + "language_loss": 0.86650223, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89010727, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 2.526880979537964 + }, + { + "auxiliary_loss_clip": 0.01273813, + "auxiliary_loss_mlp": 0.01101678, + "balance_loss_clip": 1.06722188, + "balance_loss_mlp": 1.06481862, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 3.200152010820515, + "language_loss": 0.88679242, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91054732, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 2.7094686031341553 + }, + { + "auxiliary_loss_clip": 0.01284894, + "auxiliary_loss_mlp": 0.01091092, + "balance_loss_clip": 1.06862938, + "balance_loss_mlp": 1.05063283, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 2.67581734562042, + "language_loss": 0.80654812, + "learning_rate": 3.933452395729493e-06, + "loss": 0.8303079, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.604323387145996 + }, + { + "auxiliary_loss_clip": 0.01277992, + "auxiliary_loss_mlp": 0.01089139, + "balance_loss_clip": 1.07158947, + "balance_loss_mlp": 1.05032432, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.357166139452422, + "language_loss": 0.81581175, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83948302, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 2.5555129051208496 + }, + { + "auxiliary_loss_clip": 0.01279734, + "auxiliary_loss_mlp": 0.01098519, + "balance_loss_clip": 1.07313991, + "balance_loss_mlp": 1.05944192, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.6745746697645962, + "language_loss": 0.77140629, + "learning_rate": 3.936307620734599e-06, + "loss": 0.7951889, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.5676541328430176 + }, + { + "auxiliary_loss_clip": 0.0127662, + "auxiliary_loss_mlp": 0.01092112, + "balance_loss_clip": 1.06924605, + "balance_loss_mlp": 1.05305922, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.7825296952124172, + "language_loss": 0.72994953, + "learning_rate": 3.937730499067294e-06, + "loss": 0.75363684, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 2.574093818664551 + }, + { + "auxiliary_loss_clip": 0.01272051, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_clip": 1.06655383, + "balance_loss_mlp": 1.05312324, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 2.178177564576638, + "language_loss": 0.82422662, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84785843, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.711076498031616 + }, + { + "auxiliary_loss_clip": 0.01278589, + "auxiliary_loss_mlp": 0.01090186, + "balance_loss_clip": 1.07088566, + "balance_loss_mlp": 1.05530488, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 3.0558721108866256, + "language_loss": 0.75478494, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77847266, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.5334231853485107 + }, + { + "auxiliary_loss_clip": 0.01278138, + "auxiliary_loss_mlp": 0.01098206, + "balance_loss_clip": 1.06822872, + "balance_loss_mlp": 1.06084609, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.0859972129635636, + "language_loss": 0.80590338, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82966679, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 2.5123064517974854 + }, + { + "auxiliary_loss_clip": 0.012717, + "auxiliary_loss_mlp": 0.01082652, + "balance_loss_clip": 1.06683517, + "balance_loss_mlp": 1.04374242, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.0330387275553656, + "language_loss": 0.81972301, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.84326649, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 2.497703790664673 + }, + { + "auxiliary_loss_clip": 0.01278807, + "auxiliary_loss_mlp": 0.01090149, + "balance_loss_clip": 1.06780875, + "balance_loss_mlp": 1.05257404, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.1135136516567603, + "language_loss": 0.94153559, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96522516, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 2.5877861976623535 + }, + { + "auxiliary_loss_clip": 0.01274353, + "auxiliary_loss_mlp": 0.01094238, + "balance_loss_clip": 1.06677461, + "balance_loss_mlp": 1.05809426, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.3330978239680027, + "language_loss": 0.7918433, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81552923, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.491400718688965 + }, + { + "auxiliary_loss_clip": 0.01280214, + "auxiliary_loss_mlp": 0.01092408, + "balance_loss_clip": 1.07340074, + "balance_loss_mlp": 1.05118561, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 2.614220839134655, + "language_loss": 0.83657372, + "learning_rate": 3.947603562811407e-06, + "loss": 0.86029994, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 2.5484800338745117 + }, + { + "auxiliary_loss_clip": 0.01130521, + "auxiliary_loss_mlp": 0.010604, + "balance_loss_clip": 1.02776396, + "balance_loss_mlp": 1.05048132, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5947252509302174, + "language_loss": 0.73590374, + "learning_rate": 3.949001722282675e-06, + "loss": 0.75781298, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 3.2046968936920166 + }, + { + "auxiliary_loss_clip": 0.01272945, + "auxiliary_loss_mlp": 0.01086001, + "balance_loss_clip": 1.07332468, + "balance_loss_mlp": 1.05114412, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.5590125372645867, + "language_loss": 0.81097579, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83456528, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.6252923011779785 + }, + { + "auxiliary_loss_clip": 0.01275073, + "auxiliary_loss_mlp": 0.01084545, + "balance_loss_clip": 1.07011127, + "balance_loss_mlp": 1.04983127, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.511740880241463, + "language_loss": 0.90481579, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92841196, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 2.54569411277771 + }, + { + "auxiliary_loss_clip": 0.01128152, + "auxiliary_loss_mlp": 0.01016766, + "balance_loss_clip": 1.02675152, + "balance_loss_mlp": 1.00780106, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8805704406193595, + "language_loss": 0.58995563, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61140478, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 3.0461196899414062 + }, + { + "auxiliary_loss_clip": 0.01286176, + "auxiliary_loss_mlp": 0.01102279, + "balance_loss_clip": 1.07499838, + "balance_loss_mlp": 1.06403685, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.4382483287604493, + "language_loss": 0.81294149, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83682603, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.563361167907715 + }, + { + "auxiliary_loss_clip": 0.01272281, + "auxiliary_loss_mlp": 0.01087136, + "balance_loss_clip": 1.06833649, + "balance_loss_mlp": 1.04972792, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.4697923054610196, + "language_loss": 0.78600025, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80959439, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 4.079269170761108 + }, + { + "auxiliary_loss_clip": 0.01274089, + "auxiliary_loss_mlp": 0.01096946, + "balance_loss_clip": 1.07202768, + "balance_loss_mlp": 1.06042027, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.3804683199552357, + "language_loss": 0.87671286, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90042323, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.541062355041504 + }, + { + "auxiliary_loss_clip": 0.01277608, + "auxiliary_loss_mlp": 0.01106149, + "balance_loss_clip": 1.070611, + "balance_loss_mlp": 1.06673813, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.0342630735886074, + "language_loss": 0.86001027, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88384783, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 3.9893898963928223 + }, + { + "auxiliary_loss_clip": 0.01274067, + "auxiliary_loss_mlp": 0.01088021, + "balance_loss_clip": 1.06860685, + "balance_loss_mlp": 1.04882479, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.347520995352097, + "language_loss": 0.91745555, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94107646, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 4.051377773284912 + }, + { + "auxiliary_loss_clip": 0.01271702, + "auxiliary_loss_mlp": 0.01098595, + "balance_loss_clip": 1.07024944, + "balance_loss_mlp": 1.06056714, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 2.8269019749443816, + "language_loss": 0.81677622, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84047914, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.508265495300293 + }, + { + "auxiliary_loss_clip": 0.01273653, + "auxiliary_loss_mlp": 0.01089104, + "balance_loss_clip": 1.06732512, + "balance_loss_mlp": 1.05090904, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.7751550898115807, + "language_loss": 0.93108201, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95470953, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.5595288276672363 + }, + { + "auxiliary_loss_clip": 0.01269902, + "auxiliary_loss_mlp": 0.01101171, + "balance_loss_clip": 1.06742334, + "balance_loss_mlp": 1.06426418, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.8686371571325149, + "language_loss": 0.76210201, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78581274, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 2.6120104789733887 + }, + { + "auxiliary_loss_clip": 0.01274321, + "auxiliary_loss_mlp": 0.01088336, + "balance_loss_clip": 1.06573415, + "balance_loss_mlp": 1.05293059, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 1.974475210966459, + "language_loss": 0.93830395, + "learning_rate": 3.965547014290071e-06, + "loss": 0.96193051, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 2.4823875427246094 + }, + { + "auxiliary_loss_clip": 0.01281167, + "auxiliary_loss_mlp": 0.01122267, + "balance_loss_clip": 1.07096219, + "balance_loss_mlp": 1.08586085, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 6.105125790558711, + "language_loss": 0.88372749, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90776181, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 2.4769086837768555 + }, + { + "auxiliary_loss_clip": 0.01271792, + "auxiliary_loss_mlp": 0.01092067, + "balance_loss_clip": 1.06941485, + "balance_loss_mlp": 1.05425429, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.3480664911430273, + "language_loss": 0.78960413, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81324267, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 2.5857224464416504 + }, + { + "auxiliary_loss_clip": 0.0112305, + "auxiliary_loss_mlp": 0.01064771, + "balance_loss_clip": 1.02450633, + "balance_loss_mlp": 1.05647421, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9255295960385355, + "language_loss": 0.66972196, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6916002, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 3.013180732727051 + }, + { + "auxiliary_loss_clip": 0.01273937, + "auxiliary_loss_mlp": 0.01088983, + "balance_loss_clip": 1.06962252, + "balance_loss_mlp": 1.05155158, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.072307559719196, + "language_loss": 0.83905113, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86268032, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.5768866539001465 + }, + { + "auxiliary_loss_clip": 0.01279867, + "auxiliary_loss_mlp": 0.01095837, + "balance_loss_clip": 1.07331085, + "balance_loss_mlp": 1.05792797, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.0953485394140516, + "language_loss": 0.82261348, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84637052, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.584162712097168 + }, + { + "auxiliary_loss_clip": 0.01271388, + "auxiliary_loss_mlp": 0.01082952, + "balance_loss_clip": 1.06734586, + "balance_loss_mlp": 1.04611588, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 6.261685712919392, + "language_loss": 0.81222117, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83576465, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.539417266845703 + }, + { + "auxiliary_loss_clip": 0.01270064, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.0673914, + "balance_loss_mlp": 1.04492259, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.8771117784500344, + "language_loss": 0.73552716, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75906825, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.5864062309265137 + }, + { + "auxiliary_loss_clip": 0.01267797, + "auxiliary_loss_mlp": 0.01071294, + "balance_loss_clip": 1.06778502, + "balance_loss_mlp": 1.0372715, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 3.2738053057163072, + "language_loss": 0.88070315, + "learning_rate": 3.976345626888605e-06, + "loss": 0.9040941, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.6305015087127686 + }, + { + "auxiliary_loss_clip": 0.01118706, + "auxiliary_loss_mlp": 0.01016877, + "balance_loss_clip": 1.0216912, + "balance_loss_mlp": 1.00896168, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8366490833075065, + "language_loss": 0.66080159, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68215746, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 2.869310140609741 + }, + { + "auxiliary_loss_clip": 0.01282482, + "auxiliary_loss_mlp": 0.01080734, + "balance_loss_clip": 1.07160664, + "balance_loss_mlp": 1.04540002, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.4372155828769455, + "language_loss": 0.7902379, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81387007, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.539362668991089 + }, + { + "auxiliary_loss_clip": 0.0127866, + "auxiliary_loss_mlp": 0.01102801, + "balance_loss_clip": 1.07147944, + "balance_loss_mlp": 1.06520271, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.4850292028054075, + "language_loss": 0.75702453, + "learning_rate": 3.980348865796749e-06, + "loss": 0.78083915, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.549746513366699 + }, + { + "auxiliary_loss_clip": 0.01274065, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.069417, + "balance_loss_mlp": 1.048244, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.2018456234931167, + "language_loss": 0.84376597, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86734128, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.502662181854248 + }, + { + "auxiliary_loss_clip": 0.0128078, + "auxiliary_loss_mlp": 0.01091766, + "balance_loss_clip": 1.07627821, + "balance_loss_mlp": 1.0547632, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 2.0046524288864096, + "language_loss": 0.84749031, + "learning_rate": 3.983003930109732e-06, + "loss": 0.87121582, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.534113883972168 + }, + { + "auxiliary_loss_clip": 0.01274492, + "auxiliary_loss_mlp": 0.01092658, + "balance_loss_clip": 1.06899977, + "balance_loss_mlp": 1.05577517, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 2.2103934308529376, + "language_loss": 0.88982558, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91349709, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.5498428344726562 + }, + { + "auxiliary_loss_clip": 0.01276458, + "auxiliary_loss_mlp": 0.01073324, + "balance_loss_clip": 1.07173443, + "balance_loss_mlp": 1.03872943, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.5489109559116434, + "language_loss": 0.88475323, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90825105, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.5332889556884766 + }, + { + "auxiliary_loss_clip": 0.01273118, + "auxiliary_loss_mlp": 0.01089864, + "balance_loss_clip": 1.07024741, + "balance_loss_mlp": 1.0529331, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 1.8331433038844194, + "language_loss": 0.88987857, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91350842, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 2.575556755065918 + }, + { + "auxiliary_loss_clip": 0.01265679, + "auxiliary_loss_mlp": 0.01079233, + "balance_loss_clip": 1.0648315, + "balance_loss_mlp": 1.04258847, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 1.8361732971788878, + "language_loss": 0.89023554, + "learning_rate": 3.988281436571815e-06, + "loss": 0.91368473, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.569300413131714 + }, + { + "auxiliary_loss_clip": 0.01274956, + "auxiliary_loss_mlp": 0.0109345, + "balance_loss_clip": 1.06900311, + "balance_loss_mlp": 1.05721021, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.3694169664575817, + "language_loss": 0.91858679, + "learning_rate": 3.989594081641164e-06, + "loss": 0.94227087, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.53529953956604 + }, + { + "auxiliary_loss_clip": 0.01262105, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_clip": 1.06616354, + "balance_loss_mlp": 1.04510951, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.0403710107404467, + "language_loss": 0.85600299, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87941581, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.5433921813964844 + }, + { + "auxiliary_loss_clip": 0.01275605, + "auxiliary_loss_mlp": 0.01108899, + "balance_loss_clip": 1.07250762, + "balance_loss_mlp": 1.07215881, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 4.4935545654425155, + "language_loss": 0.84504473, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86888981, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.5374603271484375 + }, + { + "auxiliary_loss_clip": 0.01269473, + "auxiliary_loss_mlp": 0.01080553, + "balance_loss_clip": 1.06644177, + "balance_loss_mlp": 1.04476631, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 2.493700260723298, + "language_loss": 0.86951196, + "learning_rate": 3.99351603600268e-06, + "loss": 0.89301217, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 2.5361576080322266 + }, + { + "auxiliary_loss_clip": 0.01275974, + "auxiliary_loss_mlp": 0.01088483, + "balance_loss_clip": 1.07058406, + "balance_loss_mlp": 1.05570054, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 2.2147230080299725, + "language_loss": 0.86484045, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88848501, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.554537296295166 + }, + { + "auxiliary_loss_clip": 0.01262908, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_clip": 1.06659114, + "balance_loss_mlp": 1.04680824, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.948081306069555, + "language_loss": 0.61792421, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64136302, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 2.6210403442382812 + }, + { + "auxiliary_loss_clip": 0.01268194, + "auxiliary_loss_mlp": 0.01092526, + "balance_loss_clip": 1.07123411, + "balance_loss_mlp": 1.05764532, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.7798665654333847, + "language_loss": 0.88862193, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91222906, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 2.561246871948242 + }, + { + "auxiliary_loss_clip": 0.01273402, + "auxiliary_loss_mlp": 0.01084418, + "balance_loss_clip": 1.07143545, + "balance_loss_mlp": 1.04991889, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 4.596412639632157, + "language_loss": 0.85344833, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87702656, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 2.6697838306427 + }, + { + "auxiliary_loss_clip": 0.01267885, + "auxiliary_loss_mlp": 0.01086307, + "balance_loss_clip": 1.06850874, + "balance_loss_mlp": 1.05304766, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 4.651577021535878, + "language_loss": 0.78260732, + "learning_rate": 4e-06, + "loss": 0.80614924, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 2.5154688358306885 + }, + { + "auxiliary_loss_clip": 0.01268907, + "auxiliary_loss_mlp": 0.01085953, + "balance_loss_clip": 1.07026303, + "balance_loss_mlp": 1.05171657, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 1.9897794525570651, + "language_loss": 0.8276726, + "learning_rate": 3.9999999620799e-06, + "loss": 0.8512212, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.5571722984313965 + }, + { + "auxiliary_loss_clip": 0.01260854, + "auxiliary_loss_mlp": 0.01090236, + "balance_loss_clip": 1.06479573, + "balance_loss_mlp": 1.05332875, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 4.763968655865359, + "language_loss": 0.88476515, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90827608, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.5335352420806885 + }, + { + "auxiliary_loss_clip": 0.01271133, + "auxiliary_loss_mlp": 0.0107753, + "balance_loss_clip": 1.06891644, + "balance_loss_mlp": 1.04484248, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.320618514876529, + "language_loss": 0.87014198, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.8936286, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.5028183460235596 + }, + { + "auxiliary_loss_clip": 0.01266281, + "auxiliary_loss_mlp": 0.01081986, + "balance_loss_clip": 1.06988883, + "balance_loss_mlp": 1.047225, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 3.7263690184393066, + "language_loss": 0.84570771, + "learning_rate": 3.999999393278425e-06, + "loss": 0.86919045, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 2.5289173126220703 + }, + { + "auxiliary_loss_clip": 0.01259427, + "auxiliary_loss_mlp": 0.01091878, + "balance_loss_clip": 1.06687224, + "balance_loss_mlp": 1.05759358, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.8872632885450973, + "language_loss": 0.88494086, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90845388, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.595611810684204 + }, + { + "auxiliary_loss_clip": 0.01262167, + "auxiliary_loss_mlp": 0.01095284, + "balance_loss_clip": 1.06611574, + "balance_loss_mlp": 1.06107116, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.102711043149905, + "language_loss": 0.779881, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80345553, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.51688814163208 + }, + { + "auxiliary_loss_clip": 0.01118602, + "auxiliary_loss_mlp": 0.01012149, + "balance_loss_clip": 1.02016056, + "balance_loss_mlp": 1.00437629, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8414061854424846, + "language_loss": 0.54953241, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57083988, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.213446855545044 + }, + { + "auxiliary_loss_clip": 0.01262586, + "auxiliary_loss_mlp": 0.01094173, + "balance_loss_clip": 1.06610966, + "balance_loss_mlp": 1.05957866, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 1.967340432650373, + "language_loss": 0.83298385, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85655141, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 4.117897987365723 + }, + { + "auxiliary_loss_clip": 0.01266729, + "auxiliary_loss_mlp": 0.01082717, + "balance_loss_clip": 1.066486, + "balance_loss_mlp": 1.04890895, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.8476833208414893, + "language_loss": 0.89040053, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91389501, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 2.6249780654907227 + }, + { + "auxiliary_loss_clip": 0.01267803, + "auxiliary_loss_mlp": 0.01068346, + "balance_loss_clip": 1.06680369, + "balance_loss_mlp": 1.03427625, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.0761082805716153, + "language_loss": 0.71733904, + "learning_rate": 3.999996207991165e-06, + "loss": 0.74070054, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 4.1388185024261475 + }, + { + "auxiliary_loss_clip": 0.01260102, + "auxiliary_loss_mlp": 0.01076496, + "balance_loss_clip": 1.06697178, + "balance_loss_mlp": 1.04485834, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 1.9950215103239763, + "language_loss": 0.82504928, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84841526, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 5.541987419128418 + }, + { + "auxiliary_loss_clip": 0.0126392, + "auxiliary_loss_mlp": 0.01084093, + "balance_loss_clip": 1.06921959, + "balance_loss_mlp": 1.04985583, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.2757657355046277, + "language_loss": 0.83919644, + "learning_rate": 3.999994539508036e-06, + "loss": 0.8626765, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 2.57700252532959 + }, + { + "auxiliary_loss_clip": 0.0126455, + "auxiliary_loss_mlp": 0.01082198, + "balance_loss_clip": 1.06628251, + "balance_loss_mlp": 1.0492723, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.070877336624771, + "language_loss": 0.82277048, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84623796, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 2.5570313930511475 + }, + { + "auxiliary_loss_clip": 0.01261583, + "auxiliary_loss_mlp": 0.01086965, + "balance_loss_clip": 1.06488419, + "balance_loss_mlp": 1.0522753, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 2.7590049807335704, + "language_loss": 0.86994737, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89343286, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 2.596407890319824 + }, + { + "auxiliary_loss_clip": 0.01271454, + "auxiliary_loss_mlp": 0.01085158, + "balance_loss_clip": 1.06964505, + "balance_loss_mlp": 1.05132651, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 1.771249935878634, + "language_loss": 0.7927115, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81627762, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 2.5229837894439697 + }, + { + "auxiliary_loss_clip": 0.01261428, + "auxiliary_loss_mlp": 0.010698, + "balance_loss_clip": 1.0681138, + "balance_loss_mlp": 1.03890085, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 10.305504069090663, + "language_loss": 0.77313948, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79645175, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 2.5413665771484375 + }, + { + "auxiliary_loss_clip": 0.0125862, + "auxiliary_loss_mlp": 0.01073338, + "balance_loss_clip": 1.0621419, + "balance_loss_mlp": 1.03917277, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 2.128210862378184, + "language_loss": 0.82824528, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85156482, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.746173858642578 + }, + { + "auxiliary_loss_clip": 0.01255728, + "auxiliary_loss_mlp": 0.0107716, + "balance_loss_clip": 1.06432688, + "balance_loss_mlp": 1.04413891, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 3.647818980283912, + "language_loss": 0.7885828, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81191158, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.5440173149108887 + }, + { + "auxiliary_loss_clip": 0.01254841, + "auxiliary_loss_mlp": 0.01076634, + "balance_loss_clip": 1.0654192, + "balance_loss_mlp": 1.04451919, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.6077003424406426, + "language_loss": 0.90824258, + "learning_rate": 3.999986310859396e-06, + "loss": 0.9315573, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.6628761291503906 + }, + { + "auxiliary_loss_clip": 0.01268702, + "auxiliary_loss_mlp": 0.01099721, + "balance_loss_clip": 1.07349682, + "balance_loss_mlp": 1.06414914, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 2.145454448285922, + "language_loss": 0.86498326, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88866746, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.539546251296997 + }, + { + "auxiliary_loss_clip": 0.01261636, + "auxiliary_loss_mlp": 0.01089488, + "balance_loss_clip": 1.06369662, + "balance_loss_mlp": 1.0568006, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.104613274826493, + "language_loss": 0.87264848, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89615977, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.5329573154449463 + }, + { + "auxiliary_loss_clip": 0.01265676, + "auxiliary_loss_mlp": 0.01088954, + "balance_loss_clip": 1.06702328, + "balance_loss_mlp": 1.05466914, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.7046132396730205, + "language_loss": 0.89388227, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91742849, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.560288667678833 + }, + { + "auxiliary_loss_clip": 0.01258736, + "auxiliary_loss_mlp": 0.01080572, + "balance_loss_clip": 1.06597185, + "balance_loss_mlp": 1.04612076, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 1.903993724758476, + "language_loss": 0.71223283, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73562586, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.5796680450439453 + }, + { + "auxiliary_loss_clip": 0.01263911, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_clip": 1.0653379, + "balance_loss_mlp": 1.05411029, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 2.947844747920329, + "language_loss": 0.84663802, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87014294, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.533125877380371 + }, + { + "auxiliary_loss_clip": 0.01266818, + "auxiliary_loss_mlp": 0.01076605, + "balance_loss_clip": 1.06591535, + "balance_loss_mlp": 1.04296446, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 2.2803771015432512, + "language_loss": 0.90228617, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92572039, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.5606021881103516 + }, + { + "auxiliary_loss_clip": 0.01268881, + "auxiliary_loss_mlp": 0.01087117, + "balance_loss_clip": 1.06898248, + "balance_loss_mlp": 1.05307102, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.7528475751949197, + "language_loss": 0.8005594, + "learning_rate": 3.999974366066933e-06, + "loss": 0.82411939, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.5914196968078613 + }, + { + "auxiliary_loss_clip": 0.01260018, + "auxiliary_loss_mlp": 0.01087733, + "balance_loss_clip": 1.06271255, + "balance_loss_mlp": 1.05433011, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.518847315810695, + "language_loss": 0.80628651, + "learning_rate": 3.999972356310538e-06, + "loss": 0.82976401, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.478001832962036 + }, + { + "auxiliary_loss_clip": 0.01271489, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_clip": 1.07118559, + "balance_loss_mlp": 1.03579473, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 1.9900772763887133, + "language_loss": 0.8147428, + "learning_rate": 3.999970270714991e-06, + "loss": 0.8381809, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 2.5245485305786133 + }, + { + "auxiliary_loss_clip": 0.01256686, + "auxiliary_loss_mlp": 0.0108201, + "balance_loss_clip": 1.062356, + "balance_loss_mlp": 1.04789221, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.015923297375545, + "language_loss": 0.93875521, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96214217, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.547663927078247 + }, + { + "auxiliary_loss_clip": 0.01258415, + "auxiliary_loss_mlp": 0.0107489, + "balance_loss_clip": 1.06303942, + "balance_loss_mlp": 1.04203606, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 2.0055538014982033, + "language_loss": 0.83978188, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86311489, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 2.6416690349578857 + }, + { + "auxiliary_loss_clip": 0.01262582, + "auxiliary_loss_mlp": 0.01090145, + "balance_loss_clip": 1.07145786, + "balance_loss_mlp": 1.05721927, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 1.90652992858638, + "language_loss": 0.90531284, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92884016, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.57738995552063 + }, + { + "auxiliary_loss_clip": 0.01256033, + "auxiliary_loss_mlp": 0.01078278, + "balance_loss_clip": 1.05900407, + "balance_loss_mlp": 1.04361248, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.0640295636685866, + "language_loss": 0.76172423, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78506732, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.559051513671875 + }, + { + "auxiliary_loss_clip": 0.01256545, + "auxiliary_loss_mlp": 0.01067409, + "balance_loss_clip": 1.0618583, + "balance_loss_mlp": 1.03288603, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.108841269085077, + "language_loss": 0.90798914, + "learning_rate": 3.999958705152843e-06, + "loss": 0.93122864, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 2.568490505218506 + }, + { + "auxiliary_loss_clip": 0.01118721, + "auxiliary_loss_mlp": 0.01010504, + "balance_loss_clip": 1.01899958, + "balance_loss_mlp": 1.00263584, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7264457656870527, + "language_loss": 0.57937825, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60067046, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.120417594909668 + }, + { + "auxiliary_loss_clip": 0.01255634, + "auxiliary_loss_mlp": 0.0108502, + "balance_loss_clip": 1.06201649, + "balance_loss_mlp": 1.05240405, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 2.119591872887368, + "language_loss": 0.86585176, + "learning_rate": 3.999953548056907e-06, + "loss": 0.88925833, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.599210500717163 + }, + { + "auxiliary_loss_clip": 0.01255278, + "auxiliary_loss_mlp": 0.01071041, + "balance_loss_clip": 1.06361055, + "balance_loss_mlp": 1.03823495, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.2596009156355277, + "language_loss": 0.77213687, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79540008, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 2.6486737728118896 + }, + { + "auxiliary_loss_clip": 0.01259407, + "auxiliary_loss_mlp": 0.01087976, + "balance_loss_clip": 1.06492329, + "balance_loss_mlp": 1.05471635, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.3397353397665146, + "language_loss": 0.80850625, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83198011, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.739222288131714 + }, + { + "auxiliary_loss_clip": 0.01259343, + "auxiliary_loss_mlp": 0.01081145, + "balance_loss_clip": 1.06512868, + "balance_loss_mlp": 1.04607391, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 1.9491049124679523, + "language_loss": 0.70174825, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72515315, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.01260696, + "auxiliary_loss_mlp": 0.01082414, + "balance_loss_clip": 1.0701201, + "balance_loss_mlp": 1.04927433, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 1.9944468169353644, + "language_loss": 0.82957113, + "learning_rate": 3.999942323804607e-06, + "loss": 0.85300219, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 2.582740545272827 + }, + { + "auxiliary_loss_clip": 0.01265957, + "auxiliary_loss_mlp": 0.01081969, + "balance_loss_clip": 1.06580925, + "balance_loss_mlp": 1.04861474, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8329566421711072, + "language_loss": 0.79564375, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81912297, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 2.5679707527160645 + }, + { + "auxiliary_loss_clip": 0.01258711, + "auxiliary_loss_mlp": 0.01069119, + "balance_loss_clip": 1.0649873, + "balance_loss_mlp": 1.03438139, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.661592447495145, + "language_loss": 0.77530646, + "learning_rate": 3.999936256649943e-06, + "loss": 0.79858482, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 2.5984208583831787 + }, + { + "auxiliary_loss_clip": 0.01267714, + "auxiliary_loss_mlp": 0.01076541, + "balance_loss_clip": 1.07006109, + "balance_loss_mlp": 1.0433296, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 1.9615925137285612, + "language_loss": 0.85767353, + "learning_rate": 3.999933109315878e-06, + "loss": 0.88111609, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.531059503555298 + }, + { + "auxiliary_loss_clip": 0.01255823, + "auxiliary_loss_mlp": 0.01090175, + "balance_loss_clip": 1.06604862, + "balance_loss_mlp": 1.05498421, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 4.057410733102146, + "language_loss": 0.88881409, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91227406, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.562098264694214 + }, + { + "auxiliary_loss_clip": 0.01259572, + "auxiliary_loss_mlp": 0.0107503, + "balance_loss_clip": 1.06483817, + "balance_loss_mlp": 1.04084063, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.3092746135337943, + "language_loss": 0.71408939, + "learning_rate": 3.999926587134879e-06, + "loss": 0.7374354, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 2.5527114868164062 + }, + { + "auxiliary_loss_clip": 0.01257209, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_clip": 1.05991006, + "balance_loss_mlp": 1.05617249, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.8703662511145804, + "language_loss": 0.91368985, + "learning_rate": 3.999923212288192e-06, + "loss": 0.9371686, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.541466474533081 + }, + { + "auxiliary_loss_clip": 0.01260622, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_clip": 1.06638014, + "balance_loss_mlp": 1.0544579, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 2.7848673124995447, + "language_loss": 0.66348648, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68694937, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 2.487335681915283 + }, + { + "auxiliary_loss_clip": 0.01259175, + "auxiliary_loss_mlp": 0.01070069, + "balance_loss_clip": 1.06314564, + "balance_loss_mlp": 1.03669047, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.1056436830821905, + "language_loss": 0.92580283, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94909537, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.608412504196167 + }, + { + "auxiliary_loss_clip": 0.01256014, + "auxiliary_loss_mlp": 0.0107248, + "balance_loss_clip": 1.06067681, + "balance_loss_mlp": 1.03824294, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 21.24033550313775, + "language_loss": 0.81975096, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84303588, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.565974235534668 + }, + { + "auxiliary_loss_clip": 0.012579, + "auxiliary_loss_mlp": 0.01074854, + "balance_loss_clip": 1.06490731, + "balance_loss_mlp": 1.04047453, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 1.9754628993603705, + "language_loss": 0.81215358, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83548111, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 4.004130601882935 + }, + { + "auxiliary_loss_clip": 0.01259362, + "auxiliary_loss_mlp": 0.01081836, + "balance_loss_clip": 1.06480491, + "balance_loss_mlp": 1.04602551, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.645305113544353, + "language_loss": 0.67428231, + "learning_rate": 3.999905200498087e-06, + "loss": 0.6976943, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 3.982468843460083 + }, + { + "auxiliary_loss_clip": 0.01253003, + "auxiliary_loss_mlp": 0.01080405, + "balance_loss_clip": 1.06526756, + "balance_loss_mlp": 1.0470506, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.1743807210981116, + "language_loss": 0.86280888, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88614297, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 2.515310287475586 + }, + { + "auxiliary_loss_clip": 0.01260785, + "auxiliary_loss_mlp": 0.01095586, + "balance_loss_clip": 1.06878924, + "balance_loss_mlp": 1.06168282, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 2.239354312899383, + "language_loss": 0.81227994, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83584368, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 4.015713930130005 + }, + { + "auxiliary_loss_clip": 0.0126844, + "auxiliary_loss_mlp": 0.01090398, + "balance_loss_clip": 1.06893408, + "balance_loss_mlp": 1.05551767, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.418749276562834, + "language_loss": 0.86088586, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88447428, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 4.055293560028076 + }, + { + "auxiliary_loss_clip": 0.01262287, + "auxiliary_loss_mlp": 0.0108151, + "balance_loss_clip": 1.06812859, + "balance_loss_mlp": 1.04541397, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.3900399002432597, + "language_loss": 0.92661953, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95005751, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 2.535492420196533 + }, + { + "auxiliary_loss_clip": 0.01257777, + "auxiliary_loss_mlp": 0.01080975, + "balance_loss_clip": 1.06446517, + "balance_loss_mlp": 1.04497421, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.7045279701442984, + "language_loss": 0.787902, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81128955, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 2.5701711177825928 + }, + { + "auxiliary_loss_clip": 0.01255353, + "auxiliary_loss_mlp": 0.01092094, + "balance_loss_clip": 1.06549299, + "balance_loss_mlp": 1.05573547, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.2122891359191517, + "language_loss": 0.82249641, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84597093, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.55344820022583 + }, + { + "auxiliary_loss_clip": 0.01258147, + "auxiliary_loss_mlp": 0.01086221, + "balance_loss_clip": 1.06335068, + "balance_loss_mlp": 1.05055308, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.452941576050112, + "language_loss": 0.88702792, + "learning_rate": 3.999876798858914e-06, + "loss": 0.91047156, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 2.6314570903778076 + }, + { + "auxiliary_loss_clip": 0.0125709, + "auxiliary_loss_mlp": 0.01090488, + "balance_loss_clip": 1.06447124, + "balance_loss_mlp": 1.05434418, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 1.9900583737976254, + "language_loss": 0.83833623, + "learning_rate": 3.999872438138503e-06, + "loss": 0.861812, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.595128059387207 + }, + { + "auxiliary_loss_clip": 0.01262652, + "auxiliary_loss_mlp": 0.01070204, + "balance_loss_clip": 1.0685904, + "balance_loss_mlp": 1.0371592, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 2.74925727199939, + "language_loss": 0.94333231, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96666086, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.495131015777588 + }, + { + "auxiliary_loss_clip": 0.01253194, + "auxiliary_loss_mlp": 0.01076415, + "balance_loss_clip": 1.0616312, + "balance_loss_mlp": 1.04153395, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.979035283565803, + "language_loss": 0.77607071, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79936683, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 2.5752689838409424 + }, + { + "auxiliary_loss_clip": 0.01257003, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_clip": 1.06394339, + "balance_loss_mlp": 1.05122352, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.2159684139800575, + "language_loss": 0.87830687, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90172178, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.5646255016326904 + }, + { + "auxiliary_loss_clip": 0.01251798, + "auxiliary_loss_mlp": 0.01069531, + "balance_loss_clip": 1.06145239, + "balance_loss_mlp": 1.038131, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.3238631022258622, + "language_loss": 0.8144508, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83766413, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.5390214920043945 + }, + { + "auxiliary_loss_clip": 0.0125248, + "auxiliary_loss_mlp": 0.01075665, + "balance_loss_clip": 1.06424809, + "balance_loss_mlp": 1.04331183, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.6872820109383453, + "language_loss": 0.82106751, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84434897, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.6197290420532227 + }, + { + "auxiliary_loss_clip": 0.01260476, + "auxiliary_loss_mlp": 0.01082105, + "balance_loss_clip": 1.06750143, + "balance_loss_mlp": 1.04779685, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 2.2387324308266283, + "language_loss": 0.83913577, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86256158, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.5425493717193604 + }, + { + "auxiliary_loss_clip": 0.01258725, + "auxiliary_loss_mlp": 0.01086989, + "balance_loss_clip": 1.06721175, + "balance_loss_mlp": 1.05258536, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.1724363971239806, + "language_loss": 0.94160283, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96506, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.549372434616089 + }, + { + "auxiliary_loss_clip": 0.01257476, + "auxiliary_loss_mlp": 0.01077538, + "balance_loss_clip": 1.06257188, + "balance_loss_mlp": 1.04289556, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 2.9914590282460725, + "language_loss": 0.94435853, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96770859, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.6379005908966064 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.01008085, + "balance_loss_clip": 1.02206254, + "balance_loss_mlp": 0.99974042, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1128099387981336, + "language_loss": 0.5488236, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57006633, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 3.185551166534424 + }, + { + "auxiliary_loss_clip": 0.01257888, + "auxiliary_loss_mlp": 0.01076481, + "balance_loss_clip": 1.06477976, + "balance_loss_mlp": 1.04031277, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 3.29733633697584, + "language_loss": 0.76799709, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79134083, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.6245510578155518 + }, + { + "auxiliary_loss_clip": 0.01253074, + "auxiliary_loss_mlp": 0.01085566, + "balance_loss_clip": 1.06471515, + "balance_loss_mlp": 1.05311716, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 26.00868731572758, + "language_loss": 0.81101716, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83440357, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 2.6056790351867676 + }, + { + "auxiliary_loss_clip": 0.01251112, + "auxiliary_loss_mlp": 0.01085416, + "balance_loss_clip": 1.06497717, + "balance_loss_mlp": 1.05203748, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.8274882738950509, + "language_loss": 0.86834979, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89171505, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 2.5524423122406006 + }, + { + "auxiliary_loss_clip": 0.01252474, + "auxiliary_loss_mlp": 0.01085345, + "balance_loss_clip": 1.06312251, + "balance_loss_mlp": 1.05141842, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.983219083453777, + "language_loss": 0.95975089, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98312902, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.5498008728027344 + }, + { + "auxiliary_loss_clip": 0.01253142, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_clip": 1.05982733, + "balance_loss_mlp": 1.04624152, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 1.987185885763928, + "language_loss": 0.79810488, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.82146472, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 2.582260847091675 + }, + { + "auxiliary_loss_clip": 0.01254468, + "auxiliary_loss_mlp": 0.01084743, + "balance_loss_clip": 1.06500041, + "balance_loss_mlp": 1.05019617, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.1199156234068393, + "language_loss": 0.80301845, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82641059, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.5640757083892822 + }, + { + "auxiliary_loss_clip": 0.01261206, + "auxiliary_loss_mlp": 0.01076214, + "balance_loss_clip": 1.06770849, + "balance_loss_mlp": 1.04238248, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 2.687097987659379, + "language_loss": 0.84524548, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86861968, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.5028765201568604 + }, + { + "auxiliary_loss_clip": 0.01254124, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_clip": 1.06132209, + "balance_loss_mlp": 1.03793716, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 2.5031397162852747, + "language_loss": 0.76298159, + "learning_rate": 3.999786703227023e-06, + "loss": 0.7862224, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.6707751750946045 + }, + { + "auxiliary_loss_clip": 0.01252209, + "auxiliary_loss_mlp": 0.01067118, + "balance_loss_clip": 1.06274843, + "balance_loss_mlp": 1.035146, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 3.266784402385472, + "language_loss": 0.84005415, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86324745, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.5297765731811523 + }, + { + "auxiliary_loss_clip": 0.0124854, + "auxiliary_loss_mlp": 0.01076531, + "balance_loss_clip": 1.06461692, + "balance_loss_mlp": 1.04408205, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.067730365965163, + "language_loss": 0.83978927, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86303997, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 2.522630453109741 + }, + { + "auxiliary_loss_clip": 0.01256103, + "auxiliary_loss_mlp": 0.01078128, + "balance_loss_clip": 1.07208753, + "balance_loss_mlp": 1.04656124, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.213184198738914, + "language_loss": 0.86347079, + "learning_rate": 3.99976929854497e-06, + "loss": 0.8868131, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 2.6175670623779297 + }, + { + "auxiliary_loss_clip": 0.01253789, + "auxiliary_loss_mlp": 0.0107794, + "balance_loss_clip": 1.06769788, + "balance_loss_mlp": 1.04518139, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 1.9618261648900162, + "language_loss": 0.72358489, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74690223, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 2.535010576248169 + }, + { + "auxiliary_loss_clip": 0.01252689, + "auxiliary_loss_mlp": 0.01081462, + "balance_loss_clip": 1.06450915, + "balance_loss_mlp": 1.0475831, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.842893394506794, + "language_loss": 0.77712977, + "learning_rate": 3.999757316265973e-06, + "loss": 0.80047131, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.567802906036377 + }, + { + "auxiliary_loss_clip": 0.01250764, + "auxiliary_loss_mlp": 0.01085166, + "balance_loss_clip": 1.06417465, + "balance_loss_mlp": 1.05016649, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 1.9748918549417915, + "language_loss": 0.87011576, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89347512, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 2.563528537750244 + }, + { + "auxiliary_loss_clip": 0.01256846, + "auxiliary_loss_mlp": 0.01073491, + "balance_loss_clip": 1.06509459, + "balance_loss_mlp": 1.04280698, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 2.7562874215783055, + "language_loss": 0.82246625, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84576964, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.512787103652954 + }, + { + "auxiliary_loss_clip": 0.01251237, + "auxiliary_loss_mlp": 0.01072672, + "balance_loss_clip": 1.06588221, + "balance_loss_mlp": 1.04182053, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2275592490658815, + "language_loss": 0.77233255, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79557163, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.4859378337860107 + }, + { + "auxiliary_loss_clip": 0.01251895, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_clip": 1.06763518, + "balance_loss_mlp": 1.04928112, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.5985891622630906, + "language_loss": 0.8768971, + "learning_rate": 3.999732441737877e-06, + "loss": 0.90024519, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.5232350826263428 + }, + { + "auxiliary_loss_clip": 0.01256793, + "auxiliary_loss_mlp": 0.01094387, + "balance_loss_clip": 1.06615567, + "balance_loss_mlp": 1.06086588, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.620995940897381, + "language_loss": 0.81146097, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83497274, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.5380990505218506 + }, + { + "auxiliary_loss_clip": 0.01249519, + "auxiliary_loss_mlp": 0.010732, + "balance_loss_clip": 1.06118071, + "balance_loss_mlp": 1.04022634, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.9723580871796353, + "language_loss": 0.93063056, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95385778, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.486078977584839 + }, + { + "auxiliary_loss_clip": 0.01252847, + "auxiliary_loss_mlp": 0.01076506, + "balance_loss_clip": 1.06435418, + "balance_loss_mlp": 1.04374719, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.4579336815370896, + "language_loss": 0.87990618, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.90319967, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 2.557159900665283 + }, + { + "auxiliary_loss_clip": 0.01259928, + "auxiliary_loss_mlp": 0.0108209, + "balance_loss_clip": 1.06815934, + "balance_loss_mlp": 1.04928398, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.8591937535481886, + "language_loss": 0.76645195, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78987211, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 2.564701557159424 + }, + { + "auxiliary_loss_clip": 0.01258904, + "auxiliary_loss_mlp": 0.01068837, + "balance_loss_clip": 1.06684113, + "balance_loss_mlp": 1.03450489, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.7184997946663163, + "language_loss": 0.78590906, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80918652, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 2.584683656692505 + }, + { + "auxiliary_loss_clip": 0.01257258, + "auxiliary_loss_mlp": 0.01082509, + "balance_loss_clip": 1.06506419, + "balance_loss_mlp": 1.0470562, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.2002615984549414, + "language_loss": 0.93698424, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96038193, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 4.1766204833984375 + }, + { + "auxiliary_loss_clip": 0.01258589, + "auxiliary_loss_mlp": 0.01075657, + "balance_loss_clip": 1.06701374, + "balance_loss_mlp": 1.04385209, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.264257183461086, + "language_loss": 0.84268427, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86602664, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 2.647249698638916 + }, + { + "auxiliary_loss_clip": 0.01249829, + "auxiliary_loss_mlp": 0.01071236, + "balance_loss_clip": 1.06523776, + "balance_loss_mlp": 1.04064727, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.1977712758931065, + "language_loss": 0.87186217, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89507282, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 3.99640154838562 + }, + { + "auxiliary_loss_clip": 0.01253017, + "auxiliary_loss_mlp": 0.01081237, + "balance_loss_clip": 1.06193829, + "balance_loss_mlp": 1.04747736, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 1.8741377180213068, + "language_loss": 0.8301512, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85349375, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 4.033992767333984 + }, + { + "auxiliary_loss_clip": 0.01110687, + "auxiliary_loss_mlp": 0.01014093, + "balance_loss_clip": 1.01814842, + "balance_loss_mlp": 1.0061779, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.881281357883232, + "language_loss": 0.59802687, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61927474, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 4.756572008132935 + }, + { + "auxiliary_loss_clip": 0.01254542, + "auxiliary_loss_mlp": 0.01076249, + "balance_loss_clip": 1.06925082, + "balance_loss_mlp": 1.04263198, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.7623335721985915, + "language_loss": 0.87016606, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89347392, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.512723922729492 + }, + { + "auxiliary_loss_clip": 0.01250693, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_clip": 1.06276512, + "balance_loss_mlp": 1.04333353, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.1397331666035564, + "language_loss": 0.83653235, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85979593, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.5994720458984375 + }, + { + "auxiliary_loss_clip": 0.01251845, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_clip": 1.06733894, + "balance_loss_mlp": 1.05119443, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 5.997094445117972, + "language_loss": 0.96253407, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98589015, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.523859739303589 + }, + { + "auxiliary_loss_clip": 0.01253981, + "auxiliary_loss_mlp": 0.01078657, + "balance_loss_clip": 1.06704748, + "balance_loss_mlp": 1.04740071, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 3.4186973719001528, + "language_loss": 0.82737648, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85070288, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.55279803276062 + }, + { + "auxiliary_loss_clip": 0.01249654, + "auxiliary_loss_mlp": 0.01074963, + "balance_loss_clip": 1.06723642, + "balance_loss_mlp": 1.04277635, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 1.955374182704815, + "language_loss": 0.81156272, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83480889, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.529000997543335 + }, + { + "auxiliary_loss_clip": 0.01243884, + "auxiliary_loss_mlp": 0.01079109, + "balance_loss_clip": 1.06517744, + "balance_loss_mlp": 1.04568267, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.8875907183063185, + "language_loss": 0.81174493, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83497483, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.6181230545043945 + }, + { + "auxiliary_loss_clip": 0.01252049, + "auxiliary_loss_mlp": 0.01077863, + "balance_loss_clip": 1.06338894, + "balance_loss_mlp": 1.04655886, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.1275245691376274, + "language_loss": 0.85453117, + "learning_rate": 3.999613189525668e-06, + "loss": 0.87783027, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.5540270805358887 + }, + { + "auxiliary_loss_clip": 0.01242875, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_clip": 1.05869532, + "balance_loss_mlp": 1.05222487, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 2.493522822923586, + "language_loss": 0.82120252, + "learning_rate": 3.999605492246508e-06, + "loss": 0.8444761, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.5466861724853516 + }, + { + "auxiliary_loss_clip": 0.01240351, + "auxiliary_loss_mlp": 0.01062712, + "balance_loss_clip": 1.05970979, + "balance_loss_mlp": 1.03124058, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.059408641625567, + "language_loss": 0.75622195, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77925253, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.5393593311309814 + }, + { + "auxiliary_loss_clip": 0.0124214, + "auxiliary_loss_mlp": 0.01063667, + "balance_loss_clip": 1.05990362, + "balance_loss_mlp": 1.03176641, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9608239398776977, + "language_loss": 0.80237114, + "learning_rate": 3.999589870212761e-06, + "loss": 0.8254292, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.583725690841675 + }, + { + "auxiliary_loss_clip": 0.01247497, + "auxiliary_loss_mlp": 0.01066022, + "balance_loss_clip": 1.06543612, + "balance_loss_mlp": 1.03493237, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 2.0898548238861707, + "language_loss": 0.86841178, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89154696, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 2.541794538497925 + }, + { + "auxiliary_loss_clip": 0.01249534, + "auxiliary_loss_mlp": 0.01075559, + "balance_loss_clip": 1.0661459, + "balance_loss_mlp": 1.0421567, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 3.552575037476644, + "language_loss": 0.80470598, + "learning_rate": 3.999573944880424e-06, + "loss": 0.82795691, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 2.516003131866455 + }, + { + "auxiliary_loss_clip": 0.01245047, + "auxiliary_loss_mlp": 0.01071999, + "balance_loss_clip": 1.0624702, + "balance_loss_mlp": 1.04193437, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 3.41027855683373, + "language_loss": 0.86026174, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.88343227, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 2.5278491973876953 + }, + { + "auxiliary_loss_clip": 0.012516, + "auxiliary_loss_mlp": 0.01077299, + "balance_loss_clip": 1.06500316, + "balance_loss_mlp": 1.04492188, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1327624095339806, + "language_loss": 0.82533932, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84862828, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 2.5402767658233643 + }, + { + "auxiliary_loss_clip": 0.01245886, + "auxiliary_loss_mlp": 0.0106839, + "balance_loss_clip": 1.06478441, + "balance_loss_mlp": 1.03799129, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.3781377520688207, + "language_loss": 0.83772498, + "learning_rate": 3.999549488202358e-06, + "loss": 0.86086774, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 2.578538656234741 + }, + { + "auxiliary_loss_clip": 0.01250996, + "auxiliary_loss_mlp": 0.01070424, + "balance_loss_clip": 1.06609952, + "balance_loss_mlp": 1.03649652, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 3.247263858338456, + "language_loss": 0.8206495, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84386367, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.546186923980713 + }, + { + "auxiliary_loss_clip": 0.01255511, + "auxiliary_loss_mlp": 0.01089622, + "balance_loss_clip": 1.07138455, + "balance_loss_mlp": 1.0593667, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.066131521455681, + "language_loss": 0.79672778, + "learning_rate": 3.999532804634215e-06, + "loss": 0.8201791, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.574014186859131 + }, + { + "auxiliary_loss_clip": 0.01254184, + "auxiliary_loss_mlp": 0.01084293, + "balance_loss_clip": 1.06770754, + "balance_loss_mlp": 1.05232143, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.8437379550012167, + "language_loss": 0.87572128, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89910603, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.6161746978759766 + }, + { + "auxiliary_loss_clip": 0.01248784, + "auxiliary_loss_mlp": 0.01097239, + "balance_loss_clip": 1.06804478, + "balance_loss_mlp": 1.06610203, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.1009241420235423, + "language_loss": 0.72942555, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75288582, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 2.579457998275757 + }, + { + "auxiliary_loss_clip": 0.01249686, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_clip": 1.06526732, + "balance_loss_mlp": 1.04642367, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.1283950946795858, + "language_loss": 0.78936636, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81264406, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.5435943603515625 + }, + { + "auxiliary_loss_clip": 0.01244012, + "auxiliary_loss_mlp": 0.01083186, + "balance_loss_clip": 1.06355739, + "balance_loss_mlp": 1.05216742, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.194463493516773, + "language_loss": 0.93565017, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95892215, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.5590720176696777 + }, + { + "auxiliary_loss_clip": 0.0125366, + "auxiliary_loss_mlp": 0.01082899, + "balance_loss_clip": 1.06841373, + "balance_loss_mlp": 1.04861426, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 3.1906762913585545, + "language_loss": 0.73066932, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75403488, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.624629259109497 + }, + { + "auxiliary_loss_clip": 0.01248039, + "auxiliary_loss_mlp": 0.01077896, + "balance_loss_clip": 1.0631032, + "balance_loss_mlp": 1.04690206, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 1.8748691365973955, + "language_loss": 0.81802154, + "learning_rate": 3.999480934200528e-06, + "loss": 0.84128082, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 2.605454206466675 + }, + { + "auxiliary_loss_clip": 0.01247017, + "auxiliary_loss_mlp": 0.01075523, + "balance_loss_clip": 1.06410289, + "balance_loss_mlp": 1.04572058, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 2.1080517724455645, + "language_loss": 0.6840539, + "learning_rate": 3.999472023754499e-06, + "loss": 0.70727921, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 2.6548304557800293 + }, + { + "auxiliary_loss_clip": 0.01253136, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_clip": 1.06794071, + "balance_loss_mlp": 1.03901899, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 4.5442370199383575, + "language_loss": 0.80395949, + "learning_rate": 3.99946303748829e-06, + "loss": 0.82721102, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 2.5531928539276123 + }, + { + "auxiliary_loss_clip": 0.01253837, + "auxiliary_loss_mlp": 0.01082084, + "balance_loss_clip": 1.06536937, + "balance_loss_mlp": 1.04803824, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 3.102833669233831, + "language_loss": 0.91271573, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93607497, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.5164384841918945 + }, + { + "auxiliary_loss_clip": 0.01250205, + "auxiliary_loss_mlp": 0.01086725, + "balance_loss_clip": 1.06862271, + "balance_loss_mlp": 1.05542123, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.2563654522735974, + "language_loss": 0.94080931, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96417856, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.5994818210601807 + }, + { + "auxiliary_loss_clip": 0.01248416, + "auxiliary_loss_mlp": 0.01083397, + "balance_loss_clip": 1.06546283, + "balance_loss_mlp": 1.04999471, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.6221290864790219, + "language_loss": 0.76981652, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79313469, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 2.5906760692596436 + }, + { + "auxiliary_loss_clip": 0.0124779, + "auxiliary_loss_mlp": 0.01065182, + "balance_loss_clip": 1.06850362, + "balance_loss_mlp": 1.03323424, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 3.3939464594973456, + "language_loss": 0.8689357, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89206547, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.5337541103363037 + }, + { + "auxiliary_loss_clip": 0.01246675, + "auxiliary_loss_mlp": 0.0107043, + "balance_loss_clip": 1.06473064, + "balance_loss_mlp": 1.03883922, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.2649755945105947, + "language_loss": 0.90214574, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92531681, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.62564754486084 + }, + { + "auxiliary_loss_clip": 0.01250394, + "auxiliary_loss_mlp": 0.0108703, + "balance_loss_clip": 1.06884241, + "balance_loss_mlp": 1.05510557, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9244859591466679, + "language_loss": 0.84203613, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86541039, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 2.6077942848205566 + }, + { + "auxiliary_loss_clip": 0.01250576, + "auxiliary_loss_mlp": 0.01078968, + "balance_loss_clip": 1.063429, + "balance_loss_mlp": 1.04666185, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 3.3420353017105326, + "language_loss": 0.66955811, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69285357, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 2.6712534427642822 + }, + { + "auxiliary_loss_clip": 0.01242662, + "auxiliary_loss_mlp": 0.01075393, + "balance_loss_clip": 1.06380725, + "balance_loss_mlp": 1.0417279, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.6450461409583883, + "language_loss": 0.77109951, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79428005, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.588193655014038 + }, + { + "auxiliary_loss_clip": 0.01245863, + "auxiliary_loss_mlp": 0.01078222, + "balance_loss_clip": 1.06554818, + "balance_loss_mlp": 1.04658437, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.371817512183301, + "language_loss": 0.81446218, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83770305, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 2.5318586826324463 + }, + { + "auxiliary_loss_clip": 0.01249787, + "auxiliary_loss_mlp": 0.01088218, + "balance_loss_clip": 1.06651008, + "balance_loss_mlp": 1.05593634, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 1.6147819587539733, + "language_loss": 0.88672507, + "learning_rate": 3.999369004792719e-06, + "loss": 0.91010511, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 2.5386691093444824 + }, + { + "auxiliary_loss_clip": 0.01244237, + "auxiliary_loss_mlp": 0.01077125, + "balance_loss_clip": 1.06115413, + "balance_loss_mlp": 1.04570138, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.666096883146676, + "language_loss": 0.7942369, + "learning_rate": 3.999359184527658e-06, + "loss": 0.81745052, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 2.589301586151123 + }, + { + "auxiliary_loss_clip": 0.01246089, + "auxiliary_loss_mlp": 0.01064843, + "balance_loss_clip": 1.06301475, + "balance_loss_mlp": 1.03524351, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.797387091127426, + "language_loss": 0.76647329, + "learning_rate": 3.999349288446696e-06, + "loss": 0.78958267, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 2.5727124214172363 + }, + { + "auxiliary_loss_clip": 0.01250229, + "auxiliary_loss_mlp": 0.01077985, + "balance_loss_clip": 1.0654546, + "balance_loss_mlp": 1.04670429, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 5.604223519107514, + "language_loss": 0.91451514, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93779725, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 3.98907732963562 + }, + { + "auxiliary_loss_clip": 0.01241387, + "auxiliary_loss_mlp": 0.0108662, + "balance_loss_clip": 1.06191349, + "balance_loss_mlp": 1.05200136, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.5741793323909523, + "language_loss": 0.92184705, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94512713, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 4.067040681838989 + }, + { + "auxiliary_loss_clip": 0.01241395, + "auxiliary_loss_mlp": 0.01065277, + "balance_loss_clip": 1.06316018, + "balance_loss_mlp": 1.03442621, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.046040700663083, + "language_loss": 0.8344878, + "learning_rate": 3.999319145312175e-06, + "loss": 0.8575545, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.5264480113983154 + }, + { + "auxiliary_loss_clip": 0.01243844, + "auxiliary_loss_mlp": 0.01072663, + "balance_loss_clip": 1.06215096, + "balance_loss_mlp": 1.04214513, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.7334532924980486, + "language_loss": 0.69960779, + "learning_rate": 3.999308945971392e-06, + "loss": 0.7227729, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.5937561988830566 + }, + { + "auxiliary_loss_clip": 0.01113105, + "auxiliary_loss_mlp": 0.01007864, + "balance_loss_clip": 1.02569699, + "balance_loss_mlp": 1.00037718, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8860573555741024, + "language_loss": 0.61709601, + "learning_rate": 3.999298670816614e-06, + "loss": 0.6383056, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 4.571063280105591 + }, + { + "auxiliary_loss_clip": 0.0124156, + "auxiliary_loss_mlp": 0.01069879, + "balance_loss_clip": 1.06199169, + "balance_loss_mlp": 1.03886104, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.2157547339488937, + "language_loss": 0.83688635, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86000073, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.5621957778930664 + }, + { + "auxiliary_loss_clip": 0.01244536, + "auxiliary_loss_mlp": 0.01086327, + "balance_loss_clip": 1.06390262, + "balance_loss_mlp": 1.05547547, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.7907248869775736, + "language_loss": 0.79582304, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81913167, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.545375108718872 + }, + { + "auxiliary_loss_clip": 0.01241908, + "auxiliary_loss_mlp": 0.01077936, + "balance_loss_clip": 1.0598228, + "balance_loss_mlp": 1.04603601, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 1.8165793602977116, + "language_loss": 0.83763349, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86083198, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.5392141342163086 + }, + { + "auxiliary_loss_clip": 0.01249769, + "auxiliary_loss_mlp": 0.01069149, + "balance_loss_clip": 1.06340384, + "balance_loss_mlp": 1.03693879, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.7365635625205944, + "language_loss": 0.6989345, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72212368, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.5617432594299316 + }, + { + "auxiliary_loss_clip": 0.01244166, + "auxiliary_loss_mlp": 0.01077398, + "balance_loss_clip": 1.0625577, + "balance_loss_mlp": 1.04559278, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.7124197840601125, + "language_loss": 0.85226631, + "learning_rate": 3.999246157846526e-06, + "loss": 0.8754819, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.54752516746521 + }, + { + "auxiliary_loss_clip": 0.01246573, + "auxiliary_loss_mlp": 0.01075064, + "balance_loss_clip": 1.06401765, + "balance_loss_mlp": 1.04274642, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.379160675903296, + "language_loss": 0.82260329, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84581971, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.6073598861694336 + }, + { + "auxiliary_loss_clip": 0.0110865, + "auxiliary_loss_mlp": 0.01011667, + "balance_loss_clip": 1.02066278, + "balance_loss_mlp": 1.00418031, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9018365670770221, + "language_loss": 0.65459156, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67579472, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.1065573692321777 + }, + { + "auxiliary_loss_clip": 0.01240999, + "auxiliary_loss_mlp": 0.01063165, + "balance_loss_clip": 1.06098986, + "balance_loss_mlp": 1.03374362, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 3.9879134959222107, + "language_loss": 0.79837298, + "learning_rate": 3.999213740321906e-06, + "loss": 0.82141465, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.600956678390503 + }, + { + "auxiliary_loss_clip": 0.01237549, + "auxiliary_loss_mlp": 0.01072473, + "balance_loss_clip": 1.05848289, + "balance_loss_mlp": 1.04239678, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.211845627418707, + "language_loss": 0.82709795, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85019815, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.5521390438079834 + }, + { + "auxiliary_loss_clip": 0.01242431, + "auxiliary_loss_mlp": 0.01068007, + "balance_loss_clip": 1.06088519, + "balance_loss_mlp": 1.03601134, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.3315126736840432, + "language_loss": 0.82302129, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84612566, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 2.6098899841308594 + }, + { + "auxiliary_loss_clip": 0.01245362, + "auxiliary_loss_mlp": 0.01068574, + "balance_loss_clip": 1.0624541, + "balance_loss_mlp": 1.03812754, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.1130416087393216, + "language_loss": 0.81789619, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.8410356, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 2.6198298931121826 + }, + { + "auxiliary_loss_clip": 0.01243204, + "auxiliary_loss_mlp": 0.0107958, + "balance_loss_clip": 1.06588292, + "balance_loss_mlp": 1.04887211, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 2.569376379295661, + "language_loss": 0.82004654, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84327441, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.535134792327881 + }, + { + "auxiliary_loss_clip": 0.01241305, + "auxiliary_loss_mlp": 0.01065831, + "balance_loss_clip": 1.06269121, + "balance_loss_mlp": 1.03595722, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 2.137000542043468, + "language_loss": 0.84165585, + "learning_rate": 3.999158194912106e-06, + "loss": 0.8647272, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.587214469909668 + }, + { + "auxiliary_loss_clip": 0.01240428, + "auxiliary_loss_mlp": 0.01072032, + "balance_loss_clip": 1.06228518, + "balance_loss_mlp": 1.04196715, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 2.1462150350472906, + "language_loss": 0.84663033, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86975491, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.53275465965271 + }, + { + "auxiliary_loss_clip": 0.01242385, + "auxiliary_loss_mlp": 0.01069495, + "balance_loss_clip": 1.06149745, + "balance_loss_mlp": 1.03788066, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.90429708347741, + "language_loss": 0.79848319, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82160199, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.4868626594543457 + }, + { + "auxiliary_loss_clip": 0.01234383, + "auxiliary_loss_mlp": 0.01069779, + "balance_loss_clip": 1.0575906, + "balance_loss_mlp": 1.03885651, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.1281513215339887, + "language_loss": 0.78650057, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80954218, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 2.5119447708129883 + }, + { + "auxiliary_loss_clip": 0.01237788, + "auxiliary_loss_mlp": 0.01078195, + "balance_loss_clip": 1.05940533, + "balance_loss_mlp": 1.04479313, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 3.3240951050326015, + "language_loss": 0.87405199, + "learning_rate": 3.999112394032757e-06, + "loss": 0.89721179, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 2.50270676612854 + }, + { + "auxiliary_loss_clip": 0.01232769, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.05891991, + "balance_loss_mlp": 1.04361665, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 3.797793593313396, + "language_loss": 0.7930243, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81608379, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.612523317337036 + }, + { + "auxiliary_loss_clip": 0.01244985, + "auxiliary_loss_mlp": 0.01070586, + "balance_loss_clip": 1.06143117, + "balance_loss_mlp": 1.0386622, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 5.733889703634457, + "language_loss": 0.86167157, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88482726, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 2.581401824951172 + }, + { + "auxiliary_loss_clip": 0.010996, + "auxiliary_loss_mlp": 0.01006292, + "balance_loss_clip": 1.01519132, + "balance_loss_mlp": 0.99932998, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7823963572639386, + "language_loss": 0.49905008, + "learning_rate": 3.999077247403041e-06, + "loss": 0.520109, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.151071071624756 + }, + { + "auxiliary_loss_clip": 0.01233663, + "auxiliary_loss_mlp": 0.01070775, + "balance_loss_clip": 1.06122541, + "balance_loss_mlp": 1.041592, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 1.98399683739614, + "language_loss": 0.81049323, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83353758, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 2.5791242122650146 + }, + { + "auxiliary_loss_clip": 0.01246791, + "auxiliary_loss_mlp": 0.01091322, + "balance_loss_clip": 1.06410837, + "balance_loss_mlp": 1.05505848, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 3.4113376701690075, + "language_loss": 0.76156223, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78494334, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 2.525067090988159 + }, + { + "auxiliary_loss_clip": 0.01243088, + "auxiliary_loss_mlp": 0.01077465, + "balance_loss_clip": 1.06237221, + "balance_loss_mlp": 1.04594588, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.021206690450567, + "language_loss": 0.81823361, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84143913, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 2.603595018386841 + }, + { + "auxiliary_loss_clip": 0.01234991, + "auxiliary_loss_mlp": 0.01079465, + "balance_loss_clip": 1.05905414, + "balance_loss_mlp": 1.04608667, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 4.00795421344032, + "language_loss": 0.91358006, + "learning_rate": 3.999029323959287e-06, + "loss": 0.9367246, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 2.556196689605713 + }, + { + "auxiliary_loss_clip": 0.01241427, + "auxiliary_loss_mlp": 0.01072246, + "balance_loss_clip": 1.06102538, + "balance_loss_mlp": 1.04149055, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 5.163656024482486, + "language_loss": 0.79477191, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81790864, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.4912030696868896 + }, + { + "auxiliary_loss_clip": 0.01239913, + "auxiliary_loss_mlp": 0.0107666, + "balance_loss_clip": 1.06396556, + "balance_loss_mlp": 1.04432988, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 2.0023151878639656, + "language_loss": 0.81792367, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84108937, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 2.5884342193603516 + }, + { + "auxiliary_loss_clip": 0.0109826, + "auxiliary_loss_mlp": 0.01012414, + "balance_loss_clip": 1.01463699, + "balance_loss_mlp": 1.00535679, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9091027636969186, + "language_loss": 0.69377434, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71488106, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 3.209947347640991 + }, + { + "auxiliary_loss_clip": 0.01242803, + "auxiliary_loss_mlp": 0.01077142, + "balance_loss_clip": 1.06477809, + "balance_loss_mlp": 1.0452894, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.6890785969426685, + "language_loss": 0.82886457, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85206401, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 2.5375802516937256 + }, + { + "auxiliary_loss_clip": 0.01248786, + "auxiliary_loss_mlp": 0.01066753, + "balance_loss_clip": 1.06678116, + "balance_loss_mlp": 1.03451848, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.129597799078573, + "language_loss": 0.87300158, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89615697, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.533820867538452 + }, + { + "auxiliary_loss_clip": 0.01234644, + "auxiliary_loss_mlp": 0.01069984, + "balance_loss_clip": 1.06109071, + "balance_loss_mlp": 1.03791642, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.013253227584385, + "language_loss": 0.85026228, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87330854, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 2.5147643089294434 + }, + { + "auxiliary_loss_clip": 0.01247746, + "auxiliary_loss_mlp": 0.01084097, + "balance_loss_clip": 1.06387091, + "balance_loss_mlp": 1.0506227, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.029158515334019, + "language_loss": 0.82030535, + "learning_rate": 3.998942539520158e-06, + "loss": 0.84362376, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 2.5996453762054443 + }, + { + "auxiliary_loss_clip": 0.01237231, + "auxiliary_loss_mlp": 0.01078089, + "balance_loss_clip": 1.06159282, + "balance_loss_mlp": 1.04492486, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.430463008208619, + "language_loss": 0.8684032, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89155638, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.567469358444214 + }, + { + "auxiliary_loss_clip": 0.01237394, + "auxiliary_loss_mlp": 0.01070164, + "balance_loss_clip": 1.06440401, + "balance_loss_mlp": 1.03890777, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.331812976165476, + "language_loss": 0.80588436, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82896, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 2.4862077236175537 + }, + { + "auxiliary_loss_clip": 0.0109431, + "auxiliary_loss_mlp": 0.01007409, + "balance_loss_clip": 1.0130477, + "balance_loss_mlp": 1.00132966, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7857392763556799, + "language_loss": 0.60031593, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62133312, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 3.2188706398010254 + }, + { + "auxiliary_loss_clip": 0.01238234, + "auxiliary_loss_mlp": 0.01072914, + "balance_loss_clip": 1.06022012, + "balance_loss_mlp": 1.04223001, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 3.025342512028596, + "language_loss": 0.86204094, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.8851524, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 2.5997252464294434 + }, + { + "auxiliary_loss_clip": 0.01236293, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_clip": 1.06219912, + "balance_loss_mlp": 1.04295397, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.8379645514695049, + "language_loss": 0.7543236, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77742058, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.5194127559661865 + }, + { + "auxiliary_loss_clip": 0.01244547, + "auxiliary_loss_mlp": 0.01078488, + "balance_loss_clip": 1.0662334, + "balance_loss_mlp": 1.04637277, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 1.899426155887782, + "language_loss": 0.92342681, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94665712, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 3.9678404331207275 + }, + { + "auxiliary_loss_clip": 0.01237444, + "auxiliary_loss_mlp": 0.01074222, + "balance_loss_clip": 1.06209397, + "balance_loss_mlp": 1.04115391, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.9939685274633434, + "language_loss": 0.90168393, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92480057, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 3.9797778129577637 + }, + { + "auxiliary_loss_clip": 0.01235557, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_clip": 1.06089115, + "balance_loss_mlp": 1.05125666, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 3.415844205044553, + "language_loss": 0.75092465, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77411705, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 4.169960975646973 + }, + { + "auxiliary_loss_clip": 0.01245979, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_clip": 1.06444621, + "balance_loss_mlp": 1.03717256, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.564192589652927, + "language_loss": 0.7816211, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80477494, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.534050226211548 + }, + { + "auxiliary_loss_clip": 0.01238836, + "auxiliary_loss_mlp": 0.01083201, + "balance_loss_clip": 1.06217313, + "balance_loss_mlp": 1.04956031, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.859630659468838, + "language_loss": 0.76709008, + "learning_rate": 3.998812118783757e-06, + "loss": 0.7903105, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 4.016777992248535 + }, + { + "auxiliary_loss_clip": 0.01243806, + "auxiliary_loss_mlp": 0.01080364, + "balance_loss_clip": 1.06467319, + "balance_loss_mlp": 1.04824924, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.571790741946581, + "language_loss": 0.85551107, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87875283, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.551741123199463 + }, + { + "auxiliary_loss_clip": 0.01235133, + "auxiliary_loss_mlp": 0.0107153, + "balance_loss_clip": 1.06124687, + "balance_loss_mlp": 1.03909314, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 2.3161563936555334, + "language_loss": 0.76543331, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78849995, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.5658352375030518 + }, + { + "auxiliary_loss_clip": 0.01237445, + "auxiliary_loss_mlp": 0.01070772, + "balance_loss_clip": 1.06129718, + "balance_loss_mlp": 1.04015934, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.7934768285127267, + "language_loss": 0.82272804, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84581023, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.6351006031036377 + }, + { + "auxiliary_loss_clip": 0.01243308, + "auxiliary_loss_mlp": 0.01060328, + "balance_loss_clip": 1.06795669, + "balance_loss_mlp": 1.02919054, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.69061730767787, + "language_loss": 0.76238763, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78542405, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.767930746078491 + }, + { + "auxiliary_loss_clip": 0.01240391, + "auxiliary_loss_mlp": 0.01069828, + "balance_loss_clip": 1.06082845, + "balance_loss_mlp": 1.03575826, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.8302997000768677, + "language_loss": 0.83451509, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85761726, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.57321834564209 + }, + { + "auxiliary_loss_clip": 0.01237216, + "auxiliary_loss_mlp": 0.01066739, + "balance_loss_clip": 1.05883729, + "balance_loss_mlp": 1.03393292, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 7.575072562553889, + "language_loss": 0.71609837, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73913789, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.5446372032165527 + }, + { + "auxiliary_loss_clip": 0.01237072, + "auxiliary_loss_mlp": 0.01075836, + "balance_loss_clip": 1.06019545, + "balance_loss_mlp": 1.04553342, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7478324469300461, + "language_loss": 0.72420824, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74733734, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 2.5075128078460693 + }, + { + "auxiliary_loss_clip": 0.01235276, + "auxiliary_loss_mlp": 0.01079348, + "balance_loss_clip": 1.065552, + "balance_loss_mlp": 1.04801965, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 3.1953219779782263, + "language_loss": 0.81358826, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83673447, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.5322728157043457 + }, + { + "auxiliary_loss_clip": 0.01238337, + "auxiliary_loss_mlp": 0.01073654, + "balance_loss_clip": 1.06428313, + "balance_loss_mlp": 1.04048991, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.564016925388342, + "language_loss": 0.90783489, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.93095481, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.620356321334839 + }, + { + "auxiliary_loss_clip": 0.01238676, + "auxiliary_loss_mlp": 0.01075603, + "balance_loss_clip": 1.06253421, + "balance_loss_mlp": 1.04391682, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.2959463862226817, + "language_loss": 0.87865114, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90179396, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 2.5428245067596436 + }, + { + "auxiliary_loss_clip": 0.01244259, + "auxiliary_loss_mlp": 0.01076201, + "balance_loss_clip": 1.06330895, + "balance_loss_mlp": 1.04468167, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.8407049437790362, + "language_loss": 0.7151016, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73830622, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 2.518721342086792 + }, + { + "auxiliary_loss_clip": 0.01237481, + "auxiliary_loss_mlp": 0.01076661, + "balance_loss_clip": 1.06735706, + "balance_loss_mlp": 1.04712069, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 1.5802202879997045, + "language_loss": 0.86078084, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88392228, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.525193691253662 + }, + { + "auxiliary_loss_clip": 0.01237142, + "auxiliary_loss_mlp": 0.01083825, + "balance_loss_clip": 1.06507111, + "balance_loss_mlp": 1.05280697, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.9358246085623687, + "language_loss": 0.83168423, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85489392, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.634655475616455 + }, + { + "auxiliary_loss_clip": 0.01233552, + "auxiliary_loss_mlp": 0.01083572, + "balance_loss_clip": 1.06067586, + "balance_loss_mlp": 1.0526011, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 5.14343794774404, + "language_loss": 0.68329322, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70646447, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 2.6255524158477783 + }, + { + "auxiliary_loss_clip": 0.01230653, + "auxiliary_loss_mlp": 0.01077957, + "balance_loss_clip": 1.05881953, + "balance_loss_mlp": 1.04724836, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.7038898879834317, + "language_loss": 0.7519238, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77500987, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.505007028579712 + }, + { + "auxiliary_loss_clip": 0.01234141, + "auxiliary_loss_mlp": 0.01078897, + "balance_loss_clip": 1.06254065, + "balance_loss_mlp": 1.0473063, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.2309875262348156, + "language_loss": 0.84421772, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86734807, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.5776920318603516 + }, + { + "auxiliary_loss_clip": 0.0123779, + "auxiliary_loss_mlp": 0.01079237, + "balance_loss_clip": 1.06035602, + "balance_loss_mlp": 1.04612017, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.3427293221358885, + "language_loss": 0.89054161, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91371191, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.5143909454345703 + }, + { + "auxiliary_loss_clip": 0.01239904, + "auxiliary_loss_mlp": 0.01070409, + "balance_loss_clip": 1.06487513, + "balance_loss_mlp": 1.04036868, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.506301179713492, + "language_loss": 0.8196125, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84271562, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.5877344608306885 + }, + { + "auxiliary_loss_clip": 0.01239757, + "auxiliary_loss_mlp": 0.01074484, + "balance_loss_clip": 1.06126106, + "balance_loss_mlp": 1.04265523, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.284366010284094, + "language_loss": 0.83667254, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.859815, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.5634212493896484 + }, + { + "auxiliary_loss_clip": 0.01236013, + "auxiliary_loss_mlp": 0.01080113, + "balance_loss_clip": 1.06158113, + "balance_loss_mlp": 1.04749775, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.04299765664182, + "language_loss": 0.84449244, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86765373, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 2.4861114025115967 + }, + { + "auxiliary_loss_clip": 0.01234098, + "auxiliary_loss_mlp": 0.01079493, + "balance_loss_clip": 1.06088567, + "balance_loss_mlp": 1.0475688, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.5944454156037473, + "language_loss": 0.92798018, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95111614, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 2.549992561340332 + }, + { + "auxiliary_loss_clip": 0.0122933, + "auxiliary_loss_mlp": 0.01074968, + "balance_loss_clip": 1.05913198, + "balance_loss_mlp": 1.04535627, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.5113936655097993, + "language_loss": 0.83916163, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86220461, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 2.5295350551605225 + }, + { + "auxiliary_loss_clip": 0.01234971, + "auxiliary_loss_mlp": 0.01091126, + "balance_loss_clip": 1.06091011, + "balance_loss_mlp": 1.05896354, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 2.517662049704662, + "language_loss": 0.91474771, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93800867, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 2.51291823387146 + }, + { + "auxiliary_loss_clip": 0.01095979, + "auxiliary_loss_mlp": 0.01006124, + "balance_loss_clip": 1.0159626, + "balance_loss_mlp": 1.0004735, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.904150010395762, + "language_loss": 0.67917681, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70019782, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 3.1529126167297363 + }, + { + "auxiliary_loss_clip": 0.01240997, + "auxiliary_loss_mlp": 0.01079986, + "balance_loss_clip": 1.06410122, + "balance_loss_mlp": 1.04722714, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.5052820211824645, + "language_loss": 0.88982344, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91303325, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 2.4972481727600098 + }, + { + "auxiliary_loss_clip": 0.01236312, + "auxiliary_loss_mlp": 0.0107611, + "balance_loss_clip": 1.06571496, + "balance_loss_mlp": 1.04475832, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.462940492566292, + "language_loss": 0.67570913, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69883335, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 2.556713819503784 + }, + { + "auxiliary_loss_clip": 0.0109323, + "auxiliary_loss_mlp": 0.01007969, + "balance_loss_clip": 1.01350069, + "balance_loss_mlp": 1.00219905, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8597644878901813, + "language_loss": 0.60741138, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62842339, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.163820266723633 + }, + { + "auxiliary_loss_clip": 0.01092712, + "auxiliary_loss_mlp": 0.01008384, + "balance_loss_clip": 1.01375151, + "balance_loss_mlp": 1.00280499, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.063653070328352, + "language_loss": 0.57728708, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59829807, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 3.0001847743988037 + }, + { + "auxiliary_loss_clip": 0.01237968, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_clip": 1.06318545, + "balance_loss_mlp": 1.04191422, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.33169480575485, + "language_loss": 0.87852854, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90164465, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 2.6255526542663574 + }, + { + "auxiliary_loss_clip": 0.01223363, + "auxiliary_loss_mlp": 0.01068778, + "balance_loss_clip": 1.05786085, + "balance_loss_mlp": 1.04052508, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.8585139033875928, + "language_loss": 0.7159822, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73890364, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 2.5531439781188965 + }, + { + "auxiliary_loss_clip": 0.01229097, + "auxiliary_loss_mlp": 0.01070102, + "balance_loss_clip": 1.06035423, + "balance_loss_mlp": 1.03872573, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.7988721727596975, + "language_loss": 0.93754935, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.96054131, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.560542583465576 + }, + { + "auxiliary_loss_clip": 0.01230434, + "auxiliary_loss_mlp": 0.01068164, + "balance_loss_clip": 1.0564028, + "balance_loss_mlp": 1.03604925, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 1.9630947572755195, + "language_loss": 0.81388354, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83686948, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.6363747119903564 + }, + { + "auxiliary_loss_clip": 0.01237486, + "auxiliary_loss_mlp": 0.01081476, + "balance_loss_clip": 1.06094027, + "balance_loss_mlp": 1.04838359, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.205015408039801, + "language_loss": 0.8229903, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84617996, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.5448784828186035 + }, + { + "auxiliary_loss_clip": 0.01240928, + "auxiliary_loss_mlp": 0.01068211, + "balance_loss_clip": 1.06435537, + "balance_loss_mlp": 1.03719282, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 3.403253443613219, + "language_loss": 0.85844386, + "learning_rate": 3.99831199671276e-06, + "loss": 0.88153529, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.545515298843384 + }, + { + "auxiliary_loss_clip": 0.01239152, + "auxiliary_loss_mlp": 0.0107577, + "balance_loss_clip": 1.06622434, + "balance_loss_mlp": 1.04608727, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 2.5926766835652213, + "language_loss": 0.84672976, + "learning_rate": 3.998295961044662e-06, + "loss": 0.86987901, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.5359270572662354 + }, + { + "auxiliary_loss_clip": 0.01231611, + "auxiliary_loss_mlp": 0.01071884, + "balance_loss_clip": 1.0596242, + "balance_loss_mlp": 1.04029334, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 1.6020774494127241, + "language_loss": 0.85245031, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.8754853, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 2.5465784072875977 + }, + { + "auxiliary_loss_clip": 0.01236252, + "auxiliary_loss_mlp": 0.0107273, + "balance_loss_clip": 1.05857146, + "balance_loss_mlp": 1.04342794, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 6.880895734300662, + "language_loss": 0.9086982, + "learning_rate": 3.998263662382328e-06, + "loss": 0.93178803, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 4.22198224067688 + }, + { + "auxiliary_loss_clip": 0.01086566, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.00951588, + "balance_loss_mlp": 1.02452815, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8752082440538951, + "language_loss": 0.63734078, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65850437, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 4.812143087387085 + }, + { + "auxiliary_loss_clip": 0.01234044, + "auxiliary_loss_mlp": 0.01087657, + "balance_loss_clip": 1.06463814, + "balance_loss_mlp": 1.05690134, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 1.861741372459436, + "language_loss": 0.74902123, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77223825, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.7123281955718994 + }, + { + "auxiliary_loss_clip": 0.01235052, + "auxiliary_loss_mlp": 0.01079001, + "balance_loss_clip": 1.06396031, + "balance_loss_mlp": 1.04667175, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 1.8167413748558117, + "language_loss": 0.72687376, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75001431, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.6108264923095703 + }, + { + "auxiliary_loss_clip": 0.01085137, + "auxiliary_loss_mlp": 0.01012883, + "balance_loss_clip": 1.00892961, + "balance_loss_mlp": 1.00780427, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.8979588175780957, + "language_loss": 0.65558219, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67656243, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 4.5331292152404785 + }, + { + "auxiliary_loss_clip": 0.010853, + "auxiliary_loss_mlp": 0.01006332, + "balance_loss_clip": 1.00853121, + "balance_loss_mlp": 1.00108719, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.982493539647793, + "language_loss": 0.58795023, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60886657, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 2.913209915161133 + }, + { + "auxiliary_loss_clip": 0.0123175, + "auxiliary_loss_mlp": 0.01073733, + "balance_loss_clip": 1.06305623, + "balance_loss_mlp": 1.04128468, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8317177813240224, + "language_loss": 0.91525507, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93830991, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.5208427906036377 + }, + { + "auxiliary_loss_clip": 0.01229071, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_clip": 1.05827379, + "balance_loss_mlp": 1.03847194, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.9500145624126928, + "language_loss": 0.66609669, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68906921, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.6299710273742676 + }, + { + "auxiliary_loss_clip": 0.01226996, + "auxiliary_loss_mlp": 0.01070236, + "balance_loss_clip": 1.05942988, + "balance_loss_mlp": 1.04039812, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.9760164099552735, + "language_loss": 0.7786628, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80163515, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.514277458190918 + }, + { + "auxiliary_loss_clip": 0.01233776, + "auxiliary_loss_mlp": 0.01083313, + "balance_loss_clip": 1.0635283, + "balance_loss_mlp": 1.05368948, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 2.676751822014668, + "language_loss": 0.88136226, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90453315, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.512913465499878 + }, + { + "auxiliary_loss_clip": 0.01238351, + "auxiliary_loss_mlp": 0.01072051, + "balance_loss_clip": 1.06433785, + "balance_loss_mlp": 1.04305983, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 2.250072664040333, + "language_loss": 0.84504426, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86814821, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 2.6089394092559814 + }, + { + "auxiliary_loss_clip": 0.0123924, + "auxiliary_loss_mlp": 0.01076277, + "balance_loss_clip": 1.06591225, + "balance_loss_mlp": 1.04625988, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.7537352279672418, + "language_loss": 0.83090472, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85405988, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.5661637783050537 + }, + { + "auxiliary_loss_clip": 0.01231466, + "auxiliary_loss_mlp": 0.0108826, + "balance_loss_clip": 1.06195188, + "balance_loss_mlp": 1.05781424, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.9070230325381279, + "language_loss": 0.79553962, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81873679, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 2.51831316947937 + }, + { + "auxiliary_loss_clip": 0.01244349, + "auxiliary_loss_mlp": 0.01074423, + "balance_loss_clip": 1.0649904, + "balance_loss_mlp": 1.04283237, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 1.9494742348346212, + "language_loss": 0.86929357, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89248133, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 2.573237180709839 + }, + { + "auxiliary_loss_clip": 0.01087879, + "auxiliary_loss_mlp": 0.01041551, + "balance_loss_clip": 1.01113629, + "balance_loss_mlp": 1.03661537, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9254956958348481, + "language_loss": 0.5588336, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58012789, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.2095181941986084 + }, + { + "auxiliary_loss_clip": 0.01235812, + "auxiliary_loss_mlp": 0.01077639, + "balance_loss_clip": 1.06384563, + "balance_loss_mlp": 1.04700232, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.2911644630819237, + "language_loss": 0.82242137, + "learning_rate": 3.998011761530112e-06, + "loss": 0.8455559, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.526912212371826 + }, + { + "auxiliary_loss_clip": 0.01228893, + "auxiliary_loss_mlp": 0.01070517, + "balance_loss_clip": 1.06279838, + "balance_loss_mlp": 1.04209805, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2926199979584627, + "language_loss": 0.7754575, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79845166, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 2.562457799911499 + }, + { + "auxiliary_loss_clip": 0.01235351, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.061198, + "balance_loss_mlp": 1.04550433, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 3.099516126694705, + "language_loss": 0.95314205, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97625053, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.6545963287353516 + }, + { + "auxiliary_loss_clip": 0.01229565, + "auxiliary_loss_mlp": 0.01070545, + "balance_loss_clip": 1.05723763, + "balance_loss_mlp": 1.03874028, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.238064377299334, + "language_loss": 0.88302815, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90602922, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.5103671550750732 + }, + { + "auxiliary_loss_clip": 0.01233208, + "auxiliary_loss_mlp": 0.01070897, + "balance_loss_clip": 1.06170559, + "balance_loss_mlp": 1.04294205, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.0707951581785187, + "language_loss": 0.88430405, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90734506, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.484365224838257 + }, + { + "auxiliary_loss_clip": 0.01233108, + "auxiliary_loss_mlp": 0.01083941, + "balance_loss_clip": 1.06136477, + "balance_loss_mlp": 1.05344689, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.13592723516504, + "language_loss": 0.85624826, + "learning_rate": 3.997924006231419e-06, + "loss": 0.87941879, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.5576658248901367 + }, + { + "auxiliary_loss_clip": 0.01238141, + "auxiliary_loss_mlp": 0.01088395, + "balance_loss_clip": 1.06267726, + "balance_loss_mlp": 1.0555644, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.4273311529973496, + "language_loss": 0.91231906, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93558443, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.49458909034729 + }, + { + "auxiliary_loss_clip": 0.01230749, + "auxiliary_loss_mlp": 0.01070929, + "balance_loss_clip": 1.06307018, + "balance_loss_mlp": 1.0423305, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.8826473904835073, + "language_loss": 0.78349179, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80650854, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.664990186691284 + }, + { + "auxiliary_loss_clip": 0.01225518, + "auxiliary_loss_mlp": 0.01076708, + "balance_loss_clip": 1.0577445, + "balance_loss_mlp": 1.04782367, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 1.97179700606982, + "language_loss": 0.88277811, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90580034, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 2.6206815242767334 + }, + { + "auxiliary_loss_clip": 0.01226914, + "auxiliary_loss_mlp": 0.01079274, + "balance_loss_clip": 1.06284952, + "balance_loss_mlp": 1.05054498, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.9419224571077243, + "language_loss": 0.84443617, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86749804, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 2.7486813068389893 + }, + { + "auxiliary_loss_clip": 0.01234065, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.06291342, + "balance_loss_mlp": 1.03914416, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.0709756087287468, + "language_loss": 0.8498584, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87291646, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 2.6964035034179688 + }, + { + "auxiliary_loss_clip": 0.01092481, + "auxiliary_loss_mlp": 0.01066245, + "balance_loss_clip": 1.01554084, + "balance_loss_mlp": 1.06193006, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8864710724635062, + "language_loss": 0.5910176, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61260486, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 3.105881929397583 + }, + { + "auxiliary_loss_clip": 0.01231686, + "auxiliary_loss_mlp": 0.01069529, + "balance_loss_clip": 1.06358933, + "balance_loss_mlp": 1.04027557, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.1042017308931906, + "language_loss": 0.91355926, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93657148, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 2.6217539310455322 + }, + { + "auxiliary_loss_clip": 0.01235665, + "auxiliary_loss_mlp": 0.01068659, + "balance_loss_clip": 1.06598186, + "balance_loss_mlp": 1.03950024, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.0508613455682, + "language_loss": 0.72125334, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74429655, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.642972946166992 + }, + { + "auxiliary_loss_clip": 0.01225904, + "auxiliary_loss_mlp": 0.01070615, + "balance_loss_clip": 1.05952644, + "balance_loss_mlp": 1.04178989, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.7005812931465065, + "language_loss": 0.88840491, + "learning_rate": 3.997761273778037e-06, + "loss": 0.91137004, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.01227148, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_clip": 1.06052864, + "balance_loss_mlp": 1.03271878, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 7.716696155855104, + "language_loss": 0.8430692, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86597615, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 2.5818235874176025 + }, + { + "auxiliary_loss_clip": 0.01234923, + "auxiliary_loss_mlp": 0.01073008, + "balance_loss_clip": 1.06285048, + "balance_loss_mlp": 1.04382598, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 2.118693951963605, + "language_loss": 0.80267739, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82575673, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 2.6087806224823 + }, + { + "auxiliary_loss_clip": 0.01227674, + "auxiliary_loss_mlp": 0.01068774, + "balance_loss_clip": 1.06107163, + "balance_loss_mlp": 1.04005623, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.3641297190474795, + "language_loss": 0.85741913, + "learning_rate": 3.99770566600649e-06, + "loss": 0.88038361, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.542243480682373 + }, + { + "auxiliary_loss_clip": 0.01227484, + "auxiliary_loss_mlp": 0.01067399, + "balance_loss_clip": 1.06047237, + "balance_loss_mlp": 1.03726304, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 1.6951275741742653, + "language_loss": 0.6915164, + "learning_rate": 3.997686978575302e-06, + "loss": 0.7144652, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.7041563987731934 + }, + { + "auxiliary_loss_clip": 0.01237315, + "auxiliary_loss_mlp": 0.01082784, + "balance_loss_clip": 1.06715393, + "balance_loss_mlp": 1.05169439, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.9832670543220625, + "language_loss": 0.68794954, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71115053, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 2.58034348487854 + }, + { + "auxiliary_loss_clip": 0.01236049, + "auxiliary_loss_mlp": 0.01082674, + "balance_loss_clip": 1.06441903, + "balance_loss_mlp": 1.05117929, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 2.4780255254991275, + "language_loss": 0.6682387, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69142592, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 2.5115535259246826 + }, + { + "auxiliary_loss_clip": 0.01237155, + "auxiliary_loss_mlp": 0.01082051, + "balance_loss_clip": 1.06744671, + "balance_loss_mlp": 1.05139077, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.7523747123525504, + "language_loss": 0.76889777, + "learning_rate": 3.997630461769647e-06, + "loss": 0.79208982, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 2.4991557598114014 + }, + { + "auxiliary_loss_clip": 0.0123616, + "auxiliary_loss_mlp": 0.01078881, + "balance_loss_clip": 1.06621003, + "balance_loss_mlp": 1.04907894, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.5529755073543465, + "language_loss": 0.89209974, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91525012, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.52601957321167 + }, + { + "auxiliary_loss_clip": 0.01232741, + "auxiliary_loss_mlp": 0.01080753, + "balance_loss_clip": 1.06092739, + "balance_loss_mlp": 1.04720795, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 3.4979520527594263, + "language_loss": 0.74621713, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.76935208, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.5877909660339355 + }, + { + "auxiliary_loss_clip": 0.0122968, + "auxiliary_loss_mlp": 0.01081526, + "balance_loss_clip": 1.06131792, + "balance_loss_mlp": 1.05177164, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.185526656358645, + "language_loss": 0.69072843, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71384048, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 2.581725835800171 + }, + { + "auxiliary_loss_clip": 0.01231095, + "auxiliary_loss_mlp": 0.01058962, + "balance_loss_clip": 1.06124568, + "balance_loss_mlp": 1.02951741, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.4475416466304774, + "language_loss": 0.92238677, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94528735, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.524151563644409 + }, + { + "auxiliary_loss_clip": 0.01232692, + "auxiliary_loss_mlp": 0.01077494, + "balance_loss_clip": 1.06313968, + "balance_loss_mlp": 1.04759669, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.076192413425849, + "language_loss": 0.90972781, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93282968, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 4.118892192840576 + }, + { + "auxiliary_loss_clip": 0.01224394, + "auxiliary_loss_mlp": 0.0107736, + "balance_loss_clip": 1.0613935, + "balance_loss_mlp": 1.04658055, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.0215763014584325, + "language_loss": 0.78365481, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80667233, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.5590567588806152 + }, + { + "auxiliary_loss_clip": 0.01236704, + "auxiliary_loss_mlp": 0.01081884, + "balance_loss_clip": 1.06552386, + "balance_loss_mlp": 1.05184364, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 2.413702070115812, + "language_loss": 0.78595936, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80914527, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 4.065551042556763 + }, + { + "auxiliary_loss_clip": 0.01101636, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.02202153, + "balance_loss_mlp": 1.02922201, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8198042241607055, + "language_loss": 0.62659961, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64795303, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.1907107830047607 + }, + { + "auxiliary_loss_clip": 0.01229488, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_clip": 1.06226897, + "balance_loss_mlp": 1.04210758, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5320093311308578, + "language_loss": 0.84422326, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86723197, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 4.093153715133667 + }, + { + "auxiliary_loss_clip": 0.01223714, + "auxiliary_loss_mlp": 0.01063918, + "balance_loss_clip": 1.05749738, + "balance_loss_mlp": 1.03608274, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.467327938180554, + "language_loss": 0.88283443, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90571082, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.55934739112854 + }, + { + "auxiliary_loss_clip": 0.01234078, + "auxiliary_loss_mlp": 0.010797, + "balance_loss_clip": 1.06401622, + "balance_loss_mlp": 1.04901552, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.061845557759306, + "language_loss": 0.73880124, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.76193905, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.584949493408203 + }, + { + "auxiliary_loss_clip": 0.01232958, + "auxiliary_loss_mlp": 0.01076601, + "balance_loss_clip": 1.06544709, + "balance_loss_mlp": 1.04820514, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.182577883418223, + "language_loss": 0.82444894, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84754455, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.626950740814209 + }, + { + "auxiliary_loss_clip": 0.01223876, + "auxiliary_loss_mlp": 0.01076697, + "balance_loss_clip": 1.05937028, + "balance_loss_mlp": 1.04749036, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.826286755686155, + "language_loss": 0.79873532, + "learning_rate": 3.997377677828266e-06, + "loss": 0.8217411, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.6236724853515625 + }, + { + "auxiliary_loss_clip": 0.01087999, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.01251841, + "balance_loss_mlp": 1.02983022, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.0086449392219679, + "language_loss": 0.58754444, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60876632, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 3.1054513454437256 + }, + { + "auxiliary_loss_clip": 0.01230028, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_clip": 1.06097591, + "balance_loss_mlp": 1.05493116, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.4755330572045646, + "language_loss": 0.8823002, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.90544426, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 2.5965263843536377 + }, + { + "auxiliary_loss_clip": 0.0123248, + "auxiliary_loss_mlp": 0.01083178, + "balance_loss_clip": 1.06508207, + "balance_loss_mlp": 1.05359018, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 2.6467333594767144, + "language_loss": 0.85721755, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88037407, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.6673786640167236 + }, + { + "auxiliary_loss_clip": 0.01234462, + "auxiliary_loss_mlp": 0.01074838, + "balance_loss_clip": 1.06334543, + "balance_loss_mlp": 1.04321218, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.079844328570001, + "language_loss": 0.88003349, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90312648, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 2.569160223007202 + }, + { + "auxiliary_loss_clip": 0.01227772, + "auxiliary_loss_mlp": 0.01077909, + "balance_loss_clip": 1.06061924, + "balance_loss_mlp": 1.04875088, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.9530874469396364, + "language_loss": 0.84409571, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86715257, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.6481435298919678 + }, + { + "auxiliary_loss_clip": 0.01232282, + "auxiliary_loss_mlp": 0.01063438, + "balance_loss_clip": 1.06572223, + "balance_loss_mlp": 1.03345656, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.037003271570151, + "language_loss": 0.87384123, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89679849, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.6390624046325684 + }, + { + "auxiliary_loss_clip": 0.01226835, + "auxiliary_loss_mlp": 0.01078227, + "balance_loss_clip": 1.05718923, + "balance_loss_mlp": 1.04956961, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.321392508167211, + "language_loss": 0.75656259, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.7796132, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 2.5960898399353027 + }, + { + "auxiliary_loss_clip": 0.01222551, + "auxiliary_loss_mlp": 0.01073055, + "balance_loss_clip": 1.05894399, + "balance_loss_mlp": 1.04628038, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 2.5390472648154576, + "language_loss": 0.86376971, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88672578, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.5770955085754395 + }, + { + "auxiliary_loss_clip": 0.01231889, + "auxiliary_loss_mlp": 0.01075, + "balance_loss_clip": 1.05944192, + "balance_loss_mlp": 1.04555511, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.5984128639236252, + "language_loss": 0.87345672, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89652562, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.5866260528564453 + }, + { + "auxiliary_loss_clip": 0.01232491, + "auxiliary_loss_mlp": 0.01083298, + "balance_loss_clip": 1.05894816, + "balance_loss_mlp": 1.05254161, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 3.229524210650681, + "language_loss": 0.8379488, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86110663, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.5574512481689453 + }, + { + "auxiliary_loss_clip": 0.01228886, + "auxiliary_loss_mlp": 0.01068939, + "balance_loss_clip": 1.06414914, + "balance_loss_mlp": 1.0412581, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.3708048119134597, + "language_loss": 0.74300504, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76598328, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.63714337348938 + }, + { + "auxiliary_loss_clip": 0.01225416, + "auxiliary_loss_mlp": 0.01074867, + "balance_loss_clip": 1.06238043, + "balance_loss_mlp": 1.04411113, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 1.9783027920775573, + "language_loss": 0.78504366, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80804652, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.5659923553466797 + }, + { + "auxiliary_loss_clip": 0.01223717, + "auxiliary_loss_mlp": 0.01064354, + "balance_loss_clip": 1.06054354, + "balance_loss_mlp": 1.03707922, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.361803321774686, + "language_loss": 0.73623681, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75911748, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 2.6943743228912354 + }, + { + "auxiliary_loss_clip": 0.01226067, + "auxiliary_loss_mlp": 0.01064325, + "balance_loss_clip": 1.05951262, + "balance_loss_mlp": 1.03483319, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.9387456516713713, + "language_loss": 0.77302271, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79592663, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 2.5916919708251953 + }, + { + "auxiliary_loss_clip": 0.01220261, + "auxiliary_loss_mlp": 0.01067689, + "balance_loss_clip": 1.05845308, + "balance_loss_mlp": 1.04048586, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.926923566922942, + "language_loss": 0.71170914, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73458862, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 2.579301118850708 + }, + { + "auxiliary_loss_clip": 0.0122662, + "auxiliary_loss_mlp": 0.01078899, + "balance_loss_clip": 1.06103313, + "balance_loss_mlp": 1.04907298, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 2.046489845797133, + "language_loss": 0.76754582, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79060102, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 2.543827533721924 + }, + { + "auxiliary_loss_clip": 0.01221432, + "auxiliary_loss_mlp": 0.01070377, + "balance_loss_clip": 1.05907536, + "balance_loss_mlp": 1.0407536, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.332249058353214, + "language_loss": 0.79189217, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81481028, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 2.535888910293579 + }, + { + "auxiliary_loss_clip": 0.01216405, + "auxiliary_loss_mlp": 0.01069632, + "balance_loss_clip": 1.05675459, + "balance_loss_mlp": 1.04241681, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 3.011111547184208, + "language_loss": 0.77632183, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79918224, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 2.5872578620910645 + }, + { + "auxiliary_loss_clip": 0.0121954, + "auxiliary_loss_mlp": 0.01070333, + "balance_loss_clip": 1.05876231, + "balance_loss_mlp": 1.04413068, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 1.6727111506679533, + "language_loss": 0.76563817, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78853691, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 2.5531437397003174 + }, + { + "auxiliary_loss_clip": 0.01230187, + "auxiliary_loss_mlp": 0.0107563, + "balance_loss_clip": 1.06182921, + "balance_loss_mlp": 1.04570806, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.2179633191516395, + "language_loss": 0.73960912, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76266724, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 2.609363555908203 + }, + { + "auxiliary_loss_clip": 0.01225431, + "auxiliary_loss_mlp": 0.01063912, + "balance_loss_clip": 1.06277752, + "balance_loss_mlp": 1.03710198, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.1673087142739167, + "language_loss": 0.80195665, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82485008, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 2.521433115005493 + }, + { + "auxiliary_loss_clip": 0.01216689, + "auxiliary_loss_mlp": 0.01059837, + "balance_loss_clip": 1.05923533, + "balance_loss_mlp": 1.03337288, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 1.8507946915223188, + "language_loss": 0.81600839, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83877367, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.5187487602233887 + }, + { + "auxiliary_loss_clip": 0.01222727, + "auxiliary_loss_mlp": 0.01077609, + "balance_loss_clip": 1.05643368, + "balance_loss_mlp": 1.05075133, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.3444692462101, + "language_loss": 0.80379367, + "learning_rate": 3.996899089108607e-06, + "loss": 0.82679713, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 2.6802444458007812 + }, + { + "auxiliary_loss_clip": 0.01223941, + "auxiliary_loss_mlp": 0.01062589, + "balance_loss_clip": 1.06334114, + "balance_loss_mlp": 1.03656602, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 1.8752271707957793, + "language_loss": 0.90013123, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92299652, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.6496269702911377 + }, + { + "auxiliary_loss_clip": 0.01223247, + "auxiliary_loss_mlp": 0.0107025, + "balance_loss_clip": 1.05468929, + "balance_loss_mlp": 1.03982782, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.4479638745362027, + "language_loss": 0.76358163, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78651661, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.5559604167938232 + }, + { + "auxiliary_loss_clip": 0.01229713, + "auxiliary_loss_mlp": 0.01068824, + "balance_loss_clip": 1.06484532, + "balance_loss_mlp": 1.04078603, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.6889530436703994, + "language_loss": 0.81049448, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83347982, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.578477621078491 + }, + { + "auxiliary_loss_clip": 0.01218744, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_clip": 1.0602988, + "balance_loss_mlp": 1.04787362, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 1.9264844405228028, + "language_loss": 0.84905607, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87202001, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.5814805030822754 + }, + { + "auxiliary_loss_clip": 0.01222164, + "auxiliary_loss_mlp": 0.01067489, + "balance_loss_clip": 1.05995512, + "balance_loss_mlp": 1.04109597, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 2.1273184845570783, + "language_loss": 0.82306314, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84595966, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.508052349090576 + }, + { + "auxiliary_loss_clip": 0.01219643, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_clip": 1.05766106, + "balance_loss_mlp": 1.03467703, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 8.95353752521802, + "language_loss": 0.88042331, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90324754, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.704960823059082 + }, + { + "auxiliary_loss_clip": 0.01222426, + "auxiliary_loss_mlp": 0.01066371, + "balance_loss_clip": 1.06065786, + "balance_loss_mlp": 1.03869033, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 3.1445302669850634, + "language_loss": 0.90314186, + "learning_rate": 3.996745480347854e-06, + "loss": 0.9260298, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.524369955062866 + }, + { + "auxiliary_loss_clip": 0.01222725, + "auxiliary_loss_mlp": 0.01076114, + "balance_loss_clip": 1.0591433, + "balance_loss_mlp": 1.04960179, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9995666612321539, + "language_loss": 0.73756593, + "learning_rate": 3.996723233365324e-06, + "loss": 0.76055431, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 2.5109941959381104 + }, + { + "auxiliary_loss_clip": 0.01226222, + "auxiliary_loss_mlp": 0.01064827, + "balance_loss_clip": 1.0608151, + "balance_loss_mlp": 1.036479, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0987615229142427, + "language_loss": 0.8626008, + "learning_rate": 3.996700910666847e-06, + "loss": 0.8855114, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 3.998858690261841 + }, + { + "auxiliary_loss_clip": 0.01223373, + "auxiliary_loss_mlp": 0.01076104, + "balance_loss_clip": 1.05868816, + "balance_loss_mlp": 1.04797041, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 2.5681321869804705, + "language_loss": 0.70133698, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72433174, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 4.1807637214660645 + }, + { + "auxiliary_loss_clip": 0.0122136, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_clip": 1.06013203, + "balance_loss_mlp": 1.04943156, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 1.9218419666546116, + "language_loss": 0.80924225, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83222562, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 2.557271957397461 + }, + { + "auxiliary_loss_clip": 0.01224757, + "auxiliary_loss_mlp": 0.01062637, + "balance_loss_clip": 1.06225002, + "balance_loss_mlp": 1.03467035, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.095590169118188, + "language_loss": 0.81671542, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83958936, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 3.985780954360962 + }, + { + "auxiliary_loss_clip": 0.01093495, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.01889241, + "balance_loss_mlp": 1.02555048, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9184639331186578, + "language_loss": 0.64443576, + "learning_rate": 3.996610862730465e-06, + "loss": 0.6656701, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 3.0414888858795166 + }, + { + "auxiliary_loss_clip": 0.0122697, + "auxiliary_loss_mlp": 0.01071026, + "balance_loss_clip": 1.05765676, + "balance_loss_mlp": 1.04381037, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 2.297009543343717, + "language_loss": 0.91319889, + "learning_rate": 3.996588161465018e-06, + "loss": 0.9361788, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 4.086096525192261 + }, + { + "auxiliary_loss_clip": 0.01224869, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_clip": 1.06339586, + "balance_loss_mlp": 1.04120469, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.137627211075001, + "language_loss": 0.86897326, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89191604, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.598067045211792 + }, + { + "auxiliary_loss_clip": 0.01224795, + "auxiliary_loss_mlp": 0.01067689, + "balance_loss_clip": 1.05994821, + "balance_loss_mlp": 1.04135513, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 2.937527341629371, + "language_loss": 0.84637415, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86929893, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.5635619163513184 + }, + { + "auxiliary_loss_clip": 0.01223773, + "auxiliary_loss_mlp": 0.01072737, + "balance_loss_clip": 1.06025457, + "balance_loss_mlp": 1.04554534, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.9193193688212518, + "language_loss": 0.80273545, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82570052, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 2.712559223175049 + }, + { + "auxiliary_loss_clip": 0.01224795, + "auxiliary_loss_mlp": 0.01063418, + "balance_loss_clip": 1.0628612, + "balance_loss_mlp": 1.03670347, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 1.7980897327751768, + "language_loss": 0.86853909, + "learning_rate": 3.996496599303649e-06, + "loss": 0.89142126, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 2.5635924339294434 + }, + { + "auxiliary_loss_clip": 0.01221204, + "auxiliary_loss_mlp": 0.0105878, + "balance_loss_clip": 1.06216061, + "balance_loss_mlp": 1.03148127, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.789558893396335, + "language_loss": 0.85003376, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87283361, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 2.49169659614563 + }, + { + "auxiliary_loss_clip": 0.01221967, + "auxiliary_loss_mlp": 0.01066938, + "balance_loss_clip": 1.060341, + "balance_loss_mlp": 1.03949642, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 1.8415939572192042, + "language_loss": 0.86143214, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88432121, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.5320217609405518 + }, + { + "auxiliary_loss_clip": 0.01216429, + "auxiliary_loss_mlp": 0.01065386, + "balance_loss_clip": 1.05705047, + "balance_loss_mlp": 1.03892136, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 2.3081597113018084, + "language_loss": 0.68304837, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70586646, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.52925968170166 + }, + { + "auxiliary_loss_clip": 0.01215464, + "auxiliary_loss_mlp": 0.01062083, + "balance_loss_clip": 1.05797648, + "balance_loss_mlp": 1.03509402, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.1645596904655684, + "language_loss": 0.76817405, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79094946, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.5822713375091553 + }, + { + "auxiliary_loss_clip": 0.01214571, + "auxiliary_loss_mlp": 0.01064678, + "balance_loss_clip": 1.0547955, + "balance_loss_mlp": 1.0380348, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.056315800019186, + "language_loss": 0.86499739, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88778985, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 2.50998854637146 + }, + { + "auxiliary_loss_clip": 0.01223938, + "auxiliary_loss_mlp": 0.01067992, + "balance_loss_clip": 1.05895364, + "balance_loss_mlp": 1.04049015, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 2.0190004340856516, + "language_loss": 0.89823592, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92115521, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.5279955863952637 + }, + { + "auxiliary_loss_clip": 0.01223651, + "auxiliary_loss_mlp": 0.01067927, + "balance_loss_clip": 1.06230807, + "balance_loss_mlp": 1.04054451, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 1.9591016122929479, + "language_loss": 0.848535, + "learning_rate": 3.996333450822208e-06, + "loss": 0.87145078, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 2.5787353515625 + }, + { + "auxiliary_loss_clip": 0.01224424, + "auxiliary_loss_mlp": 0.01069352, + "balance_loss_clip": 1.06086397, + "balance_loss_mlp": 1.04207683, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.7150880285774892, + "language_loss": 0.81133747, + "learning_rate": 3.99630984108452e-06, + "loss": 0.83427525, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.5659866333007812 + }, + { + "auxiliary_loss_clip": 0.0121494, + "auxiliary_loss_mlp": 0.01068572, + "balance_loss_clip": 1.05762708, + "balance_loss_mlp": 1.04178584, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 3.595070164264677, + "language_loss": 0.74346745, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.76630259, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.4837725162506104 + }, + { + "auxiliary_loss_clip": 0.01216427, + "auxiliary_loss_mlp": 0.01075819, + "balance_loss_clip": 1.06066155, + "balance_loss_mlp": 1.04970062, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 2.2129661762401858, + "language_loss": 0.89851618, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92143857, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.5810153484344482 + }, + { + "auxiliary_loss_clip": 0.01216542, + "auxiliary_loss_mlp": 0.01057801, + "balance_loss_clip": 1.05914629, + "balance_loss_mlp": 1.03193235, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 1.762173334665777, + "language_loss": 0.74685729, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76960069, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 2.538386583328247 + }, + { + "auxiliary_loss_clip": 0.01217992, + "auxiliary_loss_mlp": 0.01073744, + "balance_loss_clip": 1.05848074, + "balance_loss_mlp": 1.04597986, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.1322653752681147, + "language_loss": 0.83683658, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.85975385, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 2.588604688644409 + }, + { + "auxiliary_loss_clip": 0.01221181, + "auxiliary_loss_mlp": 0.01063263, + "balance_loss_clip": 1.06081724, + "balance_loss_mlp": 1.03621483, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.1343190704750246, + "language_loss": 0.91215819, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93500268, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 2.550574541091919 + }, + { + "auxiliary_loss_clip": 0.01222347, + "auxiliary_loss_mlp": 0.01065186, + "balance_loss_clip": 1.06005812, + "balance_loss_mlp": 1.0383637, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.5852049173314806, + "language_loss": 0.80061984, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82349515, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 2.510561227798462 + }, + { + "auxiliary_loss_clip": 0.01218355, + "auxiliary_loss_mlp": 0.0107938, + "balance_loss_clip": 1.06024408, + "balance_loss_mlp": 1.05239081, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.9947543200333386, + "language_loss": 0.84803653, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87101388, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 2.5331332683563232 + }, + { + "auxiliary_loss_clip": 0.01223303, + "auxiliary_loss_mlp": 0.01064525, + "balance_loss_clip": 1.0595758, + "balance_loss_mlp": 1.03696346, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.2054280573367127, + "language_loss": 0.75774002, + "learning_rate": 3.996118238049124e-06, + "loss": 0.78061831, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 2.5172111988067627 + }, + { + "auxiliary_loss_clip": 0.0122226, + "auxiliary_loss_mlp": 0.0106957, + "balance_loss_clip": 1.06370008, + "balance_loss_mlp": 1.04477417, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.3672322905026064, + "language_loss": 0.85089958, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87381792, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.469421148300171 + }, + { + "auxiliary_loss_clip": 0.0121723, + "auxiliary_loss_mlp": 0.01065344, + "balance_loss_clip": 1.05805373, + "balance_loss_mlp": 1.03868914, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.7902933697918277, + "language_loss": 0.90587771, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92870355, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 2.504417896270752 + }, + { + "auxiliary_loss_clip": 0.01216951, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_clip": 1.057603, + "balance_loss_mlp": 1.04829025, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 1.8466862246835343, + "language_loss": 0.89740491, + "learning_rate": 3.996045137951188e-06, + "loss": 0.92032099, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.461290121078491 + }, + { + "auxiliary_loss_clip": 0.01219468, + "auxiliary_loss_mlp": 0.01063424, + "balance_loss_clip": 1.06117225, + "balance_loss_mlp": 1.03427696, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 1.9496465519795594, + "language_loss": 0.67538679, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69821572, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.6557867527008057 + }, + { + "auxiliary_loss_clip": 0.01092359, + "auxiliary_loss_mlp": 0.01010814, + "balance_loss_clip": 1.01828051, + "balance_loss_mlp": 1.00595045, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3205335663629543, + "language_loss": 0.62235123, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64338291, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.175591468811035 + }, + { + "auxiliary_loss_clip": 0.01221691, + "auxiliary_loss_mlp": 0.01075295, + "balance_loss_clip": 1.05984926, + "balance_loss_mlp": 1.04594588, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 1.8712780306868257, + "language_loss": 0.9044292, + "learning_rate": 3.995971356641185e-06, + "loss": 0.9273991, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.5780117511749268 + }, + { + "auxiliary_loss_clip": 0.01220834, + "auxiliary_loss_mlp": 0.01068115, + "balance_loss_clip": 1.05996108, + "balance_loss_mlp": 1.03907561, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 2.7282063088517954, + "language_loss": 0.67131931, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.6942088, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 2.573397636413574 + }, + { + "auxiliary_loss_clip": 0.01221531, + "auxiliary_loss_mlp": 0.01071801, + "balance_loss_clip": 1.06045842, + "balance_loss_mlp": 1.04335773, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 5.242222029540928, + "language_loss": 0.78216511, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80509841, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.554018020629883 + }, + { + "auxiliary_loss_clip": 0.01222986, + "auxiliary_loss_mlp": 0.01079955, + "balance_loss_clip": 1.06038451, + "balance_loss_mlp": 1.05102301, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.03965168428283, + "language_loss": 0.78603876, + "learning_rate": 3.995896894144294e-06, + "loss": 0.8090682, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.6816344261169434 + }, + { + "auxiliary_loss_clip": 0.01212557, + "auxiliary_loss_mlp": 0.01060656, + "balance_loss_clip": 1.05728018, + "balance_loss_mlp": 1.0341444, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.3604250857968254, + "language_loss": 0.83773994, + "learning_rate": 3.995871921941519e-06, + "loss": 0.86047208, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.525698184967041 + }, + { + "auxiliary_loss_clip": 0.01220475, + "auxiliary_loss_mlp": 0.01079921, + "balance_loss_clip": 1.05846906, + "balance_loss_mlp": 1.04878354, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.0800635425391265, + "language_loss": 0.75220287, + "learning_rate": 3.99584687405508e-06, + "loss": 0.7752068, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.5408735275268555 + }, + { + "auxiliary_loss_clip": 0.01217727, + "auxiliary_loss_mlp": 0.01074432, + "balance_loss_clip": 1.05790579, + "balance_loss_mlp": 1.04612017, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.8133282561241735, + "language_loss": 0.79415244, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81707406, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.5202529430389404 + }, + { + "auxiliary_loss_clip": 0.01160897, + "auxiliary_loss_mlp": 0.01076089, + "balance_loss_clip": 1.05177021, + "balance_loss_mlp": 1.04864693, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.399553014842414, + "language_loss": 0.9160679, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93843776, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.7140941619873047 + }, + { + "auxiliary_loss_clip": 0.01184287, + "auxiliary_loss_mlp": 0.01081676, + "balance_loss_clip": 1.05517375, + "balance_loss_mlp": 1.05548549, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 2.3336403248274067, + "language_loss": 0.83292377, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85558337, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 2.875067949295044 + }, + { + "auxiliary_loss_clip": 0.01194201, + "auxiliary_loss_mlp": 0.01064451, + "balance_loss_clip": 1.05644894, + "balance_loss_mlp": 1.03560281, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.560028634426205, + "language_loss": 0.8198446, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84243113, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.7735490798950195 + }, + { + "auxiliary_loss_clip": 0.01208531, + "auxiliary_loss_mlp": 0.01074085, + "balance_loss_clip": 1.05816305, + "balance_loss_mlp": 1.04471231, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.192028986700683, + "language_loss": 0.91807014, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94089627, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 3.9965245723724365 + }, + { + "auxiliary_loss_clip": 0.01220565, + "auxiliary_loss_mlp": 0.01073668, + "balance_loss_clip": 1.05650139, + "balance_loss_mlp": 1.04307926, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 3.100793996359278, + "language_loss": 0.76302624, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78596854, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 4.00517201423645 + }, + { + "auxiliary_loss_clip": 0.0120358, + "auxiliary_loss_mlp": 0.01074854, + "balance_loss_clip": 1.05901098, + "balance_loss_mlp": 1.04705429, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.2158231223225053, + "language_loss": 0.83693659, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85972095, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 4.1283087730407715 + }, + { + "auxiliary_loss_clip": 0.01190009, + "auxiliary_loss_mlp": 0.00752855, + "balance_loss_clip": 1.05900347, + "balance_loss_mlp": 1.00062299, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 5.0765667478126195, + "language_loss": 0.72923666, + "learning_rate": 3.995643766466275e-06, + "loss": 0.74866527, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.605175495147705 + }, + { + "auxiliary_loss_clip": 0.01178804, + "auxiliary_loss_mlp": 0.0107405, + "balance_loss_clip": 1.0503726, + "balance_loss_mlp": 1.04572606, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.7576269881744764, + "language_loss": 0.83430851, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85683703, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.5992937088012695 + }, + { + "auxiliary_loss_clip": 0.01212178, + "auxiliary_loss_mlp": 0.01075096, + "balance_loss_clip": 1.05603158, + "balance_loss_mlp": 1.04722512, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.1480285614363313, + "language_loss": 0.85760152, + "learning_rate": 3.995592232799595e-06, + "loss": 0.88047421, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 4.0126259326934814 + }, + { + "auxiliary_loss_clip": 0.01180207, + "auxiliary_loss_mlp": 0.01066262, + "balance_loss_clip": 1.05128884, + "balance_loss_mlp": 1.03622127, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 1.7498873441746559, + "language_loss": 0.94611478, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96857953, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.6473734378814697 + }, + { + "auxiliary_loss_clip": 0.01220021, + "auxiliary_loss_mlp": 0.01073162, + "balance_loss_clip": 1.05963326, + "balance_loss_mlp": 1.04331195, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.102320167778716, + "language_loss": 0.77541721, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79834902, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 2.634192943572998 + }, + { + "auxiliary_loss_clip": 0.01210121, + "auxiliary_loss_mlp": 0.01073729, + "balance_loss_clip": 1.061216, + "balance_loss_mlp": 1.0435214, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.6958391895477316, + "language_loss": 0.7828294, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80566788, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 2.554885149002075 + }, + { + "auxiliary_loss_clip": 0.0120778, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.06037545, + "balance_loss_mlp": 1.0399096, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.5735698219073972, + "language_loss": 0.82930291, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85204935, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.7130846977233887 + }, + { + "auxiliary_loss_clip": 0.01209269, + "auxiliary_loss_mlp": 0.01065805, + "balance_loss_clip": 1.05962229, + "balance_loss_mlp": 1.03759992, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.083909845955241, + "language_loss": 0.76088655, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78363729, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 2.622920036315918 + }, + { + "auxiliary_loss_clip": 0.01197415, + "auxiliary_loss_mlp": 0.0106708, + "balance_loss_clip": 1.05316317, + "balance_loss_mlp": 1.03843355, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 3.077009483636115, + "language_loss": 0.87628144, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89892644, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.5992815494537354 + }, + { + "auxiliary_loss_clip": 0.01196079, + "auxiliary_loss_mlp": 0.01072872, + "balance_loss_clip": 1.05642664, + "balance_loss_mlp": 1.04383302, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.965788809840043, + "language_loss": 0.87649751, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89918709, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 2.6616995334625244 + }, + { + "auxiliary_loss_clip": 0.0116646, + "auxiliary_loss_mlp": 0.01067008, + "balance_loss_clip": 1.05729246, + "balance_loss_mlp": 1.03812397, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.8377529990332997, + "language_loss": 0.82320321, + "learning_rate": 3.995383071289462e-06, + "loss": 0.8455379, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.707226037979126 + }, + { + "auxiliary_loss_clip": 0.01219378, + "auxiliary_loss_mlp": 0.01075427, + "balance_loss_clip": 1.06180322, + "balance_loss_mlp": 1.04667425, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.8314482246395596, + "language_loss": 0.87435335, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89730138, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 2.639620304107666 + }, + { + "auxiliary_loss_clip": 0.01213313, + "auxiliary_loss_mlp": 0.01058503, + "balance_loss_clip": 1.05647922, + "balance_loss_mlp": 1.03047729, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.9311541201673807, + "language_loss": 0.83207059, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85478878, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.5941410064697266 + }, + { + "auxiliary_loss_clip": 0.01202886, + "auxiliary_loss_mlp": 0.0106598, + "balance_loss_clip": 1.05658031, + "balance_loss_mlp": 1.03823972, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.0230066069920842, + "language_loss": 0.65107286, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67376149, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.76772403717041 + }, + { + "auxiliary_loss_clip": 0.01200375, + "auxiliary_loss_mlp": 0.01073657, + "balance_loss_clip": 1.05511534, + "balance_loss_mlp": 1.04304361, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.170527238917256, + "language_loss": 0.83745062, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86019099, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 2.6067028045654297 + }, + { + "auxiliary_loss_clip": 0.01189014, + "auxiliary_loss_mlp": 0.01074544, + "balance_loss_clip": 1.05601478, + "balance_loss_mlp": 1.0446105, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.914992859970532, + "language_loss": 0.80388057, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82651615, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 2.7295119762420654 + }, + { + "auxiliary_loss_clip": 0.01215068, + "auxiliary_loss_mlp": 0.01074301, + "balance_loss_clip": 1.05830765, + "balance_loss_mlp": 1.04430842, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 2.295468885739581, + "language_loss": 0.76045692, + "learning_rate": 3.995223022193999e-06, + "loss": 0.78335059, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.5982825756073 + }, + { + "auxiliary_loss_clip": 0.01192277, + "auxiliary_loss_mlp": 0.01072947, + "balance_loss_clip": 1.05653036, + "balance_loss_mlp": 1.04337132, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.0995183101304886, + "language_loss": 0.81449604, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83714825, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.705984354019165 + }, + { + "auxiliary_loss_clip": 0.01061398, + "auxiliary_loss_mlp": 0.00750895, + "balance_loss_clip": 1.02090907, + "balance_loss_mlp": 1.00052512, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 0.9723879033724239, + "language_loss": 0.65651417, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67463708, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 3.170414447784424 + }, + { + "auxiliary_loss_clip": 0.01184651, + "auxiliary_loss_mlp": 0.01060603, + "balance_loss_clip": 1.05434871, + "balance_loss_mlp": 1.03215945, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.786233213683338, + "language_loss": 0.77030027, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.7927528, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 2.718254566192627 + }, + { + "auxiliary_loss_clip": 0.01170805, + "auxiliary_loss_mlp": 0.01065037, + "balance_loss_clip": 1.05092347, + "balance_loss_mlp": 1.03540206, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.185878483409036, + "language_loss": 0.8908999, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91325837, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 2.644718647003174 + }, + { + "auxiliary_loss_clip": 0.01190387, + "auxiliary_loss_mlp": 0.01063422, + "balance_loss_clip": 1.05799985, + "balance_loss_mlp": 1.03435922, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 2.048682300904175, + "language_loss": 0.75311625, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77565438, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 2.6076042652130127 + }, + { + "auxiliary_loss_clip": 0.01176086, + "auxiliary_loss_mlp": 0.01081136, + "balance_loss_clip": 1.05373716, + "balance_loss_mlp": 1.05023718, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.044210969446307, + "language_loss": 0.90982366, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93239582, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.5934531688690186 + }, + { + "auxiliary_loss_clip": 0.01218718, + "auxiliary_loss_mlp": 0.01066206, + "balance_loss_clip": 1.06148505, + "balance_loss_mlp": 1.03900242, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.968456673987507, + "language_loss": 0.82359993, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8464492, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 2.5687315464019775 + }, + { + "auxiliary_loss_clip": 0.011932, + "auxiliary_loss_mlp": 0.0106197, + "balance_loss_clip": 1.0580225, + "balance_loss_mlp": 1.03405094, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 1.7576799159286969, + "language_loss": 0.78711683, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80966854, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.646284580230713 + }, + { + "auxiliary_loss_clip": 0.01181252, + "auxiliary_loss_mlp": 0.0107243, + "balance_loss_clip": 1.05614972, + "balance_loss_mlp": 1.04552436, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 2.0213097372231656, + "language_loss": 0.8910991, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91363585, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.594451665878296 + }, + { + "auxiliary_loss_clip": 0.01195401, + "auxiliary_loss_mlp": 0.01068011, + "balance_loss_clip": 1.05988717, + "balance_loss_mlp": 1.03758836, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.655150021937709, + "language_loss": 0.75788605, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78052014, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.6841413974761963 + }, + { + "auxiliary_loss_clip": 0.01174914, + "auxiliary_loss_mlp": 0.01073449, + "balance_loss_clip": 1.05302298, + "balance_loss_mlp": 1.04357505, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 2.4795427786210205, + "language_loss": 0.78884387, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81132746, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.674816608428955 + }, + { + "auxiliary_loss_clip": 0.01205745, + "auxiliary_loss_mlp": 0.0107013, + "balance_loss_clip": 1.05836785, + "balance_loss_mlp": 1.04137647, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.2902425359820966, + "language_loss": 0.8595556, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88231438, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 2.5711276531219482 + }, + { + "auxiliary_loss_clip": 0.01177574, + "auxiliary_loss_mlp": 0.01066265, + "balance_loss_clip": 1.05900121, + "balance_loss_mlp": 1.03788173, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.417264812956134, + "language_loss": 0.87153888, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89397728, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.705500602722168 + }, + { + "auxiliary_loss_clip": 0.01157, + "auxiliary_loss_mlp": 0.01073231, + "balance_loss_clip": 1.05455315, + "balance_loss_mlp": 1.04748178, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.4524542466910229, + "language_loss": 0.63683259, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65913492, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.776444911956787 + }, + { + "auxiliary_loss_clip": 0.01218804, + "auxiliary_loss_mlp": 0.0107158, + "balance_loss_clip": 1.05939937, + "balance_loss_mlp": 1.04093146, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 1.9055976952109885, + "language_loss": 0.83395457, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85685837, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.5471041202545166 + }, + { + "auxiliary_loss_clip": 0.0120762, + "auxiliary_loss_mlp": 0.01065238, + "balance_loss_clip": 1.05957282, + "balance_loss_mlp": 1.03665113, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.0397066812467, + "language_loss": 0.87784207, + "learning_rate": 3.994782909218751e-06, + "loss": 0.90057063, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.6269986629486084 + }, + { + "auxiliary_loss_clip": 0.01220763, + "auxiliary_loss_mlp": 0.01068993, + "balance_loss_clip": 1.06274438, + "balance_loss_mlp": 1.04094291, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 2.1665131328252243, + "language_loss": 0.81195498, + "learning_rate": 3.994754759152854e-06, + "loss": 0.83485258, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.5395150184631348 + }, + { + "auxiliary_loss_clip": 0.01188821, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_clip": 1.06138003, + "balance_loss_mlp": 1.04165006, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.9280578590636648, + "language_loss": 0.81398362, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83655751, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.674868106842041 + }, + { + "auxiliary_loss_clip": 0.0106313, + "auxiliary_loss_mlp": 0.01021997, + "balance_loss_clip": 1.02304721, + "balance_loss_mlp": 1.01653767, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8825969904012524, + "language_loss": 0.61598384, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63683516, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 3.069946765899658 + }, + { + "auxiliary_loss_clip": 0.01185847, + "auxiliary_loss_mlp": 0.01059824, + "balance_loss_clip": 1.05623567, + "balance_loss_mlp": 1.03190482, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.7825933705355714, + "language_loss": 0.88758194, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91003865, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.6266121864318848 + }, + { + "auxiliary_loss_clip": 0.01186152, + "auxiliary_loss_mlp": 0.01068967, + "balance_loss_clip": 1.05425262, + "balance_loss_mlp": 1.04083371, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.8983674688698118, + "language_loss": 0.74474853, + "learning_rate": 3.994641402486977e-06, + "loss": 0.76729977, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.7473747730255127 + }, + { + "auxiliary_loss_clip": 0.01196602, + "auxiliary_loss_mlp": 0.01061032, + "balance_loss_clip": 1.05673611, + "balance_loss_mlp": 1.03190899, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 1.6589131391474916, + "language_loss": 0.9263081, + "learning_rate": 3.99461287422531e-06, + "loss": 0.94888449, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 5.77654504776001 + }, + { + "auxiliary_loss_clip": 0.01089175, + "auxiliary_loss_mlp": 0.01014047, + "balance_loss_clip": 1.01949275, + "balance_loss_mlp": 1.00975537, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8199865893372356, + "language_loss": 0.6293065, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65033871, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 4.595096111297607 + }, + { + "auxiliary_loss_clip": 0.0119005, + "auxiliary_loss_mlp": 0.01071383, + "balance_loss_clip": 1.05512345, + "balance_loss_mlp": 1.0418787, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.9662881424705603, + "language_loss": 0.85915613, + "learning_rate": 3.994555590795299e-06, + "loss": 0.88177049, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.7196595668792725 + }, + { + "auxiliary_loss_clip": 0.01216112, + "auxiliary_loss_mlp": 0.01066838, + "balance_loss_clip": 1.05884051, + "balance_loss_mlp": 1.03895509, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 1.694555060982645, + "language_loss": 0.83109915, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85392863, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 4.134324312210083 + }, + { + "auxiliary_loss_clip": 0.01183366, + "auxiliary_loss_mlp": 0.01071157, + "balance_loss_clip": 1.05681157, + "balance_loss_mlp": 1.04230809, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 2.4637650292156468, + "language_loss": 0.8458904, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86843562, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 2.714526414871216 + }, + { + "auxiliary_loss_clip": 0.01174148, + "auxiliary_loss_mlp": 0.01069393, + "balance_loss_clip": 1.0558821, + "balance_loss_mlp": 1.04159331, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.270381987155053, + "language_loss": 0.87093872, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89337409, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.671323776245117 + }, + { + "auxiliary_loss_clip": 0.01199225, + "auxiliary_loss_mlp": 0.01060052, + "balance_loss_clip": 1.05556989, + "balance_loss_mlp": 1.03096545, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9419801567190365, + "language_loss": 0.87986141, + "learning_rate": 3.994440116339046e-06, + "loss": 0.9024542, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 2.684614419937134 + }, + { + "auxiliary_loss_clip": 0.01216439, + "auxiliary_loss_mlp": 0.01063476, + "balance_loss_clip": 1.05858696, + "balance_loss_mlp": 1.03426981, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.436318462174364, + "language_loss": 0.69233847, + "learning_rate": 3.994411058648816e-06, + "loss": 0.7151376, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.6566503047943115 + }, + { + "auxiliary_loss_clip": 0.01156902, + "auxiliary_loss_mlp": 0.01069479, + "balance_loss_clip": 1.05241418, + "balance_loss_mlp": 1.04148889, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 2.5151631806508323, + "language_loss": 0.7619527, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78421652, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.7089884281158447 + }, + { + "auxiliary_loss_clip": 0.01170269, + "auxiliary_loss_mlp": 0.01067442, + "balance_loss_clip": 1.06496477, + "balance_loss_mlp": 1.04060757, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 1.8799758150624382, + "language_loss": 0.85828793, + "learning_rate": 3.994352716384659e-06, + "loss": 0.88066506, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.7855401039123535 + }, + { + "auxiliary_loss_clip": 0.01172922, + "auxiliary_loss_mlp": 0.01070087, + "balance_loss_clip": 1.05136669, + "balance_loss_mlp": 1.04195321, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 3.202926155832138, + "language_loss": 0.86133784, + "learning_rate": 3.994323431812945e-06, + "loss": 0.8837679, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 2.5910990238189697 + }, + { + "auxiliary_loss_clip": 0.01165473, + "auxiliary_loss_mlp": 0.0106886, + "balance_loss_clip": 1.05285704, + "balance_loss_mlp": 1.0397377, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.4829289672264854, + "language_loss": 0.8949213, + "learning_rate": 3.994294071616286e-06, + "loss": 0.9172647, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.6461801528930664 + }, + { + "auxiliary_loss_clip": 0.01130062, + "auxiliary_loss_mlp": 0.01074964, + "balance_loss_clip": 1.04351866, + "balance_loss_mlp": 1.04385066, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 1.8316326375026477, + "language_loss": 0.7518689, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77391911, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 2.711977481842041 + }, + { + "auxiliary_loss_clip": 0.01157644, + "auxiliary_loss_mlp": 0.01076117, + "balance_loss_clip": 1.0542233, + "balance_loss_mlp": 1.04602826, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.049056221178542, + "language_loss": 0.8865875, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90892512, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.6752073764801025 + }, + { + "auxiliary_loss_clip": 0.01207144, + "auxiliary_loss_mlp": 0.01051024, + "balance_loss_clip": 1.05618441, + "balance_loss_mlp": 1.02435732, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 2.0403297204419975, + "language_loss": 0.88984883, + "learning_rate": 3.994205537287791e-06, + "loss": 0.91243052, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 2.51710844039917 + }, + { + "auxiliary_loss_clip": 0.01187019, + "auxiliary_loss_mlp": 0.01071887, + "balance_loss_clip": 1.05410218, + "balance_loss_mlp": 1.04564929, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.1049261506898844, + "language_loss": 0.92605603, + "learning_rate": 3.994175874602517e-06, + "loss": 0.94864506, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 2.656891345977783 + }, + { + "auxiliary_loss_clip": 0.01179181, + "auxiliary_loss_mlp": 0.01071019, + "balance_loss_clip": 1.05308437, + "balance_loss_mlp": 1.04095459, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.0713780545879903, + "language_loss": 0.7194652, + "learning_rate": 3.994146136297893e-06, + "loss": 0.7419672, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.5371851921081543 + }, + { + "auxiliary_loss_clip": 0.01185892, + "auxiliary_loss_mlp": 0.00752851, + "balance_loss_clip": 1.0559715, + "balance_loss_mlp": 1.00055313, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 1.9105516532509959, + "language_loss": 0.81995177, + "learning_rate": 3.994116322375049e-06, + "loss": 0.83933914, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 2.7209582328796387 + }, + { + "auxiliary_loss_clip": 0.01188825, + "auxiliary_loss_mlp": 0.01068869, + "balance_loss_clip": 1.05601978, + "balance_loss_mlp": 1.04189205, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 1.9292438507595129, + "language_loss": 0.81859177, + "learning_rate": 3.994086432835114e-06, + "loss": 0.8411687, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 2.709705114364624 + }, + { + "auxiliary_loss_clip": 0.01190433, + "auxiliary_loss_mlp": 0.01063433, + "balance_loss_clip": 1.05448115, + "balance_loss_mlp": 1.03665817, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.535468151297998, + "language_loss": 0.75896209, + "learning_rate": 3.994056467679221e-06, + "loss": 0.78150076, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 2.598442554473877 + }, + { + "auxiliary_loss_clip": 0.01191559, + "auxiliary_loss_mlp": 0.01061694, + "balance_loss_clip": 1.05933905, + "balance_loss_mlp": 1.03418016, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 1.9102809054893617, + "language_loss": 0.86626554, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88879806, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 2.661485433578491 + }, + { + "auxiliary_loss_clip": 0.01214157, + "auxiliary_loss_mlp": 0.00752835, + "balance_loss_clip": 1.05733526, + "balance_loss_mlp": 1.00058293, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.776430131573475, + "language_loss": 0.87992913, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.89959908, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 2.6426846981048584 + }, + { + "auxiliary_loss_clip": 0.0119455, + "auxiliary_loss_mlp": 0.01068735, + "balance_loss_clip": 1.05682516, + "balance_loss_mlp": 1.03969526, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.733482997812678, + "language_loss": 0.90221035, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92484319, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 2.6239981651306152 + }, + { + "auxiliary_loss_clip": 0.01188356, + "auxiliary_loss_mlp": 0.01076498, + "balance_loss_clip": 1.05293536, + "balance_loss_mlp": 1.04896069, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 2.7054896505739903, + "language_loss": 0.91933954, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94198811, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.6253273487091064 + }, + { + "auxiliary_loss_clip": 0.01174933, + "auxiliary_loss_mlp": 0.01069878, + "balance_loss_clip": 1.05333257, + "balance_loss_mlp": 1.04150653, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.0158482921957783, + "language_loss": 0.76074129, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.78318942, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 2.675446033477783 + }, + { + "auxiliary_loss_clip": 0.01201494, + "auxiliary_loss_mlp": 0.01061384, + "balance_loss_clip": 1.05501592, + "balance_loss_mlp": 1.03500283, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.238708287324031, + "language_loss": 0.74045956, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76308835, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.6140708923339844 + }, + { + "auxiliary_loss_clip": 0.01166667, + "auxiliary_loss_mlp": 0.01070842, + "balance_loss_clip": 1.05236757, + "balance_loss_mlp": 1.04517651, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.197279924765735, + "language_loss": 0.8509832, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87335837, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.6636788845062256 + }, + { + "auxiliary_loss_clip": 0.01143462, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.04441166, + "balance_loss_mlp": 1.03976941, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.941556653947837, + "language_loss": 0.86628211, + "learning_rate": 3.993814024394569e-06, + "loss": 0.88838816, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.7540841102600098 + }, + { + "auxiliary_loss_clip": 0.01197296, + "auxiliary_loss_mlp": 0.01063259, + "balance_loss_clip": 1.0551616, + "balance_loss_mlp": 1.03756976, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.5176331605736055, + "language_loss": 0.75163698, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77424258, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 2.546194553375244 + }, + { + "auxiliary_loss_clip": 0.01195105, + "auxiliary_loss_mlp": 0.01073653, + "balance_loss_clip": 1.05503154, + "balance_loss_mlp": 1.04753435, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.4605407830202632, + "language_loss": 0.85748696, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88017458, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 2.5465219020843506 + }, + { + "auxiliary_loss_clip": 0.01186766, + "auxiliary_loss_mlp": 0.0107289, + "balance_loss_clip": 1.05915391, + "balance_loss_mlp": 1.04792726, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.8866182567035292, + "language_loss": 0.74383491, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76643145, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.6459507942199707 + }, + { + "auxiliary_loss_clip": 0.01182751, + "auxiliary_loss_mlp": 0.01069695, + "balance_loss_clip": 1.05455828, + "balance_loss_mlp": 1.04309952, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.044070457998416, + "language_loss": 0.87461746, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89714193, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.654320001602173 + }, + { + "auxiliary_loss_clip": 0.01194924, + "auxiliary_loss_mlp": 0.01072083, + "balance_loss_clip": 1.05651617, + "balance_loss_mlp": 1.04541612, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.621802993451919, + "language_loss": 0.8711611, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89383113, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.5830466747283936 + }, + { + "auxiliary_loss_clip": 0.01189896, + "auxiliary_loss_mlp": 0.01070035, + "balance_loss_clip": 1.05468917, + "balance_loss_mlp": 1.04233098, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.5964416245039343, + "language_loss": 0.89565003, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91824937, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.736260414123535 + }, + { + "auxiliary_loss_clip": 0.01191977, + "auxiliary_loss_mlp": 0.01074221, + "balance_loss_clip": 1.05724859, + "balance_loss_mlp": 1.04645669, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.384931207297163, + "language_loss": 0.70736706, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73002899, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.7046077251434326 + }, + { + "auxiliary_loss_clip": 0.01175353, + "auxiliary_loss_mlp": 0.01054433, + "balance_loss_clip": 1.05119038, + "balance_loss_mlp": 1.0291369, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 1.8092932923239031, + "language_loss": 0.83410561, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85640347, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.6533281803131104 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_clip": 1.04991698, + "balance_loss_mlp": 1.04423904, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.2148468939970867, + "language_loss": 0.76022863, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78269416, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 2.5910775661468506 + }, + { + "auxiliary_loss_clip": 0.01179025, + "auxiliary_loss_mlp": 0.0105078, + "balance_loss_clip": 1.05361629, + "balance_loss_mlp": 1.02541184, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.6693285755478717, + "language_loss": 0.82829869, + "learning_rate": 3.993504165853694e-06, + "loss": 0.85059673, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.6490893363952637 + }, + { + "auxiliary_loss_clip": 0.01182648, + "auxiliary_loss_mlp": 0.01059119, + "balance_loss_clip": 1.05408454, + "balance_loss_mlp": 1.034109, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 1.630811767832591, + "language_loss": 0.83674836, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85916615, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.6707983016967773 + }, + { + "auxiliary_loss_clip": 0.01199051, + "auxiliary_loss_mlp": 0.0075269, + "balance_loss_clip": 1.05711913, + "balance_loss_mlp": 1.00057435, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.30433876466486, + "language_loss": 0.90227193, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92178935, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.5905373096466064 + }, + { + "auxiliary_loss_clip": 0.01194052, + "auxiliary_loss_mlp": 0.01060045, + "balance_loss_clip": 1.06008601, + "balance_loss_mlp": 1.03520203, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.8147881729142088, + "language_loss": 0.89979112, + "learning_rate": 3.993409734157064e-06, + "loss": 0.92233205, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 5.500051975250244 + }, + { + "auxiliary_loss_clip": 0.01163559, + "auxiliary_loss_mlp": 0.01068556, + "balance_loss_clip": 1.0540216, + "balance_loss_mlp": 1.0424968, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.91547392660418, + "language_loss": 0.80295461, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82527578, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 4.205201625823975 + }, + { + "auxiliary_loss_clip": 0.01125837, + "auxiliary_loss_mlp": 0.01067784, + "balance_loss_clip": 1.04687536, + "balance_loss_mlp": 1.04046142, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9534304172312555, + "language_loss": 0.79891276, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.820849, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.732006311416626 + }, + { + "auxiliary_loss_clip": 0.01190086, + "auxiliary_loss_mlp": 0.010566, + "balance_loss_clip": 1.05309331, + "balance_loss_mlp": 1.0315659, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 2.083994659970972, + "language_loss": 0.89120686, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91367364, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 4.0657360553741455 + }, + { + "auxiliary_loss_clip": 0.01202766, + "auxiliary_loss_mlp": 0.01065461, + "balance_loss_clip": 1.05390894, + "balance_loss_mlp": 1.03950953, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.640329098933693, + "language_loss": 0.87183172, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89451396, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 2.5618929862976074 + }, + { + "auxiliary_loss_clip": 0.01181863, + "auxiliary_loss_mlp": 0.01061907, + "balance_loss_clip": 1.05642498, + "balance_loss_mlp": 1.03680158, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.626386688181919, + "language_loss": 0.66346163, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68589926, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.7510130405426025 + }, + { + "auxiliary_loss_clip": 0.01200967, + "auxiliary_loss_mlp": 0.01068722, + "balance_loss_clip": 1.05752897, + "balance_loss_mlp": 1.04071951, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 5.357892148503818, + "language_loss": 0.71989524, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74259216, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.5985350608825684 + }, + { + "auxiliary_loss_clip": 0.01175708, + "auxiliary_loss_mlp": 0.01066172, + "balance_loss_clip": 1.05213284, + "balance_loss_mlp": 1.03924274, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.5192744477114095, + "language_loss": 0.82710332, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84952211, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.6717922687530518 + }, + { + "auxiliary_loss_clip": 0.01193536, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_clip": 1.0531323, + "balance_loss_mlp": 1.04158878, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.0851510779450426, + "language_loss": 0.78744638, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81005263, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.567423105239868 + }, + { + "auxiliary_loss_clip": 0.01157288, + "auxiliary_loss_mlp": 0.0105703, + "balance_loss_clip": 1.05012941, + "balance_loss_mlp": 1.03039896, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.953717879366296, + "language_loss": 1.02367783, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04582095, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 2.92439866065979 + }, + { + "auxiliary_loss_clip": 0.01149941, + "auxiliary_loss_mlp": 0.01058585, + "balance_loss_clip": 1.04798722, + "balance_loss_mlp": 1.03191745, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.0094455815515615, + "language_loss": 0.81130767, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83339292, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 2.7098703384399414 + }, + { + "auxiliary_loss_clip": 0.01201698, + "auxiliary_loss_mlp": 0.01065952, + "balance_loss_clip": 1.05686975, + "balance_loss_mlp": 1.03834271, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.554252786688519, + "language_loss": 0.73544025, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75811672, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.597735643386841 + }, + { + "auxiliary_loss_clip": 0.01083547, + "auxiliary_loss_mlp": 0.01022608, + "balance_loss_clip": 1.01559031, + "balance_loss_mlp": 1.01817346, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7848356777762073, + "language_loss": 0.59883499, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.61989653, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.1156792640686035 + }, + { + "auxiliary_loss_clip": 0.01198853, + "auxiliary_loss_mlp": 0.01066042, + "balance_loss_clip": 1.05766487, + "balance_loss_mlp": 1.04029274, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.1555551347963173, + "language_loss": 0.94558221, + "learning_rate": 3.992992669166168e-06, + "loss": 0.96823114, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 2.655709981918335 + }, + { + "auxiliary_loss_clip": 0.01162752, + "auxiliary_loss_mlp": 0.01070029, + "balance_loss_clip": 1.05007935, + "balance_loss_mlp": 1.04103792, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1609593891681813, + "language_loss": 0.72019964, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74252748, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.7252726554870605 + }, + { + "auxiliary_loss_clip": 0.0118412, + "auxiliary_loss_mlp": 0.01063194, + "balance_loss_clip": 1.05655551, + "balance_loss_mlp": 1.03594255, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.557854563544276, + "language_loss": 0.85466564, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87713879, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 2.7300832271575928 + }, + { + "auxiliary_loss_clip": 0.01193728, + "auxiliary_loss_mlp": 0.00752737, + "balance_loss_clip": 1.05479252, + "balance_loss_mlp": 1.00056076, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.873021677691888, + "language_loss": 0.83711421, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85657883, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 2.66131591796875 + }, + { + "auxiliary_loss_clip": 0.01195123, + "auxiliary_loss_mlp": 0.01074621, + "balance_loss_clip": 1.05802143, + "balance_loss_mlp": 1.0466783, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 1.8856619762251026, + "language_loss": 0.73840129, + "learning_rate": 3.992861771819365e-06, + "loss": 0.76109874, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 2.588533639907837 + }, + { + "auxiliary_loss_clip": 0.01141199, + "auxiliary_loss_mlp": 0.01068613, + "balance_loss_clip": 1.04609966, + "balance_loss_mlp": 1.04175544, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.4606378480049758, + "language_loss": 0.86492503, + "learning_rate": 3.99282885855576e-06, + "loss": 0.88702321, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.6819722652435303 + }, + { + "auxiliary_loss_clip": 0.01152777, + "auxiliary_loss_mlp": 0.01067481, + "balance_loss_clip": 1.05183601, + "balance_loss_mlp": 1.04217291, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.171030401267924, + "language_loss": 0.80456126, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82676387, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 2.704094648361206 + }, + { + "auxiliary_loss_clip": 0.01079888, + "auxiliary_loss_mlp": 0.01022503, + "balance_loss_clip": 1.01460409, + "balance_loss_mlp": 1.0184263, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8243947384201008, + "language_loss": 0.69160748, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71263134, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 3.052823066711426 + }, + { + "auxiliary_loss_clip": 0.01206952, + "auxiliary_loss_mlp": 0.01063749, + "balance_loss_clip": 1.05525625, + "balance_loss_mlp": 1.03810704, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 1.8581720466861453, + "language_loss": 0.75958121, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78228831, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 2.561613082885742 + }, + { + "auxiliary_loss_clip": 0.01067694, + "auxiliary_loss_mlp": 0.01008637, + "balance_loss_clip": 1.01206279, + "balance_loss_mlp": 1.00427389, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8585972139388334, + "language_loss": 0.64382541, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66458869, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 3.1107687950134277 + }, + { + "auxiliary_loss_clip": 0.01155587, + "auxiliary_loss_mlp": 0.01068381, + "balance_loss_clip": 1.0482378, + "balance_loss_mlp": 1.04076076, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 4.428729426281594, + "language_loss": 0.79379427, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81603396, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 2.7740960121154785 + }, + { + "auxiliary_loss_clip": 0.01160063, + "auxiliary_loss_mlp": 0.0106806, + "balance_loss_clip": 1.04742789, + "balance_loss_mlp": 1.04245424, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.8970942887963222, + "language_loss": 0.74141812, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76369929, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 2.704179525375366 + }, + { + "auxiliary_loss_clip": 0.01191735, + "auxiliary_loss_mlp": 0.01062984, + "balance_loss_clip": 1.05546892, + "balance_loss_mlp": 1.03654361, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 2.6041248194772804, + "language_loss": 0.70721614, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72976339, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.7676806449890137 + }, + { + "auxiliary_loss_clip": 0.01119675, + "auxiliary_loss_mlp": 0.0106149, + "balance_loss_clip": 1.0464468, + "balance_loss_mlp": 1.03483522, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 1.9646174597577537, + "language_loss": 0.80838919, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83020079, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.7753944396972656 + }, + { + "auxiliary_loss_clip": 0.01174358, + "auxiliary_loss_mlp": 0.01065878, + "balance_loss_clip": 1.04931414, + "balance_loss_mlp": 1.0382452, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.158079865450183, + "language_loss": 0.88231516, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.90471745, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.5681605339050293 + }, + { + "auxiliary_loss_clip": 0.01192385, + "auxiliary_loss_mlp": 0.01060247, + "balance_loss_clip": 1.05792522, + "balance_loss_mlp": 1.03465271, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.431423989510667, + "language_loss": 0.754722, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77724832, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 2.5353026390075684 + }, + { + "auxiliary_loss_clip": 0.01198545, + "auxiliary_loss_mlp": 0.01064502, + "balance_loss_clip": 1.0574801, + "balance_loss_mlp": 1.03952789, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.6002170571867573, + "language_loss": 0.7955544, + "learning_rate": 3.992461825426906e-06, + "loss": 0.81818485, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.601437568664551 + }, + { + "auxiliary_loss_clip": 0.01188336, + "auxiliary_loss_mlp": 0.0105933, + "balance_loss_clip": 1.05383408, + "balance_loss_mlp": 1.03334248, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.811722410944769, + "language_loss": 0.82452929, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84700602, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.52274489402771 + }, + { + "auxiliary_loss_clip": 0.0120864, + "auxiliary_loss_mlp": 0.01060068, + "balance_loss_clip": 1.05570543, + "balance_loss_mlp": 1.03298342, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.1769888098877073, + "language_loss": 0.79211557, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81480265, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.527374029159546 + }, + { + "auxiliary_loss_clip": 0.01163133, + "auxiliary_loss_mlp": 0.01069232, + "balance_loss_clip": 1.05341315, + "balance_loss_mlp": 1.04244542, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 2.6304544288919836, + "language_loss": 0.85662746, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.87895107, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.5875353813171387 + }, + { + "auxiliary_loss_clip": 0.01204582, + "auxiliary_loss_mlp": 0.01071296, + "balance_loss_clip": 1.05580926, + "balance_loss_mlp": 1.04297233, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 2.251078797077791, + "language_loss": 0.87214202, + "learning_rate": 3.992326092115019e-06, + "loss": 0.8949008, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.5310957431793213 + }, + { + "auxiliary_loss_clip": 0.01185087, + "auxiliary_loss_mlp": 0.01072087, + "balance_loss_clip": 1.05445313, + "balance_loss_mlp": 1.04681492, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.214636119921201, + "language_loss": 0.79322243, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81579411, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.624356508255005 + }, + { + "auxiliary_loss_clip": 0.01172669, + "auxiliary_loss_mlp": 0.01079134, + "balance_loss_clip": 1.0514487, + "balance_loss_mlp": 1.05280066, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.0919298821043038, + "language_loss": 0.82398206, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84650004, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.764803171157837 + }, + { + "auxiliary_loss_clip": 0.01151701, + "auxiliary_loss_mlp": 0.01062359, + "balance_loss_clip": 1.04525018, + "balance_loss_mlp": 1.03354609, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.59017444805129, + "language_loss": 0.8670513, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88919187, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 2.638928174972534 + }, + { + "auxiliary_loss_clip": 0.01181657, + "auxiliary_loss_mlp": 0.01061496, + "balance_loss_clip": 1.04997206, + "balance_loss_mlp": 1.03150296, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 2.651906934860361, + "language_loss": 0.7901423, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81257379, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.6624934673309326 + }, + { + "auxiliary_loss_clip": 0.01176869, + "auxiliary_loss_mlp": 0.01069845, + "balance_loss_clip": 1.05428898, + "balance_loss_mlp": 1.04208088, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.1455462759005903, + "language_loss": 0.86790347, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89037061, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.8030624389648438 + }, + { + "auxiliary_loss_clip": 0.0120209, + "auxiliary_loss_mlp": 0.01065773, + "balance_loss_clip": 1.05760002, + "balance_loss_mlp": 1.03938007, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.2881351665020118, + "language_loss": 0.8813448, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90402341, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 4.1264753341674805 + }, + { + "auxiliary_loss_clip": 0.01165846, + "auxiliary_loss_mlp": 0.0106658, + "balance_loss_clip": 1.04740322, + "balance_loss_mlp": 1.03968644, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 2.032506314347756, + "language_loss": 0.89646149, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91878569, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 4.049628496170044 + }, + { + "auxiliary_loss_clip": 0.01157345, + "auxiliary_loss_mlp": 0.0105851, + "balance_loss_clip": 1.0535295, + "balance_loss_mlp": 1.03102088, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 2.0608444389346787, + "language_loss": 0.75422072, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77637929, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 4.235389709472656 + }, + { + "auxiliary_loss_clip": 0.01164495, + "auxiliary_loss_mlp": 0.01075056, + "balance_loss_clip": 1.05021131, + "balance_loss_mlp": 1.04464531, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8901511878142627, + "language_loss": 0.80120468, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82360017, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 2.685004949569702 + }, + { + "auxiliary_loss_clip": 0.01175658, + "auxiliary_loss_mlp": 0.01061342, + "balance_loss_clip": 1.05144453, + "balance_loss_mlp": 1.03508019, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.5566666274015173, + "language_loss": 0.88310426, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90547419, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 2.692375898361206 + }, + { + "auxiliary_loss_clip": 0.01161443, + "auxiliary_loss_mlp": 0.01061959, + "balance_loss_clip": 1.05197954, + "balance_loss_mlp": 1.03550661, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.076459776758619, + "language_loss": 0.78706455, + "learning_rate": 3.991946592948529e-06, + "loss": 0.80929857, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 4.219623804092407 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01070783, + "balance_loss_clip": 1.04909539, + "balance_loss_mlp": 1.04099286, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 1.9259461433406098, + "language_loss": 0.93048131, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95241469, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.7189483642578125 + }, + { + "auxiliary_loss_clip": 0.0116952, + "auxiliary_loss_mlp": 0.01071579, + "balance_loss_clip": 1.05071414, + "balance_loss_mlp": 1.04265893, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.2123959230560124, + "language_loss": 0.68387759, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70628858, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.739684581756592 + }, + { + "auxiliary_loss_clip": 0.01144377, + "auxiliary_loss_mlp": 0.01069004, + "balance_loss_clip": 1.04752612, + "balance_loss_mlp": 1.04296875, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.462097806951931, + "language_loss": 0.88525063, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90738446, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.70977783203125 + }, + { + "auxiliary_loss_clip": 0.01169764, + "auxiliary_loss_mlp": 0.01061222, + "balance_loss_clip": 1.05548453, + "balance_loss_mlp": 1.03327954, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.2642071960140338, + "language_loss": 0.85196817, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87427807, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 2.7352750301361084 + }, + { + "auxiliary_loss_clip": 0.01184607, + "auxiliary_loss_mlp": 0.01067579, + "balance_loss_clip": 1.05758238, + "balance_loss_mlp": 1.03923142, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 12.267913131582485, + "language_loss": 0.78844714, + "learning_rate": 3.99177107182976e-06, + "loss": 0.810969, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.635113000869751 + }, + { + "auxiliary_loss_clip": 0.0114066, + "auxiliary_loss_mlp": 0.01073186, + "balance_loss_clip": 1.04756629, + "balance_loss_mlp": 1.04567266, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 1.9800805933020829, + "language_loss": 0.81448984, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83662832, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 2.6412360668182373 + }, + { + "auxiliary_loss_clip": 0.01190126, + "auxiliary_loss_mlp": 0.01063412, + "balance_loss_clip": 1.05486584, + "balance_loss_mlp": 1.03800821, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 2.112656306737491, + "language_loss": 0.76659238, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78912771, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 2.5908849239349365 + }, + { + "auxiliary_loss_clip": 0.01081461, + "auxiliary_loss_mlp": 0.01017576, + "balance_loss_clip": 1.02310908, + "balance_loss_mlp": 1.01295078, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.8186296966883373, + "language_loss": 0.57270956, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59369993, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 3.1084494590759277 + }, + { + "auxiliary_loss_clip": 0.01179933, + "auxiliary_loss_mlp": 0.01063782, + "balance_loss_clip": 1.05475903, + "balance_loss_mlp": 1.03481388, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.2477071918322746, + "language_loss": 0.82751226, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84994942, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 2.5888030529022217 + }, + { + "auxiliary_loss_clip": 0.01199599, + "auxiliary_loss_mlp": 0.00752939, + "balance_loss_clip": 1.05964279, + "balance_loss_mlp": 1.00079083, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.1507895724866763, + "language_loss": 0.77946925, + "learning_rate": 3.991593662507167e-06, + "loss": 0.79899466, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 2.6369316577911377 + }, + { + "auxiliary_loss_clip": 0.01172896, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_clip": 1.0527854, + "balance_loss_mlp": 1.03793156, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.492694929623179, + "language_loss": 0.91923499, + "learning_rate": 3.991557954072958e-06, + "loss": 0.9416312, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 2.7760443687438965 + }, + { + "auxiliary_loss_clip": 0.01169141, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.04862595, + "balance_loss_mlp": 1.03393197, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 1.7786897212903379, + "language_loss": 0.85970777, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88200402, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 2.645322799682617 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01067471, + "balance_loss_clip": 1.05372918, + "balance_loss_mlp": 1.04169834, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.084240492187768, + "language_loss": 0.8717193, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89400697, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 2.6643364429473877 + }, + { + "auxiliary_loss_clip": 0.01195931, + "auxiliary_loss_mlp": 0.00753044, + "balance_loss_clip": 1.05516696, + "balance_loss_mlp": 1.00088835, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 2.042074943254009, + "language_loss": 0.74939173, + "learning_rate": 3.991450375655301e-06, + "loss": 0.76888144, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.5572052001953125 + }, + { + "auxiliary_loss_clip": 0.01189861, + "auxiliary_loss_mlp": 0.00752837, + "balance_loss_clip": 1.05628037, + "balance_loss_mlp": 1.00079715, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.6390750959173432, + "language_loss": 0.768803, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78823, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 2.7455008029937744 + }, + { + "auxiliary_loss_clip": 0.01206812, + "auxiliary_loss_mlp": 0.01069763, + "balance_loss_clip": 1.05659819, + "balance_loss_mlp": 1.04303658, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.080054598728305, + "language_loss": 0.76837826, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79114401, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 2.5690295696258545 + }, + { + "auxiliary_loss_clip": 0.01163413, + "auxiliary_loss_mlp": 0.01073668, + "balance_loss_clip": 1.04956841, + "balance_loss_mlp": 1.04491496, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.9893732686547012, + "language_loss": 0.87423939, + "learning_rate": 3.991342117593679e-06, + "loss": 0.8966102, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 2.72697114944458 + }, + { + "auxiliary_loss_clip": 0.01175281, + "auxiliary_loss_mlp": 0.01064924, + "balance_loss_clip": 1.05585861, + "balance_loss_mlp": 1.03791142, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.5298616581438755, + "language_loss": 0.7926687, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81507075, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.6409990787506104 + }, + { + "auxiliary_loss_clip": 0.01122925, + "auxiliary_loss_mlp": 0.01079735, + "balance_loss_clip": 1.05143404, + "balance_loss_mlp": 1.05071914, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.331823409642334, + "language_loss": 0.80676931, + "learning_rate": 3.991269567990855e-06, + "loss": 0.82879591, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 3.1090087890625 + }, + { + "auxiliary_loss_clip": 0.01055769, + "auxiliary_loss_mlp": 0.01009985, + "balance_loss_clip": 1.01127398, + "balance_loss_mlp": 1.0053122, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.9314986850198186, + "language_loss": 0.59025991, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61091745, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 3.549330234527588 + }, + { + "auxiliary_loss_clip": 0.01203459, + "auxiliary_loss_mlp": 0.0107189, + "balance_loss_clip": 1.0583781, + "balance_loss_mlp": 1.04381657, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.764825266543242, + "language_loss": 0.86452806, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88728154, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.5369009971618652 + }, + { + "auxiliary_loss_clip": 0.0118093, + "auxiliary_loss_mlp": 0.01063257, + "balance_loss_clip": 1.05550861, + "balance_loss_mlp": 1.03874755, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.080323409229732, + "language_loss": 0.79810584, + "learning_rate": 3.991160177271513e-06, + "loss": 0.8205477, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.671111822128296 + }, + { + "auxiliary_loss_clip": 0.01179884, + "auxiliary_loss_mlp": 0.01067176, + "balance_loss_clip": 1.05469346, + "balance_loss_mlp": 1.03978205, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.067281885154853, + "language_loss": 0.84280837, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86527896, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.661471128463745 + }, + { + "auxiliary_loss_clip": 0.01188114, + "auxiliary_loss_mlp": 0.01072779, + "balance_loss_clip": 1.05406237, + "balance_loss_mlp": 1.04663682, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8751198114043144, + "language_loss": 0.84961212, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.87222099, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 2.625871181488037 + }, + { + "auxiliary_loss_clip": 0.01177862, + "auxiliary_loss_mlp": 0.01057291, + "balance_loss_clip": 1.05369174, + "balance_loss_mlp": 1.03234053, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.124961504392792, + "language_loss": 0.77546853, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79782003, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 2.612701892852783 + }, + { + "auxiliary_loss_clip": 0.01132836, + "auxiliary_loss_mlp": 0.01070551, + "balance_loss_clip": 1.0490067, + "balance_loss_mlp": 1.0435735, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 1.9403149016050636, + "language_loss": 0.90634304, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92837691, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 2.6943891048431396 + }, + { + "auxiliary_loss_clip": 0.01189639, + "auxiliary_loss_mlp": 0.01065019, + "balance_loss_clip": 1.05128849, + "balance_loss_mlp": 1.03595519, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.1647187532453365, + "language_loss": 0.76038325, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78292978, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.7403006553649902 + }, + { + "auxiliary_loss_clip": 0.01190384, + "auxiliary_loss_mlp": 0.01061397, + "balance_loss_clip": 1.05385602, + "balance_loss_mlp": 1.03494453, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 1.8583888068769223, + "language_loss": 0.71845245, + "learning_rate": 3.990939357235621e-06, + "loss": 0.74097025, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.74969744682312 + }, + { + "auxiliary_loss_clip": 0.01041469, + "auxiliary_loss_mlp": 0.01004117, + "balance_loss_clip": 1.0131228, + "balance_loss_mlp": 0.99989742, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9277892102476163, + "language_loss": 0.71108872, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73154461, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.0538322925567627 + }, + { + "auxiliary_loss_clip": 0.01162464, + "auxiliary_loss_mlp": 0.0107872, + "balance_loss_clip": 1.05306351, + "balance_loss_mlp": 1.04867899, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 2.176864746601067, + "language_loss": 0.78264672, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80505854, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.722571849822998 + }, + { + "auxiliary_loss_clip": 0.01186413, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.05550826, + "balance_loss_mlp": 1.03098249, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.086988349089924, + "language_loss": 0.86139315, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88384008, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.7165303230285645 + }, + { + "auxiliary_loss_clip": 0.01205339, + "auxiliary_loss_mlp": 0.01072435, + "balance_loss_clip": 1.05520296, + "balance_loss_mlp": 1.045959, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 1.9493971070169274, + "language_loss": 0.76650101, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.78927875, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 2.583408832550049 + }, + { + "auxiliary_loss_clip": 0.01133053, + "auxiliary_loss_mlp": 0.01065298, + "balance_loss_clip": 1.05057657, + "balance_loss_mlp": 1.04014456, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.8040214562193704, + "language_loss": 0.74874592, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77072936, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.7333004474639893 + }, + { + "auxiliary_loss_clip": 0.01139029, + "auxiliary_loss_mlp": 0.01078258, + "balance_loss_clip": 1.048419, + "balance_loss_mlp": 1.04920673, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 2.3848158569942295, + "language_loss": 0.79027486, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81244767, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.709474802017212 + }, + { + "auxiliary_loss_clip": 0.01202928, + "auxiliary_loss_mlp": 0.01082381, + "balance_loss_clip": 1.05578637, + "balance_loss_mlp": 1.05543995, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.7874728288960666, + "language_loss": 0.80144757, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82430065, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 4.1764936447143555 + }, + { + "auxiliary_loss_clip": 0.01155223, + "auxiliary_loss_mlp": 0.01065673, + "balance_loss_clip": 1.05059743, + "balance_loss_mlp": 1.03850496, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.0490889371097083, + "language_loss": 0.87108052, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89328951, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.6772289276123047 + }, + { + "auxiliary_loss_clip": 0.01157173, + "auxiliary_loss_mlp": 0.01078391, + "balance_loss_clip": 1.04849672, + "balance_loss_mlp": 1.04753971, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 2.9464749525051417, + "language_loss": 0.87877309, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90112877, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 5.707984209060669 + }, + { + "auxiliary_loss_clip": 0.01056575, + "auxiliary_loss_mlp": 0.01003597, + "balance_loss_clip": 1.01170993, + "balance_loss_mlp": 0.99944842, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.0199172682012798, + "language_loss": 0.75402552, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77462727, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 3.225921154022217 + }, + { + "auxiliary_loss_clip": 0.01152105, + "auxiliary_loss_mlp": 0.0107339, + "balance_loss_clip": 1.05090415, + "balance_loss_mlp": 1.04592419, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 2.603370034591926, + "language_loss": 0.75759673, + "learning_rate": 3.990527461790013e-06, + "loss": 0.77985168, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 4.249046325683594 + }, + { + "auxiliary_loss_clip": 0.01181972, + "auxiliary_loss_mlp": 0.01063675, + "balance_loss_clip": 1.04968739, + "balance_loss_mlp": 1.03600609, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.835287643090905, + "language_loss": 0.82909262, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85154915, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.6427319049835205 + }, + { + "auxiliary_loss_clip": 0.01166035, + "auxiliary_loss_mlp": 0.01065472, + "balance_loss_clip": 1.04999542, + "balance_loss_mlp": 1.03748131, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 2.225933408728123, + "language_loss": 0.86086863, + "learning_rate": 3.990451590400309e-06, + "loss": 0.88318372, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.6687428951263428 + }, + { + "auxiliary_loss_clip": 0.01178037, + "auxiliary_loss_mlp": 0.01066277, + "balance_loss_clip": 1.05413413, + "balance_loss_mlp": 1.03988373, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.050106227020421, + "language_loss": 0.74008316, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76252627, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 2.637087821960449 + }, + { + "auxiliary_loss_clip": 0.01201971, + "auxiliary_loss_mlp": 0.010633, + "balance_loss_clip": 1.05559802, + "balance_loss_mlp": 1.03739595, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.064446232212607, + "language_loss": 0.7562809, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77893358, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.5407116413116455 + }, + { + "auxiliary_loss_clip": 0.01174424, + "auxiliary_loss_mlp": 0.01071057, + "balance_loss_clip": 1.05373192, + "balance_loss_mlp": 1.04342461, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.0855521725207296, + "language_loss": 0.70108199, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72353685, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.6228787899017334 + }, + { + "auxiliary_loss_clip": 0.01200953, + "auxiliary_loss_mlp": 0.01076228, + "balance_loss_clip": 1.05962563, + "balance_loss_mlp": 1.04987085, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.933807373100518, + "language_loss": 0.83648187, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85925364, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 2.5582144260406494 + }, + { + "auxiliary_loss_clip": 0.01064868, + "auxiliary_loss_mlp": 0.01002872, + "balance_loss_clip": 1.01204276, + "balance_loss_mlp": 0.99884284, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.9293009217332526, + "language_loss": 0.59007192, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.6107493, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 3.1855852603912354 + }, + { + "auxiliary_loss_clip": 0.01165473, + "auxiliary_loss_mlp": 0.01058728, + "balance_loss_clip": 1.04686141, + "balance_loss_mlp": 1.03219175, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 1.7841996880378812, + "language_loss": 0.74150193, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76374388, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 2.754744052886963 + }, + { + "auxiliary_loss_clip": 0.01182199, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.0543189, + "balance_loss_mlp": 1.03429198, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.7492328790664209, + "language_loss": 0.8104552, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83289731, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 2.6730737686157227 + }, + { + "auxiliary_loss_clip": 0.0116142, + "auxiliary_loss_mlp": 0.01058627, + "balance_loss_clip": 1.05376446, + "balance_loss_mlp": 1.03202009, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.8601382741609729, + "language_loss": 0.78311199, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80531245, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 2.6508655548095703 + }, + { + "auxiliary_loss_clip": 0.01177063, + "auxiliary_loss_mlp": 0.01058995, + "balance_loss_clip": 1.05372727, + "balance_loss_mlp": 1.0338304, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.1943523445329394, + "language_loss": 0.92939961, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95176017, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.583918809890747 + }, + { + "auxiliary_loss_clip": 0.01130792, + "auxiliary_loss_mlp": 0.00753074, + "balance_loss_clip": 1.04622841, + "balance_loss_mlp": 1.00105023, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 2.6346304015237867, + "language_loss": 0.71815789, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.73699653, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 3.0456113815307617 + }, + { + "auxiliary_loss_clip": 0.01176708, + "auxiliary_loss_mlp": 0.01075569, + "balance_loss_clip": 1.05228245, + "balance_loss_mlp": 1.04781699, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 3.135401706273943, + "language_loss": 0.87589657, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89841938, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.9194514751434326 + }, + { + "auxiliary_loss_clip": 0.01167578, + "auxiliary_loss_mlp": 0.01064139, + "balance_loss_clip": 1.04935956, + "balance_loss_mlp": 1.03881919, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 2.4993216173277597, + "language_loss": 0.76894909, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79126626, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 2.686624526977539 + }, + { + "auxiliary_loss_clip": 0.01192167, + "auxiliary_loss_mlp": 0.00752915, + "balance_loss_clip": 1.05615091, + "balance_loss_mlp": 1.0010128, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 16.727419842682764, + "language_loss": 0.85672551, + "learning_rate": 3.98995106776885e-06, + "loss": 0.8761763, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 2.617886781692505 + }, + { + "auxiliary_loss_clip": 0.01200462, + "auxiliary_loss_mlp": 0.01072594, + "balance_loss_clip": 1.05754209, + "balance_loss_mlp": 1.04367399, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 3.1938764993164903, + "language_loss": 0.72955561, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75228614, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 2.6446616649627686 + }, + { + "auxiliary_loss_clip": 0.01171033, + "auxiliary_loss_mlp": 0.01070675, + "balance_loss_clip": 1.05191064, + "balance_loss_mlp": 1.04441309, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.6173982063223922, + "language_loss": 0.79382879, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81624585, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 2.7032768726348877 + }, + { + "auxiliary_loss_clip": 0.01155354, + "auxiliary_loss_mlp": 0.01054817, + "balance_loss_clip": 1.05148363, + "balance_loss_mlp": 1.02963984, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 2.2633912193687267, + "language_loss": 0.76084185, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78294355, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 2.7820286750793457 + }, + { + "auxiliary_loss_clip": 0.01167065, + "auxiliary_loss_mlp": 0.01072677, + "balance_loss_clip": 1.05446506, + "balance_loss_mlp": 1.04584265, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 2.02164630986771, + "language_loss": 0.85668766, + "learning_rate": 3.989794495044685e-06, + "loss": 0.87908506, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.6272215843200684 + }, + { + "auxiliary_loss_clip": 0.01155226, + "auxiliary_loss_mlp": 0.01078488, + "balance_loss_clip": 1.05285478, + "balance_loss_mlp": 1.05050969, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 9.328538197575705, + "language_loss": 0.77612954, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79846668, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 2.661100149154663 + }, + { + "auxiliary_loss_clip": 0.01146328, + "auxiliary_loss_mlp": 0.01061248, + "balance_loss_clip": 1.04827094, + "balance_loss_mlp": 1.03433061, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 2.388332759010989, + "language_loss": 0.84172779, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86380357, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.8420121669769287 + }, + { + "auxiliary_loss_clip": 0.01179869, + "auxiliary_loss_mlp": 0.01060088, + "balance_loss_clip": 1.05465412, + "balance_loss_mlp": 1.03270566, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.0750570277568636, + "language_loss": 0.79528344, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81768292, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.7995858192443848 + }, + { + "auxiliary_loss_clip": 0.01167083, + "auxiliary_loss_mlp": 0.01061995, + "balance_loss_clip": 1.04984534, + "balance_loss_mlp": 1.03908277, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.1396038381377562, + "language_loss": 0.88102114, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.90331197, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.6701860427856445 + }, + { + "auxiliary_loss_clip": 0.01180365, + "auxiliary_loss_mlp": 0.01068506, + "balance_loss_clip": 1.05750608, + "balance_loss_mlp": 1.04243469, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.7730367135575296, + "language_loss": 0.82979202, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85228074, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 2.719599723815918 + }, + { + "auxiliary_loss_clip": 0.0107745, + "auxiliary_loss_mlp": 0.01015852, + "balance_loss_clip": 1.01814461, + "balance_loss_mlp": 1.01218057, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8966779261822796, + "language_loss": 0.65033996, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67127299, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.153235673904419 + }, + { + "auxiliary_loss_clip": 0.01145466, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_clip": 1.05090332, + "balance_loss_mlp": 1.04650354, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 2.9711341872737345, + "language_loss": 0.88355422, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90574944, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.6616315841674805 + }, + { + "auxiliary_loss_clip": 0.0117929, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_clip": 1.05306232, + "balance_loss_mlp": 1.04032612, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.901412770399583, + "language_loss": 0.84630901, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86875629, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.6363229751586914 + }, + { + "auxiliary_loss_clip": 0.01136827, + "auxiliary_loss_mlp": 0.01075042, + "balance_loss_clip": 1.04684114, + "balance_loss_mlp": 1.04857779, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.8665010461409943, + "language_loss": 0.82009429, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84221298, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.6159920692443848 + }, + { + "auxiliary_loss_clip": 0.01145662, + "auxiliary_loss_mlp": 0.0106294, + "balance_loss_clip": 1.05288541, + "balance_loss_mlp": 1.03794217, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 3.2163076937121615, + "language_loss": 0.84350365, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86558968, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.615004062652588 + }, + { + "auxiliary_loss_clip": 0.01049794, + "auxiliary_loss_mlp": 0.01006855, + "balance_loss_clip": 1.01417959, + "balance_loss_mlp": 1.00346971, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.934715886626333, + "language_loss": 0.6045838, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62515026, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 2.986149787902832 + }, + { + "auxiliary_loss_clip": 0.01156617, + "auxiliary_loss_mlp": 0.01072601, + "balance_loss_clip": 1.04715562, + "balance_loss_mlp": 1.0458622, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.0027667553745245, + "language_loss": 0.8248145, + "learning_rate": 3.98931753374834e-06, + "loss": 0.8471067, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.718496322631836 + }, + { + "auxiliary_loss_clip": 0.01205581, + "auxiliary_loss_mlp": 0.01074446, + "balance_loss_clip": 1.05721736, + "balance_loss_mlp": 1.04781449, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.3056840516046075, + "language_loss": 0.79562736, + "learning_rate": 3.989277296609237e-06, + "loss": 0.81842768, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 2.540186882019043 + }, + { + "auxiliary_loss_clip": 0.01173857, + "auxiliary_loss_mlp": 0.01072554, + "balance_loss_clip": 1.05516613, + "balance_loss_mlp": 1.04567206, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.7335865398521408, + "language_loss": 0.7730571, + "learning_rate": 3.98923698403654e-06, + "loss": 0.7955212, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 2.623427629470825 + }, + { + "auxiliary_loss_clip": 0.01181488, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.05089211, + "balance_loss_mlp": 1.04186749, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 2.177547437658931, + "language_loss": 0.89271116, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91520792, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 2.632747173309326 + }, + { + "auxiliary_loss_clip": 0.01192608, + "auxiliary_loss_mlp": 0.01059545, + "balance_loss_clip": 1.05406046, + "balance_loss_mlp": 1.03503561, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.651518711142855, + "language_loss": 0.84761739, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87013894, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 2.6326475143432617 + }, + { + "auxiliary_loss_clip": 0.01172656, + "auxiliary_loss_mlp": 0.01063537, + "balance_loss_clip": 1.05571091, + "balance_loss_mlp": 1.03711987, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 2.227260536981979, + "language_loss": 0.80730295, + "learning_rate": 3.989115593732182e-06, + "loss": 0.82966495, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 4.3598809242248535 + }, + { + "auxiliary_loss_clip": 0.01142702, + "auxiliary_loss_mlp": 0.01071045, + "balance_loss_clip": 1.05247164, + "balance_loss_mlp": 1.04293561, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.7512812862098466, + "language_loss": 0.78838575, + "learning_rate": 3.989074979440421e-06, + "loss": 0.81052327, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 4.3368144035339355 + }, + { + "auxiliary_loss_clip": 0.01181229, + "auxiliary_loss_mlp": 0.01070213, + "balance_loss_clip": 1.05322063, + "balance_loss_mlp": 1.04463124, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 3.3529468354466836, + "language_loss": 0.86667514, + "learning_rate": 3.989034289722739e-06, + "loss": 0.8891896, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.616023063659668 + }, + { + "auxiliary_loss_clip": 0.01183745, + "auxiliary_loss_mlp": 0.01056212, + "balance_loss_clip": 1.05223274, + "balance_loss_mlp": 1.02900839, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.48132357149024, + "language_loss": 0.8045724, + "learning_rate": 3.988993524580676e-06, + "loss": 0.82697201, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 4.112295866012573 + }, + { + "auxiliary_loss_clip": 0.01128623, + "auxiliary_loss_mlp": 0.01079856, + "balance_loss_clip": 1.04753709, + "balance_loss_mlp": 1.05086422, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 1.8123814598872954, + "language_loss": 0.85742533, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87951016, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.6851232051849365 + }, + { + "auxiliary_loss_clip": 0.01178368, + "auxiliary_loss_mlp": 0.01071851, + "balance_loss_clip": 1.05350327, + "balance_loss_mlp": 1.04569697, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.8989924685411665, + "language_loss": 0.8065325, + "learning_rate": 3.9889117680296e-06, + "loss": 0.82903469, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 4.23150897026062 + }, + { + "auxiliary_loss_clip": 0.01204055, + "auxiliary_loss_mlp": 0.01067513, + "balance_loss_clip": 1.0589124, + "balance_loss_mlp": 1.04132247, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.37358872253849, + "language_loss": 0.70080304, + "learning_rate": 3.988870776623685e-06, + "loss": 0.72351873, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 2.595461368560791 + }, + { + "auxiliary_loss_clip": 0.01201638, + "auxiliary_loss_mlp": 0.01054882, + "balance_loss_clip": 1.05448961, + "balance_loss_mlp": 1.02828646, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 3.184391080608744, + "language_loss": 0.81229955, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.8348648, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 2.5993740558624268 + }, + { + "auxiliary_loss_clip": 0.01202524, + "auxiliary_loss_mlp": 0.01060316, + "balance_loss_clip": 1.05771399, + "balance_loss_mlp": 1.03505504, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.877014986266713, + "language_loss": 0.76358533, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78621376, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 2.6952452659606934 + }, + { + "auxiliary_loss_clip": 0.01180011, + "auxiliary_loss_mlp": 0.01065624, + "balance_loss_clip": 1.05408072, + "balance_loss_mlp": 1.04055452, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 6.1393869319201535, + "language_loss": 0.92471969, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94717604, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 2.584080219268799 + }, + { + "auxiliary_loss_clip": 0.01180962, + "auxiliary_loss_mlp": 0.01073452, + "balance_loss_clip": 1.05206561, + "balance_loss_mlp": 1.04780972, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 2.064935689467345, + "language_loss": 0.86125946, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88380355, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 2.641648769378662 + }, + { + "auxiliary_loss_clip": 0.01168909, + "auxiliary_loss_mlp": 0.01067211, + "balance_loss_clip": 1.05088854, + "balance_loss_mlp": 1.04211783, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 1.9174276555150715, + "language_loss": 0.78300989, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80537117, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 2.7098546028137207 + }, + { + "auxiliary_loss_clip": 0.01186052, + "auxiliary_loss_mlp": 0.01070927, + "balance_loss_clip": 1.0549866, + "balance_loss_mlp": 1.04613113, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.8784246646180334, + "language_loss": 0.77747393, + "learning_rate": 3.988623244461039e-06, + "loss": 0.80004376, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 2.626983404159546 + }, + { + "auxiliary_loss_clip": 0.01188854, + "auxiliary_loss_mlp": 0.01061164, + "balance_loss_clip": 1.05499196, + "balance_loss_mlp": 1.03576052, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.8714184741452153, + "language_loss": 0.77042913, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79292923, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 2.775979995727539 + }, + { + "auxiliary_loss_clip": 0.01171786, + "auxiliary_loss_mlp": 0.01067476, + "balance_loss_clip": 1.05405736, + "balance_loss_mlp": 1.04129803, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 4.277094455202222, + "language_loss": 0.76978862, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79218125, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 2.6812193393707275 + }, + { + "auxiliary_loss_clip": 0.01185205, + "auxiliary_loss_mlp": 0.01060755, + "balance_loss_clip": 1.0555141, + "balance_loss_mlp": 1.03578055, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.048361236490933, + "language_loss": 0.83109605, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85355568, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.579072952270508 + }, + { + "auxiliary_loss_clip": 0.01199161, + "auxiliary_loss_mlp": 0.01065287, + "balance_loss_clip": 1.05769181, + "balance_loss_mlp": 1.04072976, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 1.9356726048789106, + "language_loss": 0.76942217, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79206669, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.577176570892334 + }, + { + "auxiliary_loss_clip": 0.01176547, + "auxiliary_loss_mlp": 0.01069411, + "balance_loss_clip": 1.05584407, + "balance_loss_mlp": 1.04362607, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.330650316843313, + "language_loss": 0.80054927, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82300889, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.766293525695801 + }, + { + "auxiliary_loss_clip": 0.01200907, + "auxiliary_loss_mlp": 0.0106366, + "balance_loss_clip": 1.05626905, + "balance_loss_mlp": 1.03787494, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 5.695616707293024, + "language_loss": 0.77599716, + "learning_rate": 3.988372997582155e-06, + "loss": 0.79864287, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 2.5764646530151367 + }, + { + "auxiliary_loss_clip": 0.01181274, + "auxiliary_loss_mlp": 0.00752537, + "balance_loss_clip": 1.05733776, + "balance_loss_mlp": 1.0008415, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.9647921401990665, + "language_loss": 0.84957111, + "learning_rate": 3.988331025862195e-06, + "loss": 0.86890924, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.6825056076049805 + }, + { + "auxiliary_loss_clip": 0.01155123, + "auxiliary_loss_mlp": 0.01066689, + "balance_loss_clip": 1.0491581, + "balance_loss_mlp": 1.0413096, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 1.8868809713548236, + "language_loss": 0.85525781, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87747598, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 2.643362045288086 + }, + { + "auxiliary_loss_clip": 0.01151592, + "auxiliary_loss_mlp": 0.01081104, + "balance_loss_clip": 1.05211759, + "balance_loss_mlp": 1.05416298, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 3.099206478848843, + "language_loss": 0.80727756, + "learning_rate": 3.988246856230734e-06, + "loss": 0.82960451, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 2.655360221862793 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.0106763, + "balance_loss_clip": 1.04328084, + "balance_loss_mlp": 1.03996134, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.3202834599111037, + "language_loss": 0.81067538, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83265185, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.921937942504883 + }, + { + "auxiliary_loss_clip": 0.01118136, + "auxiliary_loss_mlp": 0.01063671, + "balance_loss_clip": 1.04321849, + "balance_loss_mlp": 1.03997254, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 1.8439792334552254, + "language_loss": 0.83538485, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85720295, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 2.874163866043091 + }, + { + "auxiliary_loss_clip": 0.01169, + "auxiliary_loss_mlp": 0.01062846, + "balance_loss_clip": 1.05163038, + "balance_loss_mlp": 1.03516507, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 2.1588030570701116, + "language_loss": 0.87665129, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89896977, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.6224329471588135 + }, + { + "auxiliary_loss_clip": 0.01154191, + "auxiliary_loss_mlp": 0.01067531, + "balance_loss_clip": 1.0541507, + "balance_loss_mlp": 1.04151905, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.4344481224061942, + "language_loss": 0.91420555, + "learning_rate": 3.988077612246394e-06, + "loss": 0.93642282, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.633791208267212 + }, + { + "auxiliary_loss_clip": 0.01159051, + "auxiliary_loss_mlp": 0.01065206, + "balance_loss_clip": 1.05004644, + "balance_loss_mlp": 1.03843117, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.8146041113688276, + "language_loss": 0.8720715, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89431405, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.6509106159210205 + }, + { + "auxiliary_loss_clip": 0.01181928, + "auxiliary_loss_mlp": 0.01064509, + "balance_loss_clip": 1.05247235, + "balance_loss_mlp": 1.03664935, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.157933135136825, + "language_loss": 0.7733736, + "learning_rate": 3.987992537919185e-06, + "loss": 0.795838, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 2.7077577114105225 + }, + { + "auxiliary_loss_clip": 0.01161955, + "auxiliary_loss_mlp": 0.01067745, + "balance_loss_clip": 1.05104685, + "balance_loss_mlp": 1.04229367, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.299302479030624, + "language_loss": 0.86772668, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89002365, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.8633670806884766 + }, + { + "auxiliary_loss_clip": 0.01198454, + "auxiliary_loss_mlp": 0.01063584, + "balance_loss_clip": 1.05322409, + "balance_loss_mlp": 1.03705978, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 1.9197546181923386, + "language_loss": 0.80567825, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82829869, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.6967973709106445 + }, + { + "auxiliary_loss_clip": 0.01184084, + "auxiliary_loss_mlp": 0.0106929, + "balance_loss_clip": 1.05149579, + "balance_loss_mlp": 1.04270673, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.2732864693566674, + "language_loss": 0.84308529, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86561906, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.622316598892212 + }, + { + "auxiliary_loss_clip": 0.01156488, + "auxiliary_loss_mlp": 0.01055798, + "balance_loss_clip": 1.05362296, + "balance_loss_mlp": 1.03189707, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.796243894372737, + "language_loss": 0.684407, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70652986, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.826967477798462 + }, + { + "auxiliary_loss_clip": 0.01199233, + "auxiliary_loss_mlp": 0.01074286, + "balance_loss_clip": 1.05708003, + "balance_loss_mlp": 1.04814303, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 1.8875821472265681, + "language_loss": 0.90126503, + "learning_rate": 3.987778532894181e-06, + "loss": 0.9240002, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.4965007305145264 + }, + { + "auxiliary_loss_clip": 0.01173268, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.05216777, + "balance_loss_mlp": 1.04688144, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 1.8380049225156563, + "language_loss": 0.83490986, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85735214, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.641904354095459 + }, + { + "auxiliary_loss_clip": 0.01161369, + "auxiliary_loss_mlp": 0.01061145, + "balance_loss_clip": 1.04978728, + "balance_loss_mlp": 1.03692138, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.3846913574456416, + "language_loss": 0.89402002, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91624516, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.6298205852508545 + }, + { + "auxiliary_loss_clip": 0.0116986, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_clip": 1.0517056, + "balance_loss_mlp": 1.05441499, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.6858322857905907, + "language_loss": 0.95735979, + "learning_rate": 3.987649225345056e-06, + "loss": 0.97986305, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.6062984466552734 + }, + { + "auxiliary_loss_clip": 0.01122389, + "auxiliary_loss_mlp": 0.01058798, + "balance_loss_clip": 1.04748774, + "balance_loss_mlp": 1.03291726, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 3.072809860692461, + "language_loss": 0.88179433, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90360618, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 2.714777708053589 + }, + { + "auxiliary_loss_clip": 0.01133216, + "auxiliary_loss_mlp": 0.0105422, + "balance_loss_clip": 1.04453182, + "balance_loss_mlp": 1.02985358, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6151683799816388, + "language_loss": 0.76074803, + "learning_rate": 3.987562643450292e-06, + "loss": 0.7826224, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.678239345550537 + }, + { + "auxiliary_loss_clip": 0.01155268, + "auxiliary_loss_mlp": 0.01070031, + "balance_loss_clip": 1.05130303, + "balance_loss_mlp": 1.04281592, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.9581047045380218, + "language_loss": 0.80778754, + "learning_rate": 3.987519239449226e-06, + "loss": 0.83004057, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.6747915744781494 + }, + { + "auxiliary_loss_clip": 0.01178817, + "auxiliary_loss_mlp": 0.01057033, + "balance_loss_clip": 1.05279696, + "balance_loss_mlp": 1.03321552, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.915232337438118, + "language_loss": 0.80620867, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82856715, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 2.639620065689087 + }, + { + "auxiliary_loss_clip": 0.0115443, + "auxiliary_loss_mlp": 0.01067948, + "balance_loss_clip": 1.04934144, + "balance_loss_mlp": 1.04256868, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 4.371161390464407, + "language_loss": 0.79544461, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81766844, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 4.140271186828613 + }, + { + "auxiliary_loss_clip": 0.0115195, + "auxiliary_loss_mlp": 0.01059515, + "balance_loss_clip": 1.04930043, + "balance_loss_mlp": 1.03605461, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.3288393209857143, + "language_loss": 0.8809396, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90305424, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.582277297973633 + }, + { + "auxiliary_loss_clip": 0.01170925, + "auxiliary_loss_mlp": 0.0105623, + "balance_loss_clip": 1.04913664, + "balance_loss_mlp": 1.03248358, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 3.0050542190351854, + "language_loss": 0.80534959, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82762116, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 5.6431660652160645 + }, + { + "auxiliary_loss_clip": 0.01183735, + "auxiliary_loss_mlp": 0.0106631, + "balance_loss_clip": 1.0534817, + "balance_loss_mlp": 1.0405848, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.5375814417452114, + "language_loss": 0.91710097, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93960142, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.678119421005249 + }, + { + "auxiliary_loss_clip": 0.0120569, + "auxiliary_loss_mlp": 0.01058005, + "balance_loss_clip": 1.05679512, + "balance_loss_mlp": 1.03314972, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.3160730238270726, + "language_loss": 0.78774261, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81037956, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 2.5821433067321777 + }, + { + "auxiliary_loss_clip": 0.01138877, + "auxiliary_loss_mlp": 0.01063446, + "balance_loss_clip": 1.04640222, + "balance_loss_mlp": 1.03815007, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.9753308311667586, + "language_loss": 0.69787657, + "learning_rate": 3.987213301260294e-06, + "loss": 0.71989977, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 4.145960807800293 + }, + { + "auxiliary_loss_clip": 0.01162828, + "auxiliary_loss_mlp": 0.01070753, + "balance_loss_clip": 1.05790555, + "balance_loss_mlp": 1.04376459, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.7868968428314689, + "language_loss": 0.72163725, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74397314, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 2.7046492099761963 + }, + { + "auxiliary_loss_clip": 0.01115278, + "auxiliary_loss_mlp": 0.01064237, + "balance_loss_clip": 1.04147196, + "balance_loss_mlp": 1.03815448, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.6551413443442646, + "language_loss": 0.84614027, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86793542, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 2.6700878143310547 + }, + { + "auxiliary_loss_clip": 0.01182554, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_clip": 1.05124521, + "balance_loss_mlp": 1.03834927, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.6088045247938383, + "language_loss": 0.83028173, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85275304, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 2.5887222290039062 + }, + { + "auxiliary_loss_clip": 0.01150222, + "auxiliary_loss_mlp": 0.0106345, + "balance_loss_clip": 1.05018735, + "balance_loss_mlp": 1.03728354, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.849398311974718, + "language_loss": 0.79707092, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81920767, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 2.656703472137451 + }, + { + "auxiliary_loss_clip": 0.01151452, + "auxiliary_loss_mlp": 0.01064406, + "balance_loss_clip": 1.04774022, + "balance_loss_mlp": 1.03938413, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.0521815437067272, + "language_loss": 0.66392517, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68608373, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 2.713205099105835 + }, + { + "auxiliary_loss_clip": 0.01150782, + "auxiliary_loss_mlp": 0.01069597, + "balance_loss_clip": 1.04835129, + "balance_loss_mlp": 1.04521871, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.923219794910012, + "language_loss": 0.76825458, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79045832, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 2.6217124462127686 + }, + { + "auxiliary_loss_clip": 0.01181842, + "auxiliary_loss_mlp": 0.01062143, + "balance_loss_clip": 1.05146337, + "balance_loss_mlp": 1.0371325, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.5312337031286867, + "language_loss": 0.85397887, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87641871, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 2.573380947113037 + }, + { + "auxiliary_loss_clip": 0.01171365, + "auxiliary_loss_mlp": 0.01062667, + "balance_loss_clip": 1.05387688, + "balance_loss_mlp": 1.0388732, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.8583039901575475, + "language_loss": 0.77965266, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80199301, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.718027353286743 + }, + { + "auxiliary_loss_clip": 0.01177915, + "auxiliary_loss_mlp": 0.01066593, + "balance_loss_clip": 1.0545392, + "balance_loss_mlp": 1.04324031, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 2.0371107701500017, + "language_loss": 0.71149087, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73393601, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.62567138671875 + }, + { + "auxiliary_loss_clip": 0.01154777, + "auxiliary_loss_mlp": 0.00752729, + "balance_loss_clip": 1.05266714, + "balance_loss_mlp": 1.00094604, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.9441030162249422, + "language_loss": 0.85516644, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87424147, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.6799843311309814 + }, + { + "auxiliary_loss_clip": 0.01199452, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.05645216, + "balance_loss_mlp": 1.03427148, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 1.7103450963405136, + "language_loss": 0.7189213, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74150032, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.596745729446411 + }, + { + "auxiliary_loss_clip": 0.01105985, + "auxiliary_loss_mlp": 0.01064593, + "balance_loss_clip": 1.04535949, + "balance_loss_mlp": 1.0400362, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.211916110232254, + "language_loss": 0.82455033, + "learning_rate": 3.986680245605936e-06, + "loss": 0.84625614, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 2.9013328552246094 + }, + { + "auxiliary_loss_clip": 0.01200895, + "auxiliary_loss_mlp": 0.01064636, + "balance_loss_clip": 1.05389237, + "balance_loss_mlp": 1.03745639, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 2.063849139180065, + "language_loss": 0.70887589, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73153114, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 2.7198448181152344 + }, + { + "auxiliary_loss_clip": 0.01172261, + "auxiliary_loss_mlp": 0.01059745, + "balance_loss_clip": 1.05429661, + "balance_loss_mlp": 1.03437781, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.7053568723899821, + "language_loss": 0.87874705, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90106714, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.6630196571350098 + }, + { + "auxiliary_loss_clip": 0.01174841, + "auxiliary_loss_mlp": 0.01060075, + "balance_loss_clip": 1.05237603, + "balance_loss_mlp": 1.03387284, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.5645662968772536, + "language_loss": 0.81227839, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83462751, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 2.647977828979492 + }, + { + "auxiliary_loss_clip": 0.01152487, + "auxiliary_loss_mlp": 0.01057818, + "balance_loss_clip": 1.05283558, + "balance_loss_mlp": 1.03497791, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.0912121324455732, + "language_loss": 0.70043325, + "learning_rate": 3.986500149519811e-06, + "loss": 0.72253633, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 2.686317205429077 + }, + { + "auxiliary_loss_clip": 0.01184637, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_clip": 1.05390823, + "balance_loss_mlp": 1.04695225, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.97651286571378, + "language_loss": 0.77984381, + "learning_rate": 3.986454937173292e-06, + "loss": 0.80240333, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.6657347679138184 + }, + { + "auxiliary_loss_clip": 0.01200663, + "auxiliary_loss_mlp": 0.01062594, + "balance_loss_clip": 1.05569994, + "balance_loss_mlp": 1.03840649, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 2.18819828721061, + "language_loss": 0.78602356, + "learning_rate": 3.986409649500203e-06, + "loss": 0.8086561, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 2.637482166290283 + }, + { + "auxiliary_loss_clip": 0.01180362, + "auxiliary_loss_mlp": 0.01069166, + "balance_loss_clip": 1.0526278, + "balance_loss_mlp": 1.04369116, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.8755784010790353, + "language_loss": 0.81900716, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84150243, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.55355167388916 + }, + { + "auxiliary_loss_clip": 0.01159164, + "auxiliary_loss_mlp": 0.01054853, + "balance_loss_clip": 1.04625463, + "balance_loss_mlp": 1.03065395, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 1.8975867080658346, + "language_loss": 0.82848799, + "learning_rate": 3.986318848181186e-06, + "loss": 0.85062814, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.593022584915161 + }, + { + "auxiliary_loss_clip": 0.01173933, + "auxiliary_loss_mlp": 0.0106255, + "balance_loss_clip": 1.05762637, + "balance_loss_mlp": 1.03917289, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.1851883588471552, + "language_loss": 0.73429072, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75665557, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.592068672180176 + }, + { + "auxiliary_loss_clip": 0.01184159, + "auxiliary_loss_mlp": 0.01057265, + "balance_loss_clip": 1.05242729, + "balance_loss_mlp": 1.03354239, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 6.646610641451196, + "language_loss": 0.86489511, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88730937, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 2.5543107986450195 + }, + { + "auxiliary_loss_clip": 0.01175057, + "auxiliary_loss_mlp": 0.01064154, + "balance_loss_clip": 1.05594552, + "balance_loss_mlp": 1.0398469, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.134318705710726, + "language_loss": 0.8146047, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83699679, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.5716724395751953 + }, + { + "auxiliary_loss_clip": 0.01185782, + "auxiliary_loss_mlp": 0.0075256, + "balance_loss_clip": 1.05637598, + "balance_loss_mlp": 1.00107956, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.711276597673665, + "language_loss": 0.81949902, + "learning_rate": 3.986136341700063e-06, + "loss": 0.83888245, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.5533642768859863 + }, + { + "auxiliary_loss_clip": 0.01152309, + "auxiliary_loss_mlp": 0.01049384, + "balance_loss_clip": 1.04918134, + "balance_loss_mlp": 1.02398026, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.6294288240119967, + "language_loss": 0.80541086, + "learning_rate": 3.986090526789227e-06, + "loss": 0.8274278, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.726515054702759 + }, + { + "auxiliary_loss_clip": 0.01165362, + "auxiliary_loss_mlp": 0.01061466, + "balance_loss_clip": 1.05411124, + "balance_loss_mlp": 1.03872061, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8338663682886989, + "language_loss": 0.96786195, + "learning_rate": 3.986044636565639e-06, + "loss": 0.99013031, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.6089465618133545 + }, + { + "auxiliary_loss_clip": 0.01187382, + "auxiliary_loss_mlp": 0.01063343, + "balance_loss_clip": 1.05307209, + "balance_loss_mlp": 1.03833246, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9768164413839535, + "language_loss": 0.82844281, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85095, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.590801954269409 + }, + { + "auxiliary_loss_clip": 0.0107158, + "auxiliary_loss_mlp": 0.01021976, + "balance_loss_clip": 1.02016175, + "balance_loss_mlp": 1.01830471, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8046641731469935, + "language_loss": 0.56720722, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58814275, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.1434624195098877 + }, + { + "auxiliary_loss_clip": 0.01168622, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.04794264, + "balance_loss_mlp": 1.03725672, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 3.5596328238730277, + "language_loss": 0.72568548, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.7479943, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.590360164642334 + }, + { + "auxiliary_loss_clip": 0.01135522, + "auxiliary_loss_mlp": 0.01067462, + "balance_loss_clip": 1.04738379, + "balance_loss_mlp": 1.04226077, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 2.1177463146120084, + "language_loss": 0.77695549, + "learning_rate": 3.985860322578614e-06, + "loss": 0.7989853, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.8294711112976074 + }, + { + "auxiliary_loss_clip": 0.01137523, + "auxiliary_loss_mlp": 0.01053789, + "balance_loss_clip": 1.04763746, + "balance_loss_mlp": 1.03016138, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 3.1043280781912665, + "language_loss": 0.71268725, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73460031, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 2.7343909740448 + }, + { + "auxiliary_loss_clip": 0.01155763, + "auxiliary_loss_mlp": 0.01066833, + "balance_loss_clip": 1.04988456, + "balance_loss_mlp": 1.04316974, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.7962471110624223, + "language_loss": 0.78492534, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80715126, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.6448895931243896 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.010621, + "balance_loss_clip": 1.0471257, + "balance_loss_mlp": 1.03852057, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.858163798722029, + "language_loss": 0.78841388, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81039572, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 2.6408815383911133 + }, + { + "auxiliary_loss_clip": 0.01125648, + "auxiliary_loss_mlp": 0.01052061, + "balance_loss_clip": 1.04236937, + "balance_loss_mlp": 1.02877903, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 1.8090601625604898, + "language_loss": 0.82925582, + "learning_rate": 3.985674803727289e-06, + "loss": 0.85103285, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.623194456100464 + }, + { + "auxiliary_loss_clip": 0.01050477, + "auxiliary_loss_mlp": 0.01003584, + "balance_loss_clip": 1.0270803, + "balance_loss_mlp": 0.99995959, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8342259661845093, + "language_loss": 0.58155823, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60209882, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 4.925436496734619 + }, + { + "auxiliary_loss_clip": 0.01167866, + "auxiliary_loss_mlp": 0.01060526, + "balance_loss_clip": 1.05397153, + "balance_loss_mlp": 1.03550363, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.5974390570060044, + "language_loss": 0.91351336, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93579727, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.8578684329986572 + }, + { + "auxiliary_loss_clip": 0.01157553, + "auxiliary_loss_mlp": 0.00752775, + "balance_loss_clip": 1.0532788, + "balance_loss_mlp": 1.00117362, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 1.768525494505602, + "language_loss": 0.87369418, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89279747, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 4.229121923446655 + }, + { + "auxiliary_loss_clip": 0.01065331, + "auxiliary_loss_mlp": 0.0100327, + "balance_loss_clip": 1.01360846, + "balance_loss_mlp": 0.99945557, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9101636032962773, + "language_loss": 0.59829003, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61897606, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 4.551890850067139 + }, + { + "auxiliary_loss_clip": 0.01163721, + "auxiliary_loss_mlp": 0.01054185, + "balance_loss_clip": 1.04724228, + "balance_loss_mlp": 1.03062892, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 2.6735579854019518, + "language_loss": 0.8304199, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85259897, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.585383176803589 + }, + { + "auxiliary_loss_clip": 0.01170816, + "auxiliary_loss_mlp": 0.01056211, + "balance_loss_clip": 1.05312276, + "balance_loss_mlp": 1.0339191, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 2.1314127440629576, + "language_loss": 0.84748507, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.86975527, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 4.128492593765259 + }, + { + "auxiliary_loss_clip": 0.01201997, + "auxiliary_loss_mlp": 0.01067175, + "balance_loss_clip": 1.05876398, + "balance_loss_mlp": 1.04326165, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.002645173714635, + "language_loss": 0.78570414, + "learning_rate": 3.985347246871708e-06, + "loss": 0.8083958, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 2.5670602321624756 + }, + { + "auxiliary_loss_clip": 0.0106107, + "auxiliary_loss_mlp": 0.01006072, + "balance_loss_clip": 1.01411343, + "balance_loss_mlp": 1.00204265, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7500537721464665, + "language_loss": 0.58342671, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60409808, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 3.3025319576263428 + }, + { + "auxiliary_loss_clip": 0.01132196, + "auxiliary_loss_mlp": 0.01063257, + "balance_loss_clip": 1.05013061, + "balance_loss_mlp": 1.03930736, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 1.9587072415056725, + "language_loss": 0.7179544, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73990893, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.703907012939453 + }, + { + "auxiliary_loss_clip": 0.01130626, + "auxiliary_loss_mlp": 0.01059266, + "balance_loss_clip": 1.04828894, + "balance_loss_mlp": 1.03196669, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.7445535704834565, + "language_loss": 0.79110193, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81300086, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 2.6609396934509277 + }, + { + "auxiliary_loss_clip": 0.01178808, + "auxiliary_loss_mlp": 0.01052593, + "balance_loss_clip": 1.05188751, + "balance_loss_mlp": 1.03095686, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 1.9937688056127438, + "language_loss": 0.71454912, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73686314, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 2.595194101333618 + }, + { + "auxiliary_loss_clip": 0.01157308, + "auxiliary_loss_mlp": 0.01070305, + "balance_loss_clip": 1.05454397, + "balance_loss_mlp": 1.04497325, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.905129216269106, + "language_loss": 0.81082469, + "learning_rate": 3.985111019116736e-06, + "loss": 0.8331008, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 2.6679649353027344 + }, + { + "auxiliary_loss_clip": 0.01052453, + "auxiliary_loss_mlp": 0.01007377, + "balance_loss_clip": 1.01422811, + "balance_loss_mlp": 1.00327587, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.779314211846345, + "language_loss": 0.5971204, + "learning_rate": 3.985063547731735e-06, + "loss": 0.6177187, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 3.1433472633361816 + }, + { + "auxiliary_loss_clip": 0.01194523, + "auxiliary_loss_mlp": 0.01058634, + "balance_loss_clip": 1.05646551, + "balance_loss_mlp": 1.03545976, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.066667779809194, + "language_loss": 0.81721699, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83974862, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.621267557144165 + }, + { + "auxiliary_loss_clip": 0.01142697, + "auxiliary_loss_mlp": 0.01056772, + "balance_loss_clip": 1.04910922, + "balance_loss_mlp": 1.03219104, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.510122601220486, + "language_loss": 0.75809574, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78009039, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.6653146743774414 + }, + { + "auxiliary_loss_clip": 0.01101202, + "auxiliary_loss_mlp": 0.01065097, + "balance_loss_clip": 1.04193735, + "balance_loss_mlp": 1.04104018, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.9293993995489453, + "language_loss": 0.72321618, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74487913, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 3.20261549949646 + }, + { + "auxiliary_loss_clip": 0.01149476, + "auxiliary_loss_mlp": 0.01070722, + "balance_loss_clip": 1.05286336, + "balance_loss_mlp": 1.04623628, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.4319031404648697, + "language_loss": 0.80812633, + "learning_rate": 3.984872909471688e-06, + "loss": 0.83032835, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 3.004007577896118 + }, + { + "auxiliary_loss_clip": 0.01185459, + "auxiliary_loss_mlp": 0.01068915, + "balance_loss_clip": 1.05534601, + "balance_loss_mlp": 1.04488242, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.0917877184576277, + "language_loss": 0.80744815, + "learning_rate": 3.984825061735701e-06, + "loss": 0.82999188, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 2.5705788135528564 + }, + { + "auxiliary_loss_clip": 0.0115624, + "auxiliary_loss_mlp": 0.01065759, + "balance_loss_clip": 1.04813063, + "balance_loss_mlp": 1.04214358, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.804744426464911, + "language_loss": 0.63439572, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65661573, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 2.874138593673706 + }, + { + "auxiliary_loss_clip": 0.01110692, + "auxiliary_loss_mlp": 0.01065175, + "balance_loss_clip": 1.04197514, + "balance_loss_mlp": 1.03823423, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.040731846033602, + "language_loss": 0.74936515, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77112383, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 2.7602431774139404 + }, + { + "auxiliary_loss_clip": 0.01158574, + "auxiliary_loss_mlp": 0.00752827, + "balance_loss_clip": 1.05337048, + "balance_loss_mlp": 1.00116134, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0305706335541154, + "language_loss": 0.87137365, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89048767, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 2.7162845134735107 + }, + { + "auxiliary_loss_clip": 0.01162088, + "auxiliary_loss_mlp": 0.00752686, + "balance_loss_clip": 1.04758799, + "balance_loss_mlp": 1.00108099, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.3953541957431765, + "language_loss": 0.78454703, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80369473, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.6297051906585693 + }, + { + "auxiliary_loss_clip": 0.01174334, + "auxiliary_loss_mlp": 0.01070588, + "balance_loss_clip": 1.05440998, + "balance_loss_mlp": 1.04586363, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.0372708745179846, + "language_loss": 0.84257817, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86502743, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.7189431190490723 + }, + { + "auxiliary_loss_clip": 0.01135693, + "auxiliary_loss_mlp": 0.01065634, + "balance_loss_clip": 1.04522443, + "balance_loss_mlp": 1.04150629, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.156022317312671, + "language_loss": 0.78554976, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80756301, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 2.6175754070281982 + }, + { + "auxiliary_loss_clip": 0.0119957, + "auxiliary_loss_mlp": 0.01060908, + "balance_loss_clip": 1.0569824, + "balance_loss_mlp": 1.03705478, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.0298367513426, + "language_loss": 0.85534871, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87795353, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.5764102935791016 + }, + { + "auxiliary_loss_clip": 0.01137544, + "auxiliary_loss_mlp": 0.01065279, + "balance_loss_clip": 1.04811907, + "balance_loss_mlp": 1.04036474, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 4.259928196898044, + "language_loss": 0.74695152, + "learning_rate": 3.984439570469271e-06, + "loss": 0.76897973, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.7884867191314697 + }, + { + "auxiliary_loss_clip": 0.01174709, + "auxiliary_loss_mlp": 0.00752611, + "balance_loss_clip": 1.05459428, + "balance_loss_mlp": 1.00105703, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.370649812747463, + "language_loss": 0.67878133, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.69805455, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 2.6641149520874023 + }, + { + "auxiliary_loss_clip": 0.01184973, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_clip": 1.05344772, + "balance_loss_mlp": 1.04184031, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 2.3780327259569836, + "language_loss": 0.79291445, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81543362, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.5682709217071533 + }, + { + "auxiliary_loss_clip": 0.01177976, + "auxiliary_loss_mlp": 0.01062932, + "balance_loss_clip": 1.05808663, + "balance_loss_mlp": 1.03883994, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 2.632727551667695, + "language_loss": 0.68720073, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70960987, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.6621246337890625 + }, + { + "auxiliary_loss_clip": 0.01161287, + "auxiliary_loss_mlp": 0.01055445, + "balance_loss_clip": 1.0510354, + "balance_loss_mlp": 1.0336659, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 2.1980519898147866, + "language_loss": 0.74409878, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76626611, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.582021474838257 + }, + { + "auxiliary_loss_clip": 0.01173107, + "auxiliary_loss_mlp": 0.01060141, + "balance_loss_clip": 1.05002117, + "balance_loss_mlp": 1.03597772, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.101672095802999, + "language_loss": 0.91760659, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93993902, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.5325844287872314 + }, + { + "auxiliary_loss_clip": 0.01196546, + "auxiliary_loss_mlp": 0.01064405, + "balance_loss_clip": 1.05333281, + "balance_loss_mlp": 1.03959775, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.7429702840512658, + "language_loss": 0.82319826, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84580779, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.500627040863037 + }, + { + "auxiliary_loss_clip": 0.01190591, + "auxiliary_loss_mlp": 0.01063965, + "balance_loss_clip": 1.05393529, + "balance_loss_mlp": 1.04173195, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 3.61292049324184, + "language_loss": 0.84955144, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87209696, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.5638790130615234 + }, + { + "auxiliary_loss_clip": 0.01143117, + "auxiliary_loss_mlp": 0.0105807, + "balance_loss_clip": 1.04895389, + "balance_loss_mlp": 1.03495562, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.0035466021012978, + "language_loss": 0.85759389, + "learning_rate": 3.984049263194367e-06, + "loss": 0.87960577, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.62559175491333 + }, + { + "auxiliary_loss_clip": 0.01157601, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_clip": 1.05023277, + "balance_loss_mlp": 1.03144801, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.044118747824804, + "language_loss": 0.69698089, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71911186, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.6582436561584473 + }, + { + "auxiliary_loss_clip": 0.0119644, + "auxiliary_loss_mlp": 0.01054981, + "balance_loss_clip": 1.05309367, + "balance_loss_mlp": 1.03018534, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.5076365216833545, + "language_loss": 0.84003121, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86254537, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 2.59857177734375 + }, + { + "auxiliary_loss_clip": 0.01169912, + "auxiliary_loss_mlp": 0.01058174, + "balance_loss_clip": 1.05518627, + "balance_loss_mlp": 1.03402209, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.649781368184937, + "language_loss": 0.81753373, + "learning_rate": 3.983901656532052e-06, + "loss": 0.8398146, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 2.5735607147216797 + }, + { + "auxiliary_loss_clip": 0.0119349, + "auxiliary_loss_mlp": 0.01056734, + "balance_loss_clip": 1.05644357, + "balance_loss_mlp": 1.03423941, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8082964105405899, + "language_loss": 0.85347682, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87597907, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.6749699115753174 + }, + { + "auxiliary_loss_clip": 0.01173397, + "auxiliary_loss_mlp": 0.0105762, + "balance_loss_clip": 1.05357409, + "balance_loss_mlp": 1.03573275, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1313254585648895, + "language_loss": 0.9046458, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92695594, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 2.5538787841796875 + }, + { + "auxiliary_loss_clip": 0.01151804, + "auxiliary_loss_mlp": 0.01052707, + "balance_loss_clip": 1.05012393, + "balance_loss_mlp": 1.030164, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.0745061392604387, + "language_loss": 0.81818986, + "learning_rate": 3.983753372802008e-06, + "loss": 0.84023494, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.597510576248169 + }, + { + "auxiliary_loss_clip": 0.01169855, + "auxiliary_loss_mlp": 0.010593, + "balance_loss_clip": 1.05774033, + "balance_loss_mlp": 1.0373888, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 2.468435744321794, + "language_loss": 0.74992526, + "learning_rate": 3.983703794441237e-06, + "loss": 0.7722168, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 4.299901485443115 + }, + { + "auxiliary_loss_clip": 0.01163052, + "auxiliary_loss_mlp": 0.00752502, + "balance_loss_clip": 1.04780674, + "balance_loss_mlp": 1.00095963, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.696947785313239, + "language_loss": 0.71074152, + "learning_rate": 3.98365414085822e-06, + "loss": 0.72989702, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 2.6761910915374756 + }, + { + "auxiliary_loss_clip": 0.01160385, + "auxiliary_loss_mlp": 0.00752512, + "balance_loss_clip": 1.05112362, + "balance_loss_mlp": 1.00092185, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 1.9987348808767254, + "language_loss": 0.74959445, + "learning_rate": 3.98360441205484e-06, + "loss": 0.76872343, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 5.875297784805298 + }, + { + "auxiliary_loss_clip": 0.01162814, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.04985011, + "balance_loss_mlp": 1.03136027, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.9918196167017042, + "language_loss": 0.71422786, + "learning_rate": 3.983554608032982e-06, + "loss": 0.73640019, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 2.6765334606170654 + }, + { + "auxiliary_loss_clip": 0.01193027, + "auxiliary_loss_mlp": 0.0105688, + "balance_loss_clip": 1.05318737, + "balance_loss_mlp": 1.0337534, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.8176273530143499, + "language_loss": 0.80049956, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82299864, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 4.087301969528198 + }, + { + "auxiliary_loss_clip": 0.01194383, + "auxiliary_loss_mlp": 0.0106105, + "balance_loss_clip": 1.05488038, + "balance_loss_mlp": 1.03527701, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 2.567353534850428, + "language_loss": 0.81409216, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83664644, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 2.5480942726135254 + }, + { + "auxiliary_loss_clip": 0.01173114, + "auxiliary_loss_mlp": 0.01057696, + "balance_loss_clip": 1.04874098, + "balance_loss_mlp": 1.03354466, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.6568820946782117, + "language_loss": 0.75903559, + "learning_rate": 3.983404744675437e-06, + "loss": 0.7813437, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 2.613218069076538 + }, + { + "auxiliary_loss_clip": 0.0116178, + "auxiliary_loss_mlp": 0.01065115, + "balance_loss_clip": 1.04967129, + "balance_loss_mlp": 1.04214334, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 2.0835353203023805, + "language_loss": 0.82692313, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.84919214, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 2.6904873847961426 + }, + { + "auxiliary_loss_clip": 0.0116259, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_clip": 1.04793048, + "balance_loss_mlp": 1.03330326, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 2.345089549004948, + "language_loss": 0.79677874, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81898475, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.6416218280792236 + }, + { + "auxiliary_loss_clip": 0.01173775, + "auxiliary_loss_mlp": 0.0105795, + "balance_loss_clip": 1.04995906, + "balance_loss_mlp": 1.03392911, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.266267833635071, + "language_loss": 0.78822291, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81054014, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.5743844509124756 + }, + { + "auxiliary_loss_clip": 0.01125538, + "auxiliary_loss_mlp": 0.01067933, + "balance_loss_clip": 1.04523003, + "balance_loss_mlp": 1.04182601, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.7653124269443963, + "language_loss": 0.72880679, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75074154, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.7007648944854736 + }, + { + "auxiliary_loss_clip": 0.01165102, + "auxiliary_loss_mlp": 0.01056792, + "balance_loss_clip": 1.05052006, + "balance_loss_mlp": 1.03365374, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.749009840810865, + "language_loss": 0.81143558, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83365458, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.6552822589874268 + }, + { + "auxiliary_loss_clip": 0.01152416, + "auxiliary_loss_mlp": 0.01048794, + "balance_loss_clip": 1.04621482, + "balance_loss_mlp": 1.02458274, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 2.321003104206605, + "language_loss": 0.84580976, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86782187, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.605109214782715 + }, + { + "auxiliary_loss_clip": 0.01177432, + "auxiliary_loss_mlp": 0.01054947, + "balance_loss_clip": 1.05148172, + "balance_loss_mlp": 1.03073525, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 4.925259211500943, + "language_loss": 0.89649838, + "learning_rate": 3.983052431214997e-06, + "loss": 0.91882217, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.5230281352996826 + }, + { + "auxiliary_loss_clip": 0.01170869, + "auxiliary_loss_mlp": 0.01069937, + "balance_loss_clip": 1.05126667, + "balance_loss_mlp": 1.04330552, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.1953544617666507, + "language_loss": 0.88960677, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91201484, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.5949044227600098 + }, + { + "auxiliary_loss_clip": 0.01193597, + "auxiliary_loss_mlp": 0.01068124, + "balance_loss_clip": 1.05385995, + "balance_loss_mlp": 1.04360271, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.164425636696515, + "language_loss": 0.84001803, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86263525, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.548305034637451 + }, + { + "auxiliary_loss_clip": 0.01161049, + "auxiliary_loss_mlp": 0.00752509, + "balance_loss_clip": 1.05172074, + "balance_loss_mlp": 1.00088978, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.9636129474915305, + "language_loss": 0.75658214, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77571774, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 2.57993745803833 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01059071, + "balance_loss_clip": 1.05269217, + "balance_loss_mlp": 1.03524137, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 1.7500135499279081, + "language_loss": 0.89077526, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91301501, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 2.6167187690734863 + }, + { + "auxiliary_loss_clip": 0.01177774, + "auxiliary_loss_mlp": 0.01055837, + "balance_loss_clip": 1.05418086, + "balance_loss_mlp": 1.03068435, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 2.0740133644391943, + "language_loss": 0.81977046, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84210652, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 2.653970718383789 + }, + { + "auxiliary_loss_clip": 0.01184503, + "auxiliary_loss_mlp": 0.01059124, + "balance_loss_clip": 1.05368233, + "balance_loss_mlp": 1.03406644, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.4403370650156884, + "language_loss": 0.82253087, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.84496719, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 2.577862024307251 + }, + { + "auxiliary_loss_clip": 0.0116199, + "auxiliary_loss_mlp": 0.01061397, + "balance_loss_clip": 1.047925, + "balance_loss_mlp": 1.03766239, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.9110573296175581, + "language_loss": 0.85168457, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87391841, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 2.6165671348571777 + }, + { + "auxiliary_loss_clip": 0.01180704, + "auxiliary_loss_mlp": 0.01063418, + "balance_loss_clip": 1.05421352, + "balance_loss_mlp": 1.0401361, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 2.530805291671771, + "language_loss": 0.83329141, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85573268, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.583895206451416 + }, + { + "auxiliary_loss_clip": 0.01133813, + "auxiliary_loss_mlp": 0.01065046, + "balance_loss_clip": 1.04600811, + "balance_loss_mlp": 1.03940415, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.2076093701903816, + "language_loss": 0.74556255, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76755118, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 2.6590468883514404 + }, + { + "auxiliary_loss_clip": 0.01170395, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.05131912, + "balance_loss_mlp": 1.03465676, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.92708040804133, + "language_loss": 0.85762399, + "learning_rate": 3.982542734644673e-06, + "loss": 0.87992662, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.639665126800537 + }, + { + "auxiliary_loss_clip": 0.01063961, + "auxiliary_loss_mlp": 0.01023979, + "balance_loss_clip": 1.02118516, + "balance_loss_mlp": 1.0190438, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.9288676978965053, + "language_loss": 0.63271511, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65359449, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.3179469108581543 + }, + { + "auxiliary_loss_clip": 0.01197524, + "auxiliary_loss_mlp": 0.01064203, + "balance_loss_clip": 1.06091547, + "balance_loss_mlp": 1.04031348, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.494670748020801, + "language_loss": 0.83827299, + "learning_rate": 3.98243989312991e-06, + "loss": 0.86089021, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.6103515625 + }, + { + "auxiliary_loss_clip": 0.01163134, + "auxiliary_loss_mlp": 0.01060162, + "balance_loss_clip": 1.05091858, + "balance_loss_mlp": 1.03580737, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.0558432077806597, + "language_loss": 0.88157928, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90381229, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 2.611618757247925 + }, + { + "auxiliary_loss_clip": 0.01164797, + "auxiliary_loss_mlp": 0.01061683, + "balance_loss_clip": 1.05440068, + "balance_loss_mlp": 1.03776956, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 6.932505430774496, + "language_loss": 0.83448815, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85675299, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.8974993228912354 + }, + { + "auxiliary_loss_clip": 0.01179853, + "auxiliary_loss_mlp": 0.0106096, + "balance_loss_clip": 1.0561918, + "balance_loss_mlp": 1.03568769, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.3570045617272375, + "language_loss": 0.79618949, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81859761, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.664923667907715 + }, + { + "auxiliary_loss_clip": 0.01196803, + "auxiliary_loss_mlp": 0.01064393, + "balance_loss_clip": 1.0522635, + "balance_loss_mlp": 1.03908491, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.6478011638532872, + "language_loss": 0.79418021, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81679219, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.7479007244110107 + }, + { + "auxiliary_loss_clip": 0.01139067, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.05177331, + "balance_loss_mlp": 1.03490579, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 1.829519656256186, + "language_loss": 0.77000916, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79198706, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.726245164871216 + }, + { + "auxiliary_loss_clip": 0.01194863, + "auxiliary_loss_mlp": 0.01066359, + "balance_loss_clip": 1.05616724, + "balance_loss_mlp": 1.04174256, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.4803341783571917, + "language_loss": 0.6579504, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68056262, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.637849807739258 + }, + { + "auxiliary_loss_clip": 0.01180357, + "auxiliary_loss_mlp": 0.01055472, + "balance_loss_clip": 1.05500305, + "balance_loss_mlp": 1.03074789, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 1.9532255335586424, + "language_loss": 0.6964103, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71876854, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.6307260990142822 + }, + { + "auxiliary_loss_clip": 0.01138015, + "auxiliary_loss_mlp": 0.01057623, + "balance_loss_clip": 1.04901946, + "balance_loss_mlp": 1.03435373, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.2965726672213043, + "language_loss": 0.78478891, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80674529, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.6188857555389404 + }, + { + "auxiliary_loss_clip": 0.0119255, + "auxiliary_loss_mlp": 0.01057429, + "balance_loss_clip": 1.05651462, + "balance_loss_mlp": 1.03347969, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.116032184385898, + "language_loss": 0.85086519, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87336498, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 2.489203691482544 + }, + { + "auxiliary_loss_clip": 0.01158613, + "auxiliary_loss_mlp": 0.00752451, + "balance_loss_clip": 1.05621338, + "balance_loss_mlp": 1.0007565, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.8394766322116525, + "language_loss": 0.77019775, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.78930843, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 2.6706535816192627 + }, + { + "auxiliary_loss_clip": 0.01194425, + "auxiliary_loss_mlp": 0.01069705, + "balance_loss_clip": 1.05440688, + "balance_loss_mlp": 1.04380083, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.271810798942238, + "language_loss": 0.754547, + "learning_rate": 3.981868890255468e-06, + "loss": 0.7771883, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.4815480709075928 + }, + { + "auxiliary_loss_clip": 0.01145122, + "auxiliary_loss_mlp": 0.01058431, + "balance_loss_clip": 1.04687691, + "balance_loss_mlp": 1.03289664, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 2.812489611046163, + "language_loss": 0.73753864, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75957417, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 2.6586837768554688 + }, + { + "auxiliary_loss_clip": 0.01189503, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_clip": 1.0515641, + "balance_loss_mlp": 1.02963543, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.3131044029425714, + "language_loss": 0.78182852, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80425096, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.6258537769317627 + }, + { + "auxiliary_loss_clip": 0.01171567, + "auxiliary_loss_mlp": 0.0105892, + "balance_loss_clip": 1.05535531, + "balance_loss_mlp": 1.03306365, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 2.51835949668458, + "language_loss": 0.85674024, + "learning_rate": 3.981711583882166e-06, + "loss": 0.87904513, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 2.6825358867645264 + }, + { + "auxiliary_loss_clip": 0.01167617, + "auxiliary_loss_mlp": 0.0106136, + "balance_loss_clip": 1.05053675, + "balance_loss_mlp": 1.03625417, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.8887858004332299, + "language_loss": 0.81575167, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83804142, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 4.158589839935303 + }, + { + "auxiliary_loss_clip": 0.01149773, + "auxiliary_loss_mlp": 0.0105505, + "balance_loss_clip": 1.0514617, + "balance_loss_mlp": 1.03231716, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 2.336783438270148, + "language_loss": 0.80130464, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82335293, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 2.5812933444976807 + }, + { + "auxiliary_loss_clip": 0.01157771, + "auxiliary_loss_mlp": 0.00752681, + "balance_loss_clip": 1.05160141, + "balance_loss_mlp": 1.00068521, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.4632416373659067, + "language_loss": 0.71047807, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.72958255, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 4.363982677459717 + }, + { + "auxiliary_loss_clip": 0.0114765, + "auxiliary_loss_mlp": 0.01057423, + "balance_loss_clip": 1.05674803, + "balance_loss_mlp": 1.0325675, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.294425856598502, + "language_loss": 0.8611784, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88322908, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 4.307765960693359 + }, + { + "auxiliary_loss_clip": 0.01148439, + "auxiliary_loss_mlp": 0.01061784, + "balance_loss_clip": 1.05469, + "balance_loss_mlp": 1.03686905, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.2114621356715305, + "language_loss": 0.8414337, + "learning_rate": 3.981447903685947e-06, + "loss": 0.863536, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 4.152133464813232 + }, + { + "auxiliary_loss_clip": 0.01198278, + "auxiliary_loss_mlp": 0.01057414, + "balance_loss_clip": 1.06186461, + "balance_loss_mlp": 1.03481245, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.091175170650118, + "language_loss": 0.76301378, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78557068, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 2.632012128829956 + }, + { + "auxiliary_loss_clip": 0.01174808, + "auxiliary_loss_mlp": 0.01065264, + "balance_loss_clip": 1.05617321, + "balance_loss_mlp": 1.04069555, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 2.0900181190655074, + "language_loss": 0.82870424, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85110497, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.7654712200164795 + }, + { + "auxiliary_loss_clip": 0.01179895, + "auxiliary_loss_mlp": 0.01054462, + "balance_loss_clip": 1.05309153, + "balance_loss_mlp": 1.02877212, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 3.0917150974537737, + "language_loss": 0.68812162, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71046519, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.567901849746704 + }, + { + "auxiliary_loss_clip": 0.01166417, + "auxiliary_loss_mlp": 0.00752331, + "balance_loss_clip": 1.05538464, + "balance_loss_mlp": 1.00065899, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.778779968143205, + "language_loss": 0.87410611, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89329362, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.582549571990967 + }, + { + "auxiliary_loss_clip": 0.01141191, + "auxiliary_loss_mlp": 0.01061872, + "balance_loss_clip": 1.04801607, + "balance_loss_mlp": 1.03766072, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.7579119248422495, + "language_loss": 0.77929771, + "learning_rate": 3.981182345072293e-06, + "loss": 0.8013283, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.661501169204712 + }, + { + "auxiliary_loss_clip": 0.01177055, + "auxiliary_loss_mlp": 0.01063989, + "balance_loss_clip": 1.05365932, + "balance_loss_mlp": 1.04048133, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.7268883620742161, + "language_loss": 0.82260394, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84501445, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.70599102973938 + }, + { + "auxiliary_loss_clip": 0.01163893, + "auxiliary_loss_mlp": 0.00752321, + "balance_loss_clip": 1.05490685, + "balance_loss_mlp": 1.00062728, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.4489020625743083, + "language_loss": 0.76624548, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78540754, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.6655027866363525 + }, + { + "auxiliary_loss_clip": 0.01174713, + "auxiliary_loss_mlp": 0.01060639, + "balance_loss_clip": 1.05537963, + "balance_loss_mlp": 1.03623652, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.7761921109217975, + "language_loss": 0.77442759, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79678112, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.5557100772857666 + }, + { + "auxiliary_loss_clip": 0.01169629, + "auxiliary_loss_mlp": 0.01053035, + "balance_loss_clip": 1.0529263, + "balance_loss_mlp": 1.03033757, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 2.007650555696547, + "language_loss": 0.80082411, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.82305074, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.739826202392578 + }, + { + "auxiliary_loss_clip": 0.01169577, + "auxiliary_loss_mlp": 0.01051916, + "balance_loss_clip": 1.05247509, + "balance_loss_mlp": 1.02923071, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 2.734821274848917, + "language_loss": 0.78330886, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80552375, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.59505558013916 + }, + { + "auxiliary_loss_clip": 0.0117745, + "auxiliary_loss_mlp": 0.01066031, + "balance_loss_clip": 1.05382276, + "balance_loss_mlp": 1.04299974, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.349406536256932, + "language_loss": 0.81279075, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83522552, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 2.5931737422943115 + }, + { + "auxiliary_loss_clip": 0.01161853, + "auxiliary_loss_mlp": 0.01066331, + "balance_loss_clip": 1.05457652, + "balance_loss_mlp": 1.04279947, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 1.8801925990051753, + "language_loss": 0.84719068, + "learning_rate": 3.98080740775156e-06, + "loss": 0.8694725, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 2.6310455799102783 + }, + { + "auxiliary_loss_clip": 0.01142556, + "auxiliary_loss_mlp": 0.01054525, + "balance_loss_clip": 1.04782796, + "balance_loss_mlp": 1.03200674, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.9680630083085946, + "language_loss": 0.90468085, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9266516, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.6922059059143066 + }, + { + "auxiliary_loss_clip": 0.01188303, + "auxiliary_loss_mlp": 0.01056625, + "balance_loss_clip": 1.05550861, + "balance_loss_mlp": 1.03284311, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 1.7856290512762105, + "language_loss": 0.72663987, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74908918, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 2.558749198913574 + }, + { + "auxiliary_loss_clip": 0.0113782, + "auxiliary_loss_mlp": 0.01053118, + "balance_loss_clip": 1.04762864, + "balance_loss_mlp": 1.02873993, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.8996205110456295, + "language_loss": 0.84157807, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86348748, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 2.691983461380005 + }, + { + "auxiliary_loss_clip": 0.01194515, + "auxiliary_loss_mlp": 0.01062421, + "balance_loss_clip": 1.05743694, + "balance_loss_mlp": 1.03764915, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.0377712585780254, + "language_loss": 0.83829391, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86086327, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.564521551132202 + }, + { + "auxiliary_loss_clip": 0.01127545, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.0447818, + "balance_loss_mlp": 1.03449249, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 2.6508754590687844, + "language_loss": 0.81355387, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83541948, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.7723538875579834 + }, + { + "auxiliary_loss_clip": 0.011494, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_clip": 1.05032063, + "balance_loss_mlp": 1.03402042, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.06445089350465, + "language_loss": 0.75747836, + "learning_rate": 3.980483103494872e-06, + "loss": 0.77953619, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 2.651658296585083 + }, + { + "auxiliary_loss_clip": 0.01151372, + "auxiliary_loss_mlp": 0.01060088, + "balance_loss_clip": 1.05064583, + "balance_loss_mlp": 1.03795087, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 1.9898279620270802, + "language_loss": 0.86517411, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88728869, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.603562355041504 + }, + { + "auxiliary_loss_clip": 0.01177206, + "auxiliary_loss_mlp": 0.01068959, + "balance_loss_clip": 1.05343032, + "balance_loss_mlp": 1.04642797, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 2.192709182268489, + "language_loss": 0.86558664, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.88804823, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.561358690261841 + }, + { + "auxiliary_loss_clip": 0.01186829, + "auxiliary_loss_mlp": 0.0106318, + "balance_loss_clip": 1.05383873, + "balance_loss_mlp": 1.04088819, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.240881788198622, + "language_loss": 0.84822094, + "learning_rate": 3.980319937487235e-06, + "loss": 0.87072098, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 2.557262420654297 + }, + { + "auxiliary_loss_clip": 0.01141756, + "auxiliary_loss_mlp": 0.01063578, + "balance_loss_clip": 1.04665387, + "balance_loss_mlp": 1.04076171, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.5305998786443396, + "language_loss": 0.76960796, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79166126, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 2.6524837017059326 + }, + { + "auxiliary_loss_clip": 0.01142357, + "auxiliary_loss_mlp": 0.01060428, + "balance_loss_clip": 1.05117834, + "balance_loss_mlp": 1.03683674, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 6.83865368868767, + "language_loss": 0.92176032, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94378817, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 2.654203414916992 + }, + { + "auxiliary_loss_clip": 0.01121572, + "auxiliary_loss_mlp": 0.01059126, + "balance_loss_clip": 1.04862678, + "balance_loss_mlp": 1.03633285, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 3.6111047601491677, + "language_loss": 0.90769035, + "learning_rate": 3.980156095634242e-06, + "loss": 0.92949736, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.7061567306518555 + }, + { + "auxiliary_loss_clip": 0.01191195, + "auxiliary_loss_mlp": 0.01073815, + "balance_loss_clip": 1.05632389, + "balance_loss_mlp": 1.05048633, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.3642510656556452, + "language_loss": 0.82291341, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84556353, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.594834566116333 + }, + { + "auxiliary_loss_clip": 0.01189202, + "auxiliary_loss_mlp": 0.01064434, + "balance_loss_clip": 1.0554018, + "balance_loss_mlp": 1.03949523, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 1.9264202200867353, + "language_loss": 0.83438987, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8569262, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.5509397983551025 + }, + { + "auxiliary_loss_clip": 0.01154562, + "auxiliary_loss_mlp": 0.01050529, + "balance_loss_clip": 1.05125427, + "balance_loss_mlp": 1.02768815, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 1.9162661294206973, + "language_loss": 0.90404791, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92609882, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.618229627609253 + }, + { + "auxiliary_loss_clip": 0.01194639, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.05222154, + "balance_loss_mlp": 1.0248549, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 3.032492783523036, + "language_loss": 0.77122915, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79366869, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.515773057937622 + }, + { + "auxiliary_loss_clip": 0.01177335, + "auxiliary_loss_mlp": 0.01056163, + "balance_loss_clip": 1.05799484, + "balance_loss_mlp": 1.03334606, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.5010259814706284, + "language_loss": 0.85826391, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.8805989, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 2.645554304122925 + }, + { + "auxiliary_loss_clip": 0.0117377, + "auxiliary_loss_mlp": 0.01060725, + "balance_loss_clip": 1.05087149, + "balance_loss_mlp": 1.03722906, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 2.0751385877236244, + "language_loss": 0.79388148, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81622642, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.536548376083374 + }, + { + "auxiliary_loss_clip": 0.01175741, + "auxiliary_loss_mlp": 0.0075229, + "balance_loss_clip": 1.05198622, + "balance_loss_mlp": 1.00051975, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.051198390098258, + "language_loss": 0.7812776, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80055791, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 2.5642240047454834 + }, + { + "auxiliary_loss_clip": 0.01187112, + "auxiliary_loss_mlp": 0.01051126, + "balance_loss_clip": 1.05565, + "balance_loss_mlp": 1.02699828, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 2.1099974789588747, + "language_loss": 0.8135193, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83590168, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.598944664001465 + }, + { + "auxiliary_loss_clip": 0.01162739, + "auxiliary_loss_mlp": 0.010678, + "balance_loss_clip": 1.05073333, + "balance_loss_mlp": 1.04365993, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.0698789607120514, + "language_loss": 0.95140433, + "learning_rate": 3.979660515563434e-06, + "loss": 0.9737097, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 2.62667179107666 + }, + { + "auxiliary_loss_clip": 0.01172711, + "auxiliary_loss_mlp": 0.01060054, + "balance_loss_clip": 1.05456555, + "balance_loss_mlp": 1.03784502, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.2043234751766594, + "language_loss": 0.80811477, + "learning_rate": 3.979605075738569e-06, + "loss": 0.83044243, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.5642166137695312 + }, + { + "auxiliary_loss_clip": 0.01194237, + "auxiliary_loss_mlp": 0.01060227, + "balance_loss_clip": 1.05449271, + "balance_loss_mlp": 1.03403711, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.405349775328163, + "language_loss": 0.70686567, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72941035, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 2.62843656539917 + }, + { + "auxiliary_loss_clip": 0.01151501, + "auxiliary_loss_mlp": 0.01072411, + "balance_loss_clip": 1.04958236, + "balance_loss_mlp": 1.04629278, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.891900868718961, + "language_loss": 0.77210635, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79434544, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 4.166505813598633 + }, + { + "auxiliary_loss_clip": 0.01185273, + "auxiliary_loss_mlp": 0.01053801, + "balance_loss_clip": 1.05357242, + "balance_loss_mlp": 1.03031695, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 1.9622770481399936, + "language_loss": 0.82804525, + "learning_rate": 3.979438305871464e-06, + "loss": 0.85043597, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 2.598177433013916 + }, + { + "auxiliary_loss_clip": 0.01138411, + "auxiliary_loss_mlp": 0.00752381, + "balance_loss_clip": 1.04869652, + "balance_loss_mlp": 1.00051403, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 1.6930912323359602, + "language_loss": 0.75864202, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77754998, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 2.6751797199249268 + }, + { + "auxiliary_loss_clip": 0.01122106, + "auxiliary_loss_mlp": 0.00752491, + "balance_loss_clip": 1.04688621, + "balance_loss_mlp": 1.00047553, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 2.1000781941178044, + "language_loss": 0.77451986, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79326588, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 5.838830471038818 + }, + { + "auxiliary_loss_clip": 0.01165825, + "auxiliary_loss_mlp": 0.01055221, + "balance_loss_clip": 1.05220771, + "balance_loss_mlp": 1.03035367, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.337601886652349, + "language_loss": 0.86780035, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.89001077, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 4.133359432220459 + }, + { + "auxiliary_loss_clip": 0.0114566, + "auxiliary_loss_mlp": 0.01057391, + "balance_loss_clip": 1.04912329, + "balance_loss_mlp": 1.03147531, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 1.8668697800851548, + "language_loss": 0.8879723, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91000283, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.6546003818511963 + }, + { + "auxiliary_loss_clip": 0.01162991, + "auxiliary_loss_mlp": 0.01065149, + "balance_loss_clip": 1.05303884, + "balance_loss_mlp": 1.03953052, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 1.7881313837244552, + "language_loss": 0.8864038, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90868521, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.6186609268188477 + }, + { + "auxiliary_loss_clip": 0.01084891, + "auxiliary_loss_mlp": 0.01003876, + "balance_loss_clip": 1.04214835, + "balance_loss_mlp": 0.99984705, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.8904215018925132, + "language_loss": 0.63101482, + "learning_rate": 3.979102739560979e-06, + "loss": 0.6519025, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.24124813079834 + }, + { + "auxiliary_loss_clip": 0.01148443, + "auxiliary_loss_mlp": 0.01065211, + "balance_loss_clip": 1.04936039, + "balance_loss_mlp": 1.03630269, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.1587808781535704, + "language_loss": 0.632972, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65510857, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.67132306098938 + }, + { + "auxiliary_loss_clip": 0.01175725, + "auxiliary_loss_mlp": 0.0105806, + "balance_loss_clip": 1.0527035, + "balance_loss_mlp": 1.03424191, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 1.7343274898201662, + "language_loss": 0.76162064, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78395844, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.624129295349121 + }, + { + "auxiliary_loss_clip": 0.01172017, + "auxiliary_loss_mlp": 0.00752524, + "balance_loss_clip": 1.05452704, + "balance_loss_mlp": 1.00049639, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 3.4173741179350974, + "language_loss": 0.69531256, + "learning_rate": 3.978933943232123e-06, + "loss": 0.71455795, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.570718288421631 + }, + { + "auxiliary_loss_clip": 0.01190822, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.05517507, + "balance_loss_mlp": 1.03430319, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9729142668576993, + "language_loss": 0.88643825, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90894049, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.5625641345977783 + }, + { + "auxiliary_loss_clip": 0.01199302, + "auxiliary_loss_mlp": 0.01074345, + "balance_loss_clip": 1.05522156, + "balance_loss_mlp": 1.04660523, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.1626605002135517, + "language_loss": 0.87897146, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90170789, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.47031831741333 + }, + { + "auxiliary_loss_clip": 0.01171095, + "auxiliary_loss_mlp": 0.01064363, + "balance_loss_clip": 1.05452967, + "balance_loss_mlp": 1.03828025, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 6.9376712801246745, + "language_loss": 0.64448965, + "learning_rate": 3.978764471530921e-06, + "loss": 0.66684413, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.7353668212890625 + }, + { + "auxiliary_loss_clip": 0.01167377, + "auxiliary_loss_mlp": 0.0075237, + "balance_loss_clip": 1.0535171, + "balance_loss_mlp": 1.00051451, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 1.994316607626091, + "language_loss": 0.74311507, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76231247, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 2.5774824619293213 + }, + { + "auxiliary_loss_clip": 0.01150121, + "auxiliary_loss_mlp": 0.01076046, + "balance_loss_clip": 1.05024981, + "balance_loss_mlp": 1.04890227, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.778819149972547, + "language_loss": 0.81862539, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84088707, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 2.6084816455841064 + }, + { + "auxiliary_loss_clip": 0.01135803, + "auxiliary_loss_mlp": 0.0106649, + "balance_loss_clip": 1.05259013, + "balance_loss_mlp": 1.04064572, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.3157727084393525, + "language_loss": 0.66696334, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68898624, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 2.8131306171417236 + }, + { + "auxiliary_loss_clip": 0.01055703, + "auxiliary_loss_mlp": 0.0100962, + "balance_loss_clip": 1.02692056, + "balance_loss_mlp": 1.00499487, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.9049380968190343, + "language_loss": 0.70343155, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72408479, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 3.270096778869629 + }, + { + "auxiliary_loss_clip": 0.01191768, + "auxiliary_loss_mlp": 0.01069018, + "balance_loss_clip": 1.0557636, + "balance_loss_mlp": 1.04342341, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.1915814333518773, + "language_loss": 0.79724145, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81984931, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.5999348163604736 + }, + { + "auxiliary_loss_clip": 0.01147274, + "auxiliary_loss_mlp": 0.01068132, + "balance_loss_clip": 1.05092847, + "balance_loss_mlp": 1.04293144, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 7.025142775952372, + "language_loss": 0.93487215, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95702624, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 2.6558501720428467 + }, + { + "auxiliary_loss_clip": 0.01164882, + "auxiliary_loss_mlp": 0.01063895, + "balance_loss_clip": 1.05734169, + "balance_loss_mlp": 1.0388608, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.8703493617257148, + "language_loss": 0.87949246, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90178025, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.6324715614318848 + }, + { + "auxiliary_loss_clip": 0.01186638, + "auxiliary_loss_mlp": 0.01066513, + "balance_loss_clip": 1.05287015, + "balance_loss_mlp": 1.04075146, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.850743941649476, + "language_loss": 0.79502463, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81755608, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 2.492222785949707 + }, + { + "auxiliary_loss_clip": 0.01053695, + "auxiliary_loss_mlp": 0.01009304, + "balance_loss_clip": 1.02314734, + "balance_loss_mlp": 1.00506008, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7761287893049952, + "language_loss": 0.58050919, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60113919, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.2468597888946533 + }, + { + "auxiliary_loss_clip": 0.01139932, + "auxiliary_loss_mlp": 0.01063791, + "balance_loss_clip": 1.05384421, + "balance_loss_mlp": 1.03884053, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 2.5637106105703746, + "language_loss": 0.90031105, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92234838, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.655123710632324 + }, + { + "auxiliary_loss_clip": 0.01150081, + "auxiliary_loss_mlp": 0.01058998, + "balance_loss_clip": 1.05253148, + "balance_loss_mlp": 1.03395224, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 1.825928853929046, + "language_loss": 0.8124969, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83458769, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.5581352710723877 + }, + { + "auxiliary_loss_clip": 0.01180207, + "auxiliary_loss_mlp": 0.01061619, + "balance_loss_clip": 1.05430722, + "balance_loss_mlp": 1.03716946, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.949426017369523, + "language_loss": 0.76141465, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78383291, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 2.5932865142822266 + }, + { + "auxiliary_loss_clip": 0.01142379, + "auxiliary_loss_mlp": 0.01069724, + "balance_loss_clip": 1.04775739, + "balance_loss_mlp": 1.04383183, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 2.1873416368647334, + "language_loss": 0.8514266, + "learning_rate": 3.978022291272044e-06, + "loss": 0.87354761, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.5780677795410156 + }, + { + "auxiliary_loss_clip": 0.01196653, + "auxiliary_loss_mlp": 0.01065925, + "balance_loss_clip": 1.05978048, + "balance_loss_mlp": 1.04202354, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 1.7620706293799606, + "language_loss": 0.82729232, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84991813, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.5547633171081543 + }, + { + "auxiliary_loss_clip": 0.01189068, + "auxiliary_loss_mlp": 0.01060301, + "balance_loss_clip": 1.0539937, + "balance_loss_mlp": 1.03515983, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 4.478222980767158, + "language_loss": 0.82382917, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84632289, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.527971029281616 + }, + { + "auxiliary_loss_clip": 0.01147908, + "auxiliary_loss_mlp": 0.01061161, + "balance_loss_clip": 1.05521083, + "balance_loss_mlp": 1.03636539, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.3262583848749507, + "language_loss": 0.76391172, + "learning_rate": 3.977849218567442e-06, + "loss": 0.7860024, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.7131223678588867 + }, + { + "auxiliary_loss_clip": 0.01164766, + "auxiliary_loss_mlp": 0.01062427, + "balance_loss_clip": 1.05356002, + "balance_loss_mlp": 1.0372498, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.7954939094691897, + "language_loss": 0.81074989, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83302176, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.657432794570923 + }, + { + "auxiliary_loss_clip": 0.01121882, + "auxiliary_loss_mlp": 0.01069146, + "balance_loss_clip": 1.04628623, + "balance_loss_mlp": 1.04134655, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.7475817690101474, + "language_loss": 0.65531534, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67722559, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 2.711710214614868 + }, + { + "auxiliary_loss_clip": 0.01149294, + "auxiliary_loss_mlp": 0.01062684, + "balance_loss_clip": 1.05206358, + "balance_loss_mlp": 1.03713775, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 1.988691880132706, + "language_loss": 0.79581141, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81793118, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 2.672640562057495 + }, + { + "auxiliary_loss_clip": 0.01165966, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.05201173, + "balance_loss_mlp": 1.02821767, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 3.6674060806129574, + "language_loss": 0.72713995, + "learning_rate": 3.977617404968205e-06, + "loss": 0.7493214, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 2.61460280418396 + }, + { + "auxiliary_loss_clip": 0.01176816, + "auxiliary_loss_mlp": 0.01059265, + "balance_loss_clip": 1.05360603, + "balance_loss_mlp": 1.03432608, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.156162140502577, + "language_loss": 0.82043487, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84279573, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 2.5321462154388428 + }, + { + "auxiliary_loss_clip": 0.01180153, + "auxiliary_loss_mlp": 0.01064925, + "balance_loss_clip": 1.05486727, + "balance_loss_mlp": 1.03919959, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 2.2456933967186523, + "language_loss": 0.88774467, + "learning_rate": 3.977501048211088e-06, + "loss": 0.91019547, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 2.545961380004883 + }, + { + "auxiliary_loss_clip": 0.01178776, + "auxiliary_loss_mlp": 0.01066375, + "balance_loss_clip": 1.05615497, + "balance_loss_mlp": 1.04166293, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 1.975571729537351, + "language_loss": 0.70920086, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73165238, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 2.56514048576355 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01068036, + "balance_loss_clip": 1.05293393, + "balance_loss_mlp": 1.04365802, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.5890990793507191, + "language_loss": 0.82883698, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85095108, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 2.6520426273345947 + }, + { + "auxiliary_loss_clip": 0.01155684, + "auxiliary_loss_mlp": 0.00752474, + "balance_loss_clip": 1.04796851, + "balance_loss_mlp": 1.00055838, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.1222988876563043, + "language_loss": 0.79991311, + "learning_rate": 3.977325950678162e-06, + "loss": 0.81899464, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 2.5758299827575684 + }, + { + "auxiliary_loss_clip": 0.01165289, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_clip": 1.05494356, + "balance_loss_mlp": 1.03861964, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 2.1052274069214234, + "language_loss": 0.81349325, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83577788, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 2.618074417114258 + }, + { + "auxiliary_loss_clip": 0.01165554, + "auxiliary_loss_mlp": 0.01072538, + "balance_loss_clip": 1.05216074, + "balance_loss_mlp": 1.04618061, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.85323378698783, + "language_loss": 0.73197293, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75435388, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 4.2859790325164795 + }, + { + "auxiliary_loss_clip": 0.01189805, + "auxiliary_loss_mlp": 0.01066365, + "balance_loss_clip": 1.05555725, + "balance_loss_mlp": 1.03999603, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.3662845505783587, + "language_loss": 0.79568017, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81824178, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.5284430980682373 + }, + { + "auxiliary_loss_clip": 0.01176738, + "auxiliary_loss_mlp": 0.01060511, + "balance_loss_clip": 1.05411553, + "balance_loss_mlp": 1.03680003, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.274401362108263, + "language_loss": 0.59322298, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61559546, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.6620821952819824 + }, + { + "auxiliary_loss_clip": 0.01182947, + "auxiliary_loss_mlp": 0.01061781, + "balance_loss_clip": 1.05481148, + "balance_loss_mlp": 1.03691363, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.3460302949870893, + "language_loss": 0.74691468, + "learning_rate": 3.977032621878305e-06, + "loss": 0.76936197, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 3.993556499481201 + }, + { + "auxiliary_loss_clip": 0.01140727, + "auxiliary_loss_mlp": 0.01063818, + "balance_loss_clip": 1.0495013, + "balance_loss_mlp": 1.0394758, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 4.474515232806653, + "language_loss": 0.88498032, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90702581, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 4.115755558013916 + }, + { + "auxiliary_loss_clip": 0.01146774, + "auxiliary_loss_mlp": 0.01059054, + "balance_loss_clip": 1.04796481, + "balance_loss_mlp": 1.034127, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.2827612013090435, + "language_loss": 0.8323741, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85443234, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 4.070139408111572 + }, + { + "auxiliary_loss_clip": 0.0117422, + "auxiliary_loss_mlp": 0.01060921, + "balance_loss_clip": 1.05384564, + "balance_loss_mlp": 1.03618479, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 2.420088402953643, + "language_loss": 0.76257348, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78492486, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.565587043762207 + }, + { + "auxiliary_loss_clip": 0.01154347, + "auxiliary_loss_mlp": 0.0106239, + "balance_loss_clip": 1.05000222, + "balance_loss_mlp": 1.0369029, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.83279221162963, + "language_loss": 0.75720119, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77936852, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.5682806968688965 + }, + { + "auxiliary_loss_clip": 0.01188035, + "auxiliary_loss_mlp": 0.01064737, + "balance_loss_clip": 1.0551939, + "balance_loss_mlp": 1.0398581, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.5891116914206727, + "language_loss": 0.84045291, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86298066, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.5264530181884766 + }, + { + "auxiliary_loss_clip": 0.01174558, + "auxiliary_loss_mlp": 0.01065023, + "balance_loss_clip": 1.05281699, + "balance_loss_mlp": 1.03879738, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.264076120164278, + "language_loss": 0.75525212, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77764797, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.5182743072509766 + }, + { + "auxiliary_loss_clip": 0.01153494, + "auxiliary_loss_mlp": 0.01061718, + "balance_loss_clip": 1.04763722, + "balance_loss_mlp": 1.03779316, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.742323774069897, + "language_loss": 0.76365691, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78580904, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 2.7317306995391846 + }, + { + "auxiliary_loss_clip": 0.01191078, + "auxiliary_loss_mlp": 0.01067001, + "balance_loss_clip": 1.05661225, + "balance_loss_mlp": 1.0436362, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.7478858584466885, + "language_loss": 0.84062028, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86320102, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.5470380783081055 + }, + { + "auxiliary_loss_clip": 0.01160762, + "auxiliary_loss_mlp": 0.01058756, + "balance_loss_clip": 1.05111623, + "balance_loss_mlp": 1.03411579, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.615843533929529, + "language_loss": 0.77319503, + "learning_rate": 3.97649990716259e-06, + "loss": 0.79539025, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.598360776901245 + }, + { + "auxiliary_loss_clip": 0.011589, + "auxiliary_loss_mlp": 0.01061869, + "balance_loss_clip": 1.04925382, + "balance_loss_mlp": 1.03772879, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6649897527835884, + "language_loss": 0.84520054, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86740816, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.5964460372924805 + }, + { + "auxiliary_loss_clip": 0.01186603, + "auxiliary_loss_mlp": 0.01055646, + "balance_loss_clip": 1.0518198, + "balance_loss_mlp": 1.03245938, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 1.9927715521802223, + "language_loss": 0.85492051, + "learning_rate": 3.976380701617068e-06, + "loss": 0.87734306, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 2.56862211227417 + }, + { + "auxiliary_loss_clip": 0.01185586, + "auxiliary_loss_mlp": 0.01050938, + "balance_loss_clip": 1.05248344, + "balance_loss_mlp": 1.02789438, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.9707767464687576, + "language_loss": 0.85374224, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87610745, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 2.565200090408325 + }, + { + "auxiliary_loss_clip": 0.01155646, + "auxiliary_loss_mlp": 0.01059751, + "balance_loss_clip": 1.05333996, + "balance_loss_mlp": 1.03421664, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.8749162514452973, + "language_loss": 0.90953082, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93168485, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 2.617156982421875 + }, + { + "auxiliary_loss_clip": 0.01051588, + "auxiliary_loss_mlp": 0.0100512, + "balance_loss_clip": 1.01718509, + "balance_loss_mlp": 1.00023293, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.8811422831890537, + "language_loss": 0.65063655, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67120361, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.256359815597534 + }, + { + "auxiliary_loss_clip": 0.01173201, + "auxiliary_loss_mlp": 0.01057634, + "balance_loss_clip": 1.0527873, + "balance_loss_mlp": 1.03449559, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.6555463937682529, + "language_loss": 0.87684053, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89914888, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.667637348175049 + }, + { + "auxiliary_loss_clip": 0.01092476, + "auxiliary_loss_mlp": 0.01077301, + "balance_loss_clip": 1.04319978, + "balance_loss_mlp": 1.04987073, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.327323320852558, + "language_loss": 0.84880733, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87050509, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 2.760756492614746 + }, + { + "auxiliary_loss_clip": 0.01134633, + "auxiliary_loss_mlp": 0.01066197, + "balance_loss_clip": 1.0493027, + "balance_loss_mlp": 1.04197407, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.0598347417059855, + "language_loss": 0.79420221, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81621051, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.8902359008789062 + }, + { + "auxiliary_loss_clip": 0.0114009, + "auxiliary_loss_mlp": 0.01060286, + "balance_loss_clip": 1.05238271, + "balance_loss_mlp": 1.03557372, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.35557639326236, + "language_loss": 0.88255572, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90455949, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 2.728447914123535 + }, + { + "auxiliary_loss_clip": 0.01189445, + "auxiliary_loss_mlp": 0.01066237, + "balance_loss_clip": 1.05598629, + "balance_loss_mlp": 1.04164457, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.7107927712665556, + "language_loss": 0.96154153, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98409832, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.533402442932129 + }, + { + "auxiliary_loss_clip": 0.01151005, + "auxiliary_loss_mlp": 0.01056822, + "balance_loss_clip": 1.04806137, + "balance_loss_mlp": 1.03374302, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.119079363538691, + "language_loss": 0.76305449, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78513271, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.6897077560424805 + }, + { + "auxiliary_loss_clip": 0.01131086, + "auxiliary_loss_mlp": 0.00752584, + "balance_loss_clip": 1.05265558, + "balance_loss_mlp": 1.00068402, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 2.2323145468823373, + "language_loss": 0.80374491, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82258165, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 2.776449680328369 + }, + { + "auxiliary_loss_clip": 0.0114662, + "auxiliary_loss_mlp": 0.01060683, + "balance_loss_clip": 1.05273771, + "balance_loss_mlp": 1.03617311, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 1.8921761375885429, + "language_loss": 0.86289209, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88496518, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.770564317703247 + }, + { + "auxiliary_loss_clip": 0.01186202, + "auxiliary_loss_mlp": 0.01053465, + "balance_loss_clip": 1.05401373, + "balance_loss_mlp": 1.02934933, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.8945773096838294, + "language_loss": 0.71979707, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74219376, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.744586229324341 + }, + { + "auxiliary_loss_clip": 0.01177878, + "auxiliary_loss_mlp": 0.01071592, + "balance_loss_clip": 1.05553985, + "balance_loss_mlp": 1.04785728, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5915506069188028, + "language_loss": 0.7106905, + "learning_rate": 3.97559855928952e-06, + "loss": 0.73318529, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.775195360183716 + }, + { + "auxiliary_loss_clip": 0.01144134, + "auxiliary_loss_mlp": 0.00752517, + "balance_loss_clip": 1.05185008, + "balance_loss_mlp": 1.00064111, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.3058149190996047, + "language_loss": 0.82185018, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84081674, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.667354106903076 + }, + { + "auxiliary_loss_clip": 0.01175026, + "auxiliary_loss_mlp": 0.01068996, + "balance_loss_clip": 1.0566746, + "balance_loss_mlp": 1.04443872, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 2.063250544223281, + "language_loss": 0.75047922, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77291948, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.584181070327759 + }, + { + "auxiliary_loss_clip": 0.01192418, + "auxiliary_loss_mlp": 0.01063662, + "balance_loss_clip": 1.06028688, + "balance_loss_mlp": 1.03853261, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.6977347858012843, + "language_loss": 0.76253086, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78509164, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.579174518585205 + }, + { + "auxiliary_loss_clip": 0.01123518, + "auxiliary_loss_mlp": 0.0106277, + "balance_loss_clip": 1.04972315, + "balance_loss_mlp": 1.0383321, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.7641063506933738, + "language_loss": 0.85100627, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87286913, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 2.7290120124816895 + }, + { + "auxiliary_loss_clip": 0.01179485, + "auxiliary_loss_mlp": 0.01046863, + "balance_loss_clip": 1.05852365, + "balance_loss_mlp": 1.02408206, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 2.3581326927707953, + "language_loss": 0.90626466, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92852807, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 2.604092597961426 + }, + { + "auxiliary_loss_clip": 0.01127413, + "auxiliary_loss_mlp": 0.01058107, + "balance_loss_clip": 1.04862547, + "balance_loss_mlp": 1.03344226, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.7811066759313605, + "language_loss": 0.83519834, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85705352, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.673567295074463 + }, + { + "auxiliary_loss_clip": 0.0112877, + "auxiliary_loss_mlp": 0.01061723, + "balance_loss_clip": 1.04721355, + "balance_loss_mlp": 1.03815579, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.500996421816164, + "language_loss": 0.77464449, + "learning_rate": 3.975172161365958e-06, + "loss": 0.79654944, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 2.7152822017669678 + }, + { + "auxiliary_loss_clip": 0.0117813, + "auxiliary_loss_mlp": 0.01068712, + "balance_loss_clip": 1.05242014, + "balance_loss_mlp": 1.04200888, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 2.0429953427862273, + "language_loss": 0.80488372, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82735211, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 2.573932647705078 + }, + { + "auxiliary_loss_clip": 0.01154278, + "auxiliary_loss_mlp": 0.00752355, + "balance_loss_clip": 1.05242109, + "balance_loss_mlp": 1.00070405, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.8448173684996456, + "language_loss": 0.72976238, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.74882865, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 2.7011818885803223 + }, + { + "auxiliary_loss_clip": 0.01169159, + "auxiliary_loss_mlp": 0.01082619, + "balance_loss_clip": 1.05517411, + "balance_loss_mlp": 1.05771649, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.8557443397113702, + "language_loss": 0.85831153, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88082927, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 2.5421202182769775 + }, + { + "auxiliary_loss_clip": 0.01158367, + "auxiliary_loss_mlp": 0.01062698, + "balance_loss_clip": 1.05269468, + "balance_loss_mlp": 1.03942823, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.9177426875702395, + "language_loss": 0.8224299, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84464055, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 2.6354479789733887 + }, + { + "auxiliary_loss_clip": 0.01161997, + "auxiliary_loss_mlp": 0.00752575, + "balance_loss_clip": 1.05227399, + "balance_loss_mlp": 1.00068188, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 2.3995570449067993, + "language_loss": 0.73441786, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75356358, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 2.5289065837860107 + }, + { + "auxiliary_loss_clip": 0.01130151, + "auxiliary_loss_mlp": 0.00752471, + "balance_loss_clip": 1.04555106, + "balance_loss_mlp": 1.00063217, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.5133464147180833, + "language_loss": 0.79715759, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81598377, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 4.286882162094116 + }, + { + "auxiliary_loss_clip": 0.01169507, + "auxiliary_loss_mlp": 0.01061896, + "balance_loss_clip": 1.0499773, + "balance_loss_mlp": 1.03662324, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.8518826937383488, + "language_loss": 0.7380892, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76040322, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 2.6000142097473145 + }, + { + "auxiliary_loss_clip": 0.01140187, + "auxiliary_loss_mlp": 0.01064865, + "balance_loss_clip": 1.04778302, + "balance_loss_mlp": 1.03967619, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.7933324583446253, + "language_loss": 0.66566098, + "learning_rate": 3.974680355576927e-06, + "loss": 0.68771148, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.6303131580352783 + }, + { + "auxiliary_loss_clip": 0.01156334, + "auxiliary_loss_mlp": 0.01066503, + "balance_loss_clip": 1.05312443, + "balance_loss_mlp": 1.04057527, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.707477645712578, + "language_loss": 0.73239684, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75462514, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.677720069885254 + }, + { + "auxiliary_loss_clip": 0.01118492, + "auxiliary_loss_mlp": 0.01058755, + "balance_loss_clip": 1.04626513, + "balance_loss_mlp": 1.03547382, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.7192756012495023, + "language_loss": 0.90530992, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92708242, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 4.2497687339782715 + }, + { + "auxiliary_loss_clip": 0.01156021, + "auxiliary_loss_mlp": 0.01054384, + "balance_loss_clip": 1.04892862, + "balance_loss_mlp": 1.03058946, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.273871902429028, + "language_loss": 0.80161238, + "learning_rate": 3.974494692820539e-06, + "loss": 0.8237164, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 5.637251853942871 + }, + { + "auxiliary_loss_clip": 0.01161534, + "auxiliary_loss_mlp": 0.01057031, + "balance_loss_clip": 1.05264187, + "balance_loss_mlp": 1.03420258, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 1.952647150759509, + "language_loss": 0.69470334, + "learning_rate": 3.974432655485872e-06, + "loss": 0.7168889, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.588764190673828 + }, + { + "auxiliary_loss_clip": 0.01169106, + "auxiliary_loss_mlp": 0.0105777, + "balance_loss_clip": 1.05303025, + "balance_loss_mlp": 1.034024, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 2.017885512605034, + "language_loss": 0.83963436, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.86190307, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.6158251762390137 + }, + { + "auxiliary_loss_clip": 0.01184915, + "auxiliary_loss_mlp": 0.01058308, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.03379869, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 3.2695344878488117, + "language_loss": 0.90417653, + "learning_rate": 3.974308356206838e-06, + "loss": 0.9266088, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.626020669937134 + }, + { + "auxiliary_loss_clip": 0.01141574, + "auxiliary_loss_mlp": 0.01060108, + "balance_loss_clip": 1.05253482, + "balance_loss_mlp": 1.03556228, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.938323394934383, + "language_loss": 0.82646483, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84848166, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.710608959197998 + }, + { + "auxiliary_loss_clip": 0.01161287, + "auxiliary_loss_mlp": 0.01051482, + "balance_loss_clip": 1.05163908, + "balance_loss_mlp": 1.02662671, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.2171893334161927, + "language_loss": 0.79378098, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81590867, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.6676883697509766 + }, + { + "auxiliary_loss_clip": 0.01092911, + "auxiliary_loss_mlp": 0.00752649, + "balance_loss_clip": 1.04146004, + "balance_loss_mlp": 1.00052333, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.1555509728386406, + "language_loss": 0.88324958, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90170515, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.7119979858398438 + }, + { + "auxiliary_loss_clip": 0.01179362, + "auxiliary_loss_mlp": 0.01057882, + "balance_loss_clip": 1.05107594, + "balance_loss_mlp": 1.03356361, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 1.9751114132242424, + "language_loss": 0.8353647, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85773718, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.5690765380859375 + }, + { + "auxiliary_loss_clip": 0.0118627, + "auxiliary_loss_mlp": 0.0105471, + "balance_loss_clip": 1.05395162, + "balance_loss_mlp": 1.02966404, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 3.070381477507999, + "language_loss": 0.7848205, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80723023, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 2.535754919052124 + }, + { + "auxiliary_loss_clip": 0.01180224, + "auxiliary_loss_mlp": 0.01054928, + "balance_loss_clip": 1.05574203, + "balance_loss_mlp": 1.03025198, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.1342893785288077, + "language_loss": 0.74185669, + "learning_rate": 3.973933661662101e-06, + "loss": 0.7642082, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 2.5574393272399902 + }, + { + "auxiliary_loss_clip": 0.01153504, + "auxiliary_loss_mlp": 0.01064042, + "balance_loss_clip": 1.05124784, + "balance_loss_mlp": 1.04011631, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.6724228029437804, + "language_loss": 0.81151497, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83369052, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 2.6250534057617188 + }, + { + "auxiliary_loss_clip": 0.01183687, + "auxiliary_loss_mlp": 0.00752308, + "balance_loss_clip": 1.05198991, + "balance_loss_mlp": 1.0004952, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 3.883571090065731, + "language_loss": 0.89036834, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.90972829, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.5245895385742188 + }, + { + "auxiliary_loss_clip": 0.01176097, + "auxiliary_loss_mlp": 0.00752421, + "balance_loss_clip": 1.04998028, + "balance_loss_mlp": 1.00052273, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 1.7794594613107462, + "language_loss": 0.73238987, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75167507, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.7955965995788574 + }, + { + "auxiliary_loss_clip": 0.01158637, + "auxiliary_loss_mlp": 0.01054303, + "balance_loss_clip": 1.05040693, + "balance_loss_mlp": 1.03072393, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.8274976752214507, + "language_loss": 0.82613087, + "learning_rate": 3.973682368232138e-06, + "loss": 0.84826028, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 2.6122965812683105 + }, + { + "auxiliary_loss_clip": 0.01135503, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.0474596, + "balance_loss_mlp": 1.03542626, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.657923858659588, + "language_loss": 0.74707627, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76901925, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.692624807357788 + }, + { + "auxiliary_loss_clip": 0.01158122, + "auxiliary_loss_mlp": 0.01065472, + "balance_loss_clip": 1.05510616, + "balance_loss_mlp": 1.04184425, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 1.8513491630116747, + "language_loss": 0.80137444, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82361042, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 2.6557395458221436 + }, + { + "auxiliary_loss_clip": 0.01044675, + "auxiliary_loss_mlp": 0.0100555, + "balance_loss_clip": 1.02087379, + "balance_loss_mlp": 1.00020909, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7398873682809782, + "language_loss": 0.56009531, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58059752, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.2698168754577637 + }, + { + "auxiliary_loss_clip": 0.01157201, + "auxiliary_loss_mlp": 0.01061927, + "balance_loss_clip": 1.0491761, + "balance_loss_mlp": 1.03892004, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.3182790117590555, + "language_loss": 0.67670918, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.69890052, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.6607916355133057 + }, + { + "auxiliary_loss_clip": 0.01147496, + "auxiliary_loss_mlp": 0.01077741, + "balance_loss_clip": 1.04964006, + "balance_loss_mlp": 1.05307615, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.6821163507280206, + "language_loss": 0.86711669, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88936913, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 2.7176201343536377 + }, + { + "auxiliary_loss_clip": 0.01111138, + "auxiliary_loss_mlp": 0.0107472, + "balance_loss_clip": 1.04046512, + "balance_loss_mlp": 1.04764795, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.159311660911319, + "language_loss": 0.87091815, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89277673, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.6881895065307617 + }, + { + "auxiliary_loss_clip": 0.01169452, + "auxiliary_loss_mlp": 0.01055587, + "balance_loss_clip": 1.05148411, + "balance_loss_mlp": 1.03317618, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 1.9315013979204663, + "language_loss": 0.8903141, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91256452, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.57633113861084 + }, + { + "auxiliary_loss_clip": 0.01058583, + "auxiliary_loss_mlp": 0.0100437, + "balance_loss_clip": 1.01672363, + "balance_loss_mlp": 0.99993581, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8856608795913387, + "language_loss": 0.64848733, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66911685, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 3.0619685649871826 + }, + { + "auxiliary_loss_clip": 0.01178264, + "auxiliary_loss_mlp": 0.01059209, + "balance_loss_clip": 1.05140126, + "balance_loss_mlp": 1.03244686, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.6882424640823346, + "language_loss": 0.89841163, + "learning_rate": 3.973112579977733e-06, + "loss": 0.92078644, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.654998779296875 + }, + { + "auxiliary_loss_clip": 0.01153736, + "auxiliary_loss_mlp": 0.01059835, + "balance_loss_clip": 1.05148292, + "balance_loss_mlp": 1.03444314, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.533591967705759, + "language_loss": 0.76460236, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78673804, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.6112728118896484 + }, + { + "auxiliary_loss_clip": 0.01044341, + "auxiliary_loss_mlp": 0.01004265, + "balance_loss_clip": 1.01545703, + "balance_loss_mlp": 0.99963981, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.7980795954821738, + "language_loss": 0.57419437, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59468043, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 3.057559013366699 + }, + { + "auxiliary_loss_clip": 0.01132511, + "auxiliary_loss_mlp": 0.01062137, + "balance_loss_clip": 1.04796398, + "balance_loss_mlp": 1.03746128, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.421548534083173, + "language_loss": 0.86470175, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88664818, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 2.6307027339935303 + }, + { + "auxiliary_loss_clip": 0.01180898, + "auxiliary_loss_mlp": 0.01055302, + "balance_loss_clip": 1.05258048, + "balance_loss_mlp": 1.03212786, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.6932252805800814, + "language_loss": 0.87728697, + "learning_rate": 3.972857395313042e-06, + "loss": 0.89964896, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 2.5738604068756104 + }, + { + "auxiliary_loss_clip": 0.01170404, + "auxiliary_loss_mlp": 0.01054059, + "balance_loss_clip": 1.05254757, + "balance_loss_mlp": 1.03052759, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6371616155835413, + "language_loss": 0.92931449, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95155907, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 2.695894241333008 + }, + { + "auxiliary_loss_clip": 0.0116701, + "auxiliary_loss_mlp": 0.01062507, + "balance_loss_clip": 1.05253005, + "balance_loss_mlp": 1.03736591, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.856856229860684, + "language_loss": 0.89512551, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91742063, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 2.606886625289917 + }, + { + "auxiliary_loss_clip": 0.0112291, + "auxiliary_loss_mlp": 0.01059202, + "balance_loss_clip": 1.05485976, + "balance_loss_mlp": 1.03638554, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 1.7517759077585398, + "language_loss": 0.76514649, + "learning_rate": 3.97266522129109e-06, + "loss": 0.78696764, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 2.726560592651367 + }, + { + "auxiliary_loss_clip": 0.01180946, + "auxiliary_loss_mlp": 0.01061391, + "balance_loss_clip": 1.05130434, + "balance_loss_mlp": 1.03696465, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 1.9068002948562215, + "language_loss": 0.88482738, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90725076, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 2.534353017807007 + }, + { + "auxiliary_loss_clip": 0.01141984, + "auxiliary_loss_mlp": 0.00752296, + "balance_loss_clip": 1.04844844, + "balance_loss_mlp": 1.00041926, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.0646061726945635, + "language_loss": 0.82349873, + "learning_rate": 3.972536731254092e-06, + "loss": 0.8424415, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 2.64471173286438 + }, + { + "auxiliary_loss_clip": 0.01179888, + "auxiliary_loss_mlp": 0.01058113, + "balance_loss_clip": 1.0500319, + "balance_loss_mlp": 1.03344893, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.8809777979817874, + "language_loss": 0.753847, + "learning_rate": 3.972472374036189e-06, + "loss": 0.776227, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 2.5707833766937256 + }, + { + "auxiliary_loss_clip": 0.01173566, + "auxiliary_loss_mlp": 0.00752439, + "balance_loss_clip": 1.05362296, + "balance_loss_mlp": 1.00043631, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 2.0247158829421514, + "language_loss": 0.82910633, + "learning_rate": 3.972407942021935e-06, + "loss": 0.84836644, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.6451871395111084 + }, + { + "auxiliary_loss_clip": 0.01056031, + "auxiliary_loss_mlp": 0.01023679, + "balance_loss_clip": 1.01457477, + "balance_loss_mlp": 1.01922023, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8524893437212557, + "language_loss": 0.59693557, + "learning_rate": 3.972343435213775e-06, + "loss": 0.6177327, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.110713481903076 + }, + { + "auxiliary_loss_clip": 0.01132692, + "auxiliary_loss_mlp": 0.01058332, + "balance_loss_clip": 1.04711056, + "balance_loss_mlp": 1.03567052, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7731621966626576, + "language_loss": 0.82744074, + "learning_rate": 3.972278853614154e-06, + "loss": 0.84935099, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 4.139779090881348 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01060084, + "balance_loss_clip": 1.04941571, + "balance_loss_mlp": 1.03449011, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 2.59562850028633, + "language_loss": 0.70988041, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73213041, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.5190041065216064 + }, + { + "auxiliary_loss_clip": 0.01179143, + "auxiliary_loss_mlp": 0.01057062, + "balance_loss_clip": 1.05716801, + "balance_loss_mlp": 1.0320636, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 1.8589823618461159, + "language_loss": 0.70691544, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72927749, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.6144418716430664 + }, + { + "auxiliary_loss_clip": 0.01170964, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_clip": 1.05382204, + "balance_loss_mlp": 1.03013849, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.326445094505703, + "language_loss": 0.8419795, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86422431, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 4.053532838821411 + }, + { + "auxiliary_loss_clip": 0.01160825, + "auxiliary_loss_mlp": 0.01057533, + "balance_loss_clip": 1.05214381, + "balance_loss_mlp": 1.03133035, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9461591697268552, + "language_loss": 1.02348876, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04567242, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 4.156485319137573 + }, + { + "auxiliary_loss_clip": 0.01109929, + "auxiliary_loss_mlp": 0.01059635, + "balance_loss_clip": 1.04423118, + "balance_loss_mlp": 1.03470874, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 1.9655712374941905, + "language_loss": 0.83564162, + "learning_rate": 3.971954823829951e-06, + "loss": 0.8573373, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 4.231536149978638 + }, + { + "auxiliary_loss_clip": 0.01182135, + "auxiliary_loss_mlp": 0.01064995, + "balance_loss_clip": 1.05157983, + "balance_loss_mlp": 1.04065239, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.0247445541152564, + "language_loss": 0.72099715, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74346846, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 2.509078025817871 + }, + { + "auxiliary_loss_clip": 0.01150498, + "auxiliary_loss_mlp": 0.01062609, + "balance_loss_clip": 1.04535913, + "balance_loss_mlp": 1.03671634, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 2.0286499447754656, + "language_loss": 0.7661593, + "learning_rate": 3.971824688461976e-06, + "loss": 0.78829038, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.6133005619049072 + }, + { + "auxiliary_loss_clip": 0.01181713, + "auxiliary_loss_mlp": 0.01051974, + "balance_loss_clip": 1.05422068, + "balance_loss_mlp": 1.02841818, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.280459332921848, + "language_loss": 0.72622764, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74856448, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 2.615299701690674 + }, + { + "auxiliary_loss_clip": 0.01183633, + "auxiliary_loss_mlp": 0.01058408, + "balance_loss_clip": 1.05548692, + "balance_loss_mlp": 1.0322777, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 1.9732904974763616, + "language_loss": 0.77230424, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79472464, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.6821577548980713 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01066266, + "balance_loss_clip": 1.04425967, + "balance_loss_mlp": 1.03945613, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 1.728875805140103, + "language_loss": 0.8211112, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84287858, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.796828031539917 + }, + { + "auxiliary_loss_clip": 0.01175679, + "auxiliary_loss_mlp": 0.01063117, + "balance_loss_clip": 1.05702615, + "balance_loss_mlp": 1.03898895, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.1183305365653493, + "language_loss": 0.82127917, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84366715, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.654836654663086 + }, + { + "auxiliary_loss_clip": 0.01132617, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.04466772, + "balance_loss_mlp": 1.04292822, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 1.6889244721346512, + "language_loss": 0.81591558, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83791053, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 2.5907421112060547 + }, + { + "auxiliary_loss_clip": 0.01186923, + "auxiliary_loss_mlp": 0.01065141, + "balance_loss_clip": 1.05392087, + "balance_loss_mlp": 1.04047656, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 3.963017742043737, + "language_loss": 0.83480978, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85733038, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 2.454225540161133 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.05144835, + "balance_loss_mlp": 1.02667332, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7259838590784113, + "language_loss": 0.81519353, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83709377, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.7041993141174316 + }, + { + "auxiliary_loss_clip": 0.01116291, + "auxiliary_loss_mlp": 0.00752379, + "balance_loss_clip": 1.04618073, + "balance_loss_mlp": 1.00040627, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.167160636056162, + "language_loss": 0.74794596, + "learning_rate": 3.971301156316582e-06, + "loss": 0.76663262, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 2.712146520614624 + }, + { + "auxiliary_loss_clip": 0.0113748, + "auxiliary_loss_mlp": 0.01063587, + "balance_loss_clip": 1.05305147, + "balance_loss_mlp": 1.03832662, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6299542799922064, + "language_loss": 0.74640012, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76841074, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.6966614723205566 + }, + { + "auxiliary_loss_clip": 0.01077409, + "auxiliary_loss_mlp": 0.01064474, + "balance_loss_clip": 1.04238033, + "balance_loss_mlp": 1.03810525, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.02992668086831, + "language_loss": 0.70803237, + "learning_rate": 3.971169525711122e-06, + "loss": 0.72945118, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 2.919870376586914 + }, + { + "auxiliary_loss_clip": 0.01137788, + "auxiliary_loss_mlp": 0.01054388, + "balance_loss_clip": 1.04567981, + "balance_loss_mlp": 1.0291512, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.3226435589207695, + "language_loss": 0.87964147, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90156317, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 2.807803153991699 + }, + { + "auxiliary_loss_clip": 0.01131039, + "auxiliary_loss_mlp": 0.01060792, + "balance_loss_clip": 1.04651558, + "balance_loss_mlp": 1.03588903, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.8852737181932746, + "language_loss": 0.82376844, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84568679, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 2.6843016147613525 + }, + { + "auxiliary_loss_clip": 0.0102811, + "auxiliary_loss_mlp": 0.01009734, + "balance_loss_clip": 1.02512741, + "balance_loss_mlp": 1.00432169, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8246352211402224, + "language_loss": 0.60628545, + "learning_rate": 3.970971519207095e-06, + "loss": 0.6266638, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.2099709510803223 + }, + { + "auxiliary_loss_clip": 0.01046593, + "auxiliary_loss_mlp": 0.01023434, + "balance_loss_clip": 1.01680148, + "balance_loss_mlp": 1.01985824, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9099217862423317, + "language_loss": 0.62219155, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64289176, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.0903382301330566 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01071342, + "balance_loss_clip": 1.05350459, + "balance_loss_mlp": 1.04645109, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.7735197913286915, + "language_loss": 0.82657033, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84869343, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 2.755949020385742 + }, + { + "auxiliary_loss_clip": 0.01159694, + "auxiliary_loss_mlp": 0.01057584, + "balance_loss_clip": 1.05330241, + "balance_loss_mlp": 1.03210855, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.7961647656882556, + "language_loss": 0.85199648, + "learning_rate": 3.970772840048147e-06, + "loss": 0.87416923, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.6892292499542236 + }, + { + "auxiliary_loss_clip": 0.01170534, + "auxiliary_loss_mlp": 0.01063916, + "balance_loss_clip": 1.05183697, + "balance_loss_mlp": 1.03884649, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 1.980706466627614, + "language_loss": 0.87718731, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89953184, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.635087251663208 + }, + { + "auxiliary_loss_clip": 0.01137967, + "auxiliary_loss_mlp": 0.01058921, + "balance_loss_clip": 1.04861677, + "balance_loss_mlp": 1.03513861, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 2.119612359905843, + "language_loss": 0.78462172, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80659062, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.8511531352996826 + }, + { + "auxiliary_loss_clip": 0.01168562, + "auxiliary_loss_mlp": 0.01058695, + "balance_loss_clip": 1.05525756, + "balance_loss_mlp": 1.03325605, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.9508064509269643, + "language_loss": 0.85777712, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88004971, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.5820884704589844 + }, + { + "auxiliary_loss_clip": 0.01182751, + "auxiliary_loss_mlp": 0.00752479, + "balance_loss_clip": 1.05697203, + "balance_loss_mlp": 1.0005157, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 2.7602812532708665, + "language_loss": 0.88577652, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90512884, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 2.5272932052612305 + }, + { + "auxiliary_loss_clip": 0.01139923, + "auxiliary_loss_mlp": 0.01055075, + "balance_loss_clip": 1.04884052, + "balance_loss_mlp": 1.03185296, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.442953486546965, + "language_loss": 0.77270514, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79465508, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 2.632375478744507 + }, + { + "auxiliary_loss_clip": 0.01173978, + "auxiliary_loss_mlp": 0.01059369, + "balance_loss_clip": 1.05252719, + "balance_loss_mlp": 1.03439438, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 1.846572952192679, + "language_loss": 0.82636857, + "learning_rate": 3.97037346403694e-06, + "loss": 0.84870207, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.01129249, + "auxiliary_loss_mlp": 0.01058021, + "balance_loss_clip": 1.04761457, + "balance_loss_mlp": 1.02960181, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.47572696548019, + "language_loss": 0.84972727, + "learning_rate": 3.970306639845e-06, + "loss": 0.87159997, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 2.670611619949341 + }, + { + "auxiliary_loss_clip": 0.01141171, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_clip": 1.05119133, + "balance_loss_mlp": 1.03456235, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 1.7939891673307875, + "language_loss": 0.68530309, + "learning_rate": 3.970239740938835e-06, + "loss": 0.70731425, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 2.664289951324463 + }, + { + "auxiliary_loss_clip": 0.01155729, + "auxiliary_loss_mlp": 0.0105943, + "balance_loss_clip": 1.04797912, + "balance_loss_mlp": 1.03430057, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.687938678782885, + "language_loss": 0.82032925, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84248078, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 2.5994250774383545 + }, + { + "auxiliary_loss_clip": 0.01162803, + "auxiliary_loss_mlp": 0.01074382, + "balance_loss_clip": 1.05231035, + "balance_loss_mlp": 1.04722619, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 1.9916755015665693, + "language_loss": 0.77142328, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79379511, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.702211618423462 + }, + { + "auxiliary_loss_clip": 0.01121104, + "auxiliary_loss_mlp": 0.01065031, + "balance_loss_clip": 1.05066466, + "balance_loss_mlp": 1.03823245, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.8702378416813246, + "language_loss": 0.79695559, + "learning_rate": 3.970038595960369e-06, + "loss": 0.8188169, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 2.7058603763580322 + }, + { + "auxiliary_loss_clip": 0.0115947, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.05180669, + "balance_loss_mlp": 1.03737402, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 2.1820155766544107, + "language_loss": 0.87396884, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89618671, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 2.6101884841918945 + }, + { + "auxiliary_loss_clip": 0.01142709, + "auxiliary_loss_mlp": 0.0106852, + "balance_loss_clip": 1.04707682, + "balance_loss_mlp": 1.04092276, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.761815913517505, + "language_loss": 0.86720312, + "learning_rate": 3.969904125783517e-06, + "loss": 0.88931537, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 2.6381685733795166 + }, + { + "auxiliary_loss_clip": 0.01140376, + "auxiliary_loss_mlp": 0.01078609, + "balance_loss_clip": 1.0497973, + "balance_loss_mlp": 1.05380106, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 2.5714599517101937, + "language_loss": 0.87765753, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89984739, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.743748426437378 + }, + { + "auxiliary_loss_clip": 0.01170069, + "auxiliary_loss_mlp": 0.01064353, + "balance_loss_clip": 1.0510664, + "balance_loss_mlp": 1.03890157, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.395257127206589, + "language_loss": 0.80156237, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82390654, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.630317449569702 + }, + { + "auxiliary_loss_clip": 0.01184296, + "auxiliary_loss_mlp": 0.01060372, + "balance_loss_clip": 1.05638468, + "balance_loss_mlp": 1.03623223, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 2.3509610769896443, + "language_loss": 0.85190427, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87435091, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.6320559978485107 + }, + { + "auxiliary_loss_clip": 0.01135124, + "auxiliary_loss_mlp": 0.01059944, + "balance_loss_clip": 1.05398977, + "balance_loss_mlp": 1.03457654, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 2.036974010903184, + "language_loss": 0.82789171, + "learning_rate": 3.969634289062719e-06, + "loss": 0.84984243, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 4.2703351974487305 + }, + { + "auxiliary_loss_clip": 0.01172919, + "auxiliary_loss_mlp": 0.00752463, + "balance_loss_clip": 1.05393147, + "balance_loss_mlp": 1.00043845, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 11.284378743099085, + "language_loss": 0.83225918, + "learning_rate": 3.969566643154293e-06, + "loss": 0.85151303, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.641488790512085 + }, + { + "auxiliary_loss_clip": 0.01170993, + "auxiliary_loss_mlp": 0.01060836, + "balance_loss_clip": 1.05544257, + "balance_loss_mlp": 1.03381121, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 1.919429160462299, + "language_loss": 0.76927722, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79159546, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.6020121574401855 + }, + { + "auxiliary_loss_clip": 0.01145263, + "auxiliary_loss_mlp": 0.01054172, + "balance_loss_clip": 1.05420804, + "balance_loss_mlp": 1.02792263, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.365385355309445, + "language_loss": 0.77626675, + "learning_rate": 3.969431127281516e-06, + "loss": 0.79826117, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 4.0970306396484375 + }, + { + "auxiliary_loss_clip": 0.01178086, + "auxiliary_loss_mlp": 0.01054302, + "balance_loss_clip": 1.05205595, + "balance_loss_mlp": 1.0307461, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 3.997340446340018, + "language_loss": 0.94931126, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97163516, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.5918099880218506 + }, + { + "auxiliary_loss_clip": 0.01155171, + "auxiliary_loss_mlp": 0.01064171, + "balance_loss_clip": 1.04941225, + "balance_loss_mlp": 1.03727746, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 3.182352928218243, + "language_loss": 0.8172549, + "learning_rate": 3.96929531268464e-06, + "loss": 0.83944833, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 5.596367835998535 + }, + { + "auxiliary_loss_clip": 0.0115462, + "auxiliary_loss_mlp": 0.01060926, + "balance_loss_clip": 1.04992402, + "balance_loss_mlp": 1.03576064, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 1.895435973108149, + "language_loss": 0.86650306, + "learning_rate": 3.969227293371099e-06, + "loss": 0.88865852, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.626131534576416 + }, + { + "auxiliary_loss_clip": 0.01182084, + "auxiliary_loss_mlp": 0.01062827, + "balance_loss_clip": 1.05136907, + "balance_loss_mlp": 1.03623176, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.251563873142216, + "language_loss": 0.87292862, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89537776, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.5562736988067627 + }, + { + "auxiliary_loss_clip": 0.0112532, + "auxiliary_loss_mlp": 0.00752406, + "balance_loss_clip": 1.0438621, + "balance_loss_mlp": 1.00042212, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 3.8534413957393783, + "language_loss": 0.89317375, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.91195095, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.8715639114379883 + }, + { + "auxiliary_loss_clip": 0.01153219, + "auxiliary_loss_mlp": 0.01053234, + "balance_loss_clip": 1.04918921, + "balance_loss_mlp": 1.0278182, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.8482713212448276, + "language_loss": 0.80220532, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82426983, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.6471476554870605 + }, + { + "auxiliary_loss_clip": 0.01165807, + "auxiliary_loss_mlp": 0.0107541, + "balance_loss_clip": 1.05392575, + "balance_loss_mlp": 1.04936326, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.09072995597194, + "language_loss": 0.83881956, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86123168, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.6586356163024902 + }, + { + "auxiliary_loss_clip": 0.01165019, + "auxiliary_loss_mlp": 0.01057161, + "balance_loss_clip": 1.04827559, + "balance_loss_mlp": 1.03352165, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.540702899781002, + "language_loss": 0.80250478, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82472658, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 2.648163080215454 + }, + { + "auxiliary_loss_clip": 0.01158422, + "auxiliary_loss_mlp": 0.0106667, + "balance_loss_clip": 1.05293941, + "balance_loss_mlp": 1.04132605, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8746191300247859, + "language_loss": 0.79315805, + "learning_rate": 3.96881760944111e-06, + "loss": 0.815409, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.607267141342163 + }, + { + "auxiliary_loss_clip": 0.01173948, + "auxiliary_loss_mlp": 0.0105658, + "balance_loss_clip": 1.05333591, + "balance_loss_mlp": 1.03176022, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 2.4051896902340433, + "language_loss": 0.9249627, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94726801, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 2.570037841796875 + }, + { + "auxiliary_loss_clip": 0.01051991, + "auxiliary_loss_mlp": 0.01012744, + "balance_loss_clip": 1.02047288, + "balance_loss_mlp": 1.00861931, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8872425683493782, + "language_loss": 0.61776251, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63840985, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 3.19663667678833 + }, + { + "auxiliary_loss_clip": 0.01175089, + "auxiliary_loss_mlp": 0.01062907, + "balance_loss_clip": 1.05217576, + "balance_loss_mlp": 1.03933978, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 1.7728844357656874, + "language_loss": 0.86504495, + "learning_rate": 3.968611759561355e-06, + "loss": 0.88742489, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 2.5368592739105225 + }, + { + "auxiliary_loss_clip": 0.01166894, + "auxiliary_loss_mlp": 0.01055185, + "balance_loss_clip": 1.05072927, + "balance_loss_mlp": 1.02858925, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.0279173943302493, + "language_loss": 0.74312496, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76534581, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 2.548521041870117 + }, + { + "auxiliary_loss_clip": 0.01069256, + "auxiliary_loss_mlp": 0.01006257, + "balance_loss_clip": 1.01566482, + "balance_loss_mlp": 1.00210857, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9058572485444139, + "language_loss": 0.56816161, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58891678, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 3.0131335258483887 + }, + { + "auxiliary_loss_clip": 0.01139217, + "auxiliary_loss_mlp": 0.01066083, + "balance_loss_clip": 1.04599857, + "balance_loss_mlp": 1.04047728, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.9548371260105863, + "language_loss": 0.89296532, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91501832, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 2.5484585762023926 + }, + { + "auxiliary_loss_clip": 0.01153362, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_clip": 1.05099118, + "balance_loss_mlp": 1.0318718, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 1.7431677409114856, + "language_loss": 0.87860048, + "learning_rate": 3.968336247967844e-06, + "loss": 0.9007113, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.57454252243042 + }, + { + "auxiliary_loss_clip": 0.01155408, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_clip": 1.05109644, + "balance_loss_mlp": 1.03697419, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.6863644528261268, + "language_loss": 0.77530599, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79746652, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.6038568019866943 + }, + { + "auxiliary_loss_clip": 0.01169989, + "auxiliary_loss_mlp": 0.01059285, + "balance_loss_clip": 1.05386758, + "balance_loss_mlp": 1.03655183, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.767845370043703, + "language_loss": 0.7076174, + "learning_rate": 3.968198044323587e-06, + "loss": 0.72991008, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 2.9267935752868652 + }, + { + "auxiliary_loss_clip": 0.01160832, + "auxiliary_loss_mlp": 0.01061124, + "balance_loss_clip": 1.05291474, + "balance_loss_mlp": 1.03553009, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 2.2281641837388895, + "language_loss": 0.75250614, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77472574, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 2.668947219848633 + }, + { + "auxiliary_loss_clip": 0.01159884, + "auxiliary_loss_mlp": 0.01051607, + "balance_loss_clip": 1.05340838, + "balance_loss_mlp": 1.02725267, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 3.897901782761132, + "language_loss": 0.81871426, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84082919, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.591020345687866 + }, + { + "auxiliary_loss_clip": 0.01024351, + "auxiliary_loss_mlp": 0.01003587, + "balance_loss_clip": 1.01017106, + "balance_loss_mlp": 0.99984342, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8602200266466636, + "language_loss": 0.56564975, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58592916, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 3.1740241050720215 + }, + { + "auxiliary_loss_clip": 0.01178468, + "auxiliary_loss_mlp": 0.0106011, + "balance_loss_clip": 1.05046606, + "balance_loss_mlp": 1.0358268, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.347222703052044, + "language_loss": 0.69979823, + "learning_rate": 3.967920741444886e-06, + "loss": 0.722184, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.66206693649292 + }, + { + "auxiliary_loss_clip": 0.0113747, + "auxiliary_loss_mlp": 0.01050571, + "balance_loss_clip": 1.04641414, + "balance_loss_mlp": 1.02635992, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.5232397624463032, + "language_loss": 0.8808428, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90272319, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 2.772845983505249 + }, + { + "auxiliary_loss_clip": 0.01061289, + "auxiliary_loss_mlp": 0.01005448, + "balance_loss_clip": 1.00915313, + "balance_loss_mlp": 1.00175226, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7956012097307209, + "language_loss": 0.63463104, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65529835, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 3.1281046867370605 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_clip": 1.05280042, + "balance_loss_mlp": 1.03564465, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 1.857301134543779, + "language_loss": 0.83234233, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85433203, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 2.6810970306396484 + }, + { + "auxiliary_loss_clip": 0.01140571, + "auxiliary_loss_mlp": 0.01063453, + "balance_loss_clip": 1.05059671, + "balance_loss_mlp": 1.03931332, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.6509636088570385, + "language_loss": 0.74924397, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77128422, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 2.6908485889434814 + }, + { + "auxiliary_loss_clip": 0.011297, + "auxiliary_loss_mlp": 0.01059236, + "balance_loss_clip": 1.0529325, + "balance_loss_mlp": 1.03506064, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.863105807163924, + "language_loss": 0.75860548, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78049481, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.6712546348571777 + }, + { + "auxiliary_loss_clip": 0.01177068, + "auxiliary_loss_mlp": 0.01051986, + "balance_loss_clip": 1.05245686, + "balance_loss_mlp": 1.02882338, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.0084551188191617, + "language_loss": 0.9307518, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95304239, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 2.518448829650879 + }, + { + "auxiliary_loss_clip": 0.01128859, + "auxiliary_loss_mlp": 0.01066037, + "balance_loss_clip": 1.04955268, + "balance_loss_mlp": 1.03946495, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.2650385119071514, + "language_loss": 0.7583192, + "learning_rate": 3.967432588494471e-06, + "loss": 0.78026819, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 2.656435251235962 + }, + { + "auxiliary_loss_clip": 0.01176413, + "auxiliary_loss_mlp": 0.01057847, + "balance_loss_clip": 1.05177689, + "balance_loss_mlp": 1.03511345, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 3.169680367434129, + "language_loss": 0.82344466, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84578729, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 2.5161890983581543 + }, + { + "auxiliary_loss_clip": 0.01158123, + "auxiliary_loss_mlp": 0.01066156, + "balance_loss_clip": 1.04996419, + "balance_loss_mlp": 1.04085994, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.0806025606485195, + "language_loss": 0.80028462, + "learning_rate": 3.967292444736023e-06, + "loss": 0.82252747, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 2.666384220123291 + }, + { + "auxiliary_loss_clip": 0.0115528, + "auxiliary_loss_mlp": 0.01059051, + "balance_loss_clip": 1.05225563, + "balance_loss_mlp": 1.03556645, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.2169284408804675, + "language_loss": 0.88206255, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90420586, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 2.5736992359161377 + }, + { + "auxiliary_loss_clip": 0.01128944, + "auxiliary_loss_mlp": 0.01073955, + "balance_loss_clip": 1.050915, + "balance_loss_mlp": 1.04966068, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.9006953381325897, + "language_loss": 0.81335974, + "learning_rate": 3.96715200257787e-06, + "loss": 0.83538878, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.660993814468384 + }, + { + "auxiliary_loss_clip": 0.01138369, + "auxiliary_loss_mlp": 0.01064781, + "balance_loss_clip": 1.05117047, + "balance_loss_mlp": 1.04040229, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.5311399976127482, + "language_loss": 0.77729088, + "learning_rate": 3.967081669605559e-06, + "loss": 0.79932237, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.6606578826904297 + }, + { + "auxiliary_loss_clip": 0.01154009, + "auxiliary_loss_mlp": 0.01062031, + "balance_loss_clip": 1.04689264, + "balance_loss_mlp": 1.03649664, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 1.8412131661076934, + "language_loss": 0.73355269, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75571311, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.651054859161377 + }, + { + "auxiliary_loss_clip": 0.01133795, + "auxiliary_loss_mlp": 0.00752351, + "balance_loss_clip": 1.0458957, + "balance_loss_mlp": 1.0003258, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.499963110288956, + "language_loss": 0.86033738, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87919885, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.68430757522583 + }, + { + "auxiliary_loss_clip": 0.01153737, + "auxiliary_loss_mlp": 0.01054651, + "balance_loss_clip": 1.04653895, + "balance_loss_mlp": 1.03074956, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.4246842229875893, + "language_loss": 0.7890079, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81109178, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 4.294910907745361 + }, + { + "auxiliary_loss_clip": 0.01032638, + "auxiliary_loss_mlp": 0.01044074, + "balance_loss_clip": 1.01009989, + "balance_loss_mlp": 1.03973472, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8823324400072583, + "language_loss": 0.57950997, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60027713, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.312201738357544 + }, + { + "auxiliary_loss_clip": 0.01154431, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.04681706, + "balance_loss_mlp": 1.03283906, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.2496702352249, + "language_loss": 0.6904164, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71254683, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.6493258476257324 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.01056144, + "balance_loss_clip": 1.04422927, + "balance_loss_mlp": 1.03125334, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 1.9419612440008605, + "language_loss": 0.72490168, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74649417, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 4.276704788208008 + }, + { + "auxiliary_loss_clip": 0.01163802, + "auxiliary_loss_mlp": 0.01056339, + "balance_loss_clip": 1.05143905, + "balance_loss_mlp": 1.03198481, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.5862782474356272, + "language_loss": 0.64261824, + "learning_rate": 3.966587250374945e-06, + "loss": 0.6648196, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 4.328041315078735 + }, + { + "auxiliary_loss_clip": 0.01136354, + "auxiliary_loss_mlp": 0.01056044, + "balance_loss_clip": 1.04692626, + "balance_loss_mlp": 1.03103399, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.0119489884236605, + "language_loss": 0.87831318, + "learning_rate": 3.966516320742077e-06, + "loss": 0.90023708, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 4.105233430862427 + }, + { + "auxiliary_loss_clip": 0.01140013, + "auxiliary_loss_mlp": 0.00752447, + "balance_loss_clip": 1.04967606, + "balance_loss_mlp": 1.00037026, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.611119614120606, + "language_loss": 0.83589685, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85482144, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.6571977138519287 + }, + { + "auxiliary_loss_clip": 0.0106898, + "auxiliary_loss_mlp": 0.01008008, + "balance_loss_clip": 1.01145828, + "balance_loss_mlp": 1.00388384, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.850048395892927, + "language_loss": 0.60510951, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62587941, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.1763229370117188 + }, + { + "auxiliary_loss_clip": 0.011526, + "auxiliary_loss_mlp": 0.01050793, + "balance_loss_clip": 1.05033422, + "balance_loss_mlp": 1.02689171, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.541110289019951, + "language_loss": 0.79043412, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81246805, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.612948179244995 + }, + { + "auxiliary_loss_clip": 0.01165608, + "auxiliary_loss_mlp": 0.01052307, + "balance_loss_clip": 1.04883933, + "balance_loss_mlp": 1.02833438, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.7315345550688306, + "language_loss": 0.82039338, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84257257, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.620396137237549 + }, + { + "auxiliary_loss_clip": 0.01182709, + "auxiliary_loss_mlp": 0.0105077, + "balance_loss_clip": 1.05237091, + "balance_loss_mlp": 1.0272857, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 2.112028292464434, + "language_loss": 0.86831522, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89065003, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 2.535879611968994 + }, + { + "auxiliary_loss_clip": 0.01165973, + "auxiliary_loss_mlp": 0.01049999, + "balance_loss_clip": 1.05306065, + "balance_loss_mlp": 1.02799273, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.876576826784259, + "language_loss": 0.81789386, + "learning_rate": 3.96608917705879e-06, + "loss": 0.84005356, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 2.578575611114502 + }, + { + "auxiliary_loss_clip": 0.01049562, + "auxiliary_loss_mlp": 0.01008258, + "balance_loss_clip": 1.00950003, + "balance_loss_mlp": 1.00499141, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.723662954868664, + "language_loss": 0.54769421, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56827235, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.1584742069244385 + }, + { + "auxiliary_loss_clip": 0.01136223, + "auxiliary_loss_mlp": 0.01059052, + "balance_loss_clip": 1.04841197, + "balance_loss_mlp": 1.03600848, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.18150206649419, + "language_loss": 0.84653419, + "learning_rate": 3.965946199367804e-06, + "loss": 0.868487, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 2.6673810482025146 + }, + { + "auxiliary_loss_clip": 0.01180772, + "auxiliary_loss_mlp": 0.01056985, + "balance_loss_clip": 1.05218124, + "balance_loss_mlp": 1.0336442, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 2.8473075150402085, + "language_loss": 0.80096328, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82334077, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 2.573153018951416 + }, + { + "auxiliary_loss_clip": 0.0112483, + "auxiliary_loss_mlp": 0.01054496, + "balance_loss_clip": 1.04889202, + "balance_loss_mlp": 1.03134537, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 1.6514954917436009, + "language_loss": 0.70998359, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73177683, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.7787301540374756 + }, + { + "auxiliary_loss_clip": 0.01109323, + "auxiliary_loss_mlp": 0.01052468, + "balance_loss_clip": 1.04682124, + "balance_loss_mlp": 1.02958035, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 3.3805184444128424, + "language_loss": 0.83329839, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85491633, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.631268262863159 + }, + { + "auxiliary_loss_clip": 0.01122234, + "auxiliary_loss_mlp": 0.00752212, + "balance_loss_clip": 1.04623961, + "balance_loss_mlp": 1.00029588, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 2.0817819746892363, + "language_loss": 0.74205494, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76079941, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.6867148876190186 + }, + { + "auxiliary_loss_clip": 0.01138794, + "auxiliary_loss_mlp": 0.01062436, + "balance_loss_clip": 1.0472368, + "balance_loss_mlp": 1.03803372, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 4.4104876960125035, + "language_loss": 0.80233163, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82434392, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 2.591137409210205 + }, + { + "auxiliary_loss_clip": 0.01156101, + "auxiliary_loss_mlp": 0.01063099, + "balance_loss_clip": 1.05282259, + "balance_loss_mlp": 1.03925705, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.946726464808636, + "language_loss": 0.71264005, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73483205, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.6451261043548584 + }, + { + "auxiliary_loss_clip": 0.01062393, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.01605463, + "balance_loss_mlp": 1.02382565, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7834118467710158, + "language_loss": 0.58591926, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60682273, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 3.152238130569458 + }, + { + "auxiliary_loss_clip": 0.01177056, + "auxiliary_loss_mlp": 0.01058669, + "balance_loss_clip": 1.0511415, + "balance_loss_mlp": 1.03569722, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.7007216409454196, + "language_loss": 0.77423859, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79659581, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 2.6495280265808105 + }, + { + "auxiliary_loss_clip": 0.01108993, + "auxiliary_loss_mlp": 0.01057089, + "balance_loss_clip": 1.04333234, + "balance_loss_mlp": 1.03316367, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 1.8036311488847891, + "language_loss": 0.72241521, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74407601, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 2.754753828048706 + }, + { + "auxiliary_loss_clip": 0.01159786, + "auxiliary_loss_mlp": 0.01047264, + "balance_loss_clip": 1.04721689, + "balance_loss_mlp": 1.02513862, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.6029214952939967, + "language_loss": 0.86365283, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88572335, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 2.585117816925049 + }, + { + "auxiliary_loss_clip": 0.01156339, + "auxiliary_loss_mlp": 0.01062372, + "balance_loss_clip": 1.05220389, + "balance_loss_mlp": 1.03967428, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.6758432138215615, + "language_loss": 0.80384338, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82603049, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 2.5595040321350098 + }, + { + "auxiliary_loss_clip": 0.0111532, + "auxiliary_loss_mlp": 0.01055505, + "balance_loss_clip": 1.04576015, + "balance_loss_mlp": 1.03152037, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.3191616349546513, + "language_loss": 0.84422982, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86593807, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 2.7052254676818848 + }, + { + "auxiliary_loss_clip": 0.01161589, + "auxiliary_loss_mlp": 0.01052937, + "balance_loss_clip": 1.048159, + "balance_loss_mlp": 1.0310142, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.6959677924633456, + "language_loss": 0.80618745, + "learning_rate": 3.965009576834394e-06, + "loss": 0.82833272, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 2.5491292476654053 + }, + { + "auxiliary_loss_clip": 0.01149936, + "auxiliary_loss_mlp": 0.01059702, + "balance_loss_clip": 1.0484252, + "balance_loss_mlp": 1.03696918, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.7397596203003367, + "language_loss": 0.76447988, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78657621, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 2.7733678817749023 + }, + { + "auxiliary_loss_clip": 0.01157212, + "auxiliary_loss_mlp": 0.01061832, + "balance_loss_clip": 1.05114329, + "balance_loss_mlp": 1.0380733, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 1.911011675723718, + "language_loss": 0.74802423, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.77021468, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 2.6460514068603516 + }, + { + "auxiliary_loss_clip": 0.01174658, + "auxiliary_loss_mlp": 0.01057039, + "balance_loss_clip": 1.05250311, + "balance_loss_mlp": 1.03233862, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 5.250147203757203, + "language_loss": 0.83528113, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85759807, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.640458106994629 + }, + { + "auxiliary_loss_clip": 0.01154807, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.04855776, + "balance_loss_mlp": 1.03673756, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 1.8572128485403896, + "language_loss": 0.78287917, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80502224, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 2.763101577758789 + }, + { + "auxiliary_loss_clip": 0.01182877, + "auxiliary_loss_mlp": 0.0106153, + "balance_loss_clip": 1.05321848, + "balance_loss_mlp": 1.03963184, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 2.750667305649387, + "language_loss": 0.85237193, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.874816, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 2.659963846206665 + }, + { + "auxiliary_loss_clip": 0.0109764, + "auxiliary_loss_mlp": 0.00752327, + "balance_loss_clip": 1.04153168, + "balance_loss_mlp": 1.00042152, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 2.782462149824551, + "language_loss": 0.84418261, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86268228, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.755016326904297 + }, + { + "auxiliary_loss_clip": 0.01163796, + "auxiliary_loss_mlp": 0.01054448, + "balance_loss_clip": 1.05197263, + "balance_loss_mlp": 1.03194094, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.856359390073181, + "language_loss": 0.75772858, + "learning_rate": 3.964500025305907e-06, + "loss": 0.77991104, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 2.651864528656006 + }, + { + "auxiliary_loss_clip": 0.01161704, + "auxiliary_loss_mlp": 0.0105323, + "balance_loss_clip": 1.05089331, + "balance_loss_mlp": 1.03207052, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 2.0681317742760914, + "language_loss": 0.80528307, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82743239, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 2.560521364212036 + }, + { + "auxiliary_loss_clip": 0.01179053, + "auxiliary_loss_mlp": 0.01055699, + "balance_loss_clip": 1.05157685, + "balance_loss_mlp": 1.03345501, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 2.1296913534172415, + "language_loss": 0.77542019, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.79776764, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.5085620880126953 + }, + { + "auxiliary_loss_clip": 0.01173504, + "auxiliary_loss_mlp": 0.01060497, + "balance_loss_clip": 1.05129552, + "balance_loss_mlp": 1.03717923, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.831387952869996, + "language_loss": 0.84071183, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86305183, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.582040548324585 + }, + { + "auxiliary_loss_clip": 0.0113403, + "auxiliary_loss_mlp": 0.01050443, + "balance_loss_clip": 1.04597855, + "balance_loss_mlp": 1.03028464, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.7746163683481884, + "language_loss": 0.83237147, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85421622, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.803515911102295 + }, + { + "auxiliary_loss_clip": 0.01158219, + "auxiliary_loss_mlp": 0.01057744, + "balance_loss_clip": 1.05112422, + "balance_loss_mlp": 1.03409326, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 3.28808622524407, + "language_loss": 0.82685381, + "learning_rate": 3.964133825052146e-06, + "loss": 0.84901345, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.612891912460327 + }, + { + "auxiliary_loss_clip": 0.01106361, + "auxiliary_loss_mlp": 0.01055636, + "balance_loss_clip": 1.04306912, + "balance_loss_mlp": 1.03377342, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.6664444108692347, + "language_loss": 0.78730834, + "learning_rate": 3.964060361549816e-06, + "loss": 0.80892837, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.843082904815674 + }, + { + "auxiliary_loss_clip": 0.01122276, + "auxiliary_loss_mlp": 0.01066535, + "balance_loss_clip": 1.04519761, + "balance_loss_mlp": 1.04110718, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.681163270942508, + "language_loss": 0.78976566, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81165373, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 4.164418935775757 + }, + { + "auxiliary_loss_clip": 0.01173065, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.04958272, + "balance_loss_mlp": 1.02712703, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 5.8452702609610165, + "language_loss": 0.74243808, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76466286, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.712050199508667 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.0106049, + "balance_loss_clip": 1.0527842, + "balance_loss_mlp": 1.03700566, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.4861618622713775, + "language_loss": 0.74765337, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76983798, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.803562879562378 + }, + { + "auxiliary_loss_clip": 0.0117514, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_clip": 1.05146861, + "balance_loss_mlp": 1.02477384, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.5480251190135013, + "language_loss": 0.86922789, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89145213, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.5906405448913574 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01059209, + "balance_loss_clip": 1.04746795, + "balance_loss_mlp": 1.03788292, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 1.5489160674596465, + "language_loss": 0.77470875, + "learning_rate": 3.963691926933495e-06, + "loss": 0.79685962, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 4.136504173278809 + }, + { + "auxiliary_loss_clip": 0.01143871, + "auxiliary_loss_mlp": 0.01052375, + "balance_loss_clip": 1.04661536, + "balance_loss_mlp": 1.02989256, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 3.2856429377324465, + "language_loss": 0.77804458, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.8000071, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 4.150518178939819 + }, + { + "auxiliary_loss_clip": 0.0116405, + "auxiliary_loss_mlp": 0.01064794, + "balance_loss_clip": 1.04910779, + "balance_loss_mlp": 1.04130936, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 1.754191889992292, + "language_loss": 0.66890359, + "learning_rate": 3.963544031823624e-06, + "loss": 0.69119209, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.5838983058929443 + }, + { + "auxiliary_loss_clip": 0.01121514, + "auxiliary_loss_mlp": 0.01048465, + "balance_loss_clip": 1.04541993, + "balance_loss_mlp": 1.02712691, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 2.1888595557789374, + "language_loss": 0.96266246, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98436224, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.7964611053466797 + }, + { + "auxiliary_loss_clip": 0.01135494, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_clip": 1.0483253, + "balance_loss_mlp": 1.03094625, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 1.9823791723827031, + "language_loss": 0.78755403, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80944407, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.641385316848755 + }, + { + "auxiliary_loss_clip": 0.01151434, + "auxiliary_loss_mlp": 0.01079188, + "balance_loss_clip": 1.04825163, + "balance_loss_mlp": 1.05662203, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.3211746615484388, + "language_loss": 0.85741961, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87972581, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 2.525758743286133 + }, + { + "auxiliary_loss_clip": 0.01181742, + "auxiliary_loss_mlp": 0.01056777, + "balance_loss_clip": 1.05352449, + "balance_loss_mlp": 1.0338645, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.659693929032315, + "language_loss": 0.80312979, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82551491, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.611283302307129 + }, + { + "auxiliary_loss_clip": 0.01160433, + "auxiliary_loss_mlp": 0.01055226, + "balance_loss_clip": 1.05059946, + "balance_loss_mlp": 1.03238499, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 1.8033976688460809, + "language_loss": 0.82873631, + "learning_rate": 3.96317299108688e-06, + "loss": 0.8508929, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 2.5335376262664795 + }, + { + "auxiliary_loss_clip": 0.01130426, + "auxiliary_loss_mlp": 0.01060094, + "balance_loss_clip": 1.04993308, + "balance_loss_mlp": 1.03768313, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.6516636588814093, + "language_loss": 0.76469135, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78659654, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 2.6119937896728516 + }, + { + "auxiliary_loss_clip": 0.01138493, + "auxiliary_loss_mlp": 0.01062229, + "balance_loss_clip": 1.04378152, + "balance_loss_mlp": 1.03872085, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 3.14786916463352, + "language_loss": 0.82535726, + "learning_rate": 3.963024053666449e-06, + "loss": 0.84736443, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.6455142498016357 + }, + { + "auxiliary_loss_clip": 0.01154031, + "auxiliary_loss_mlp": 0.0104954, + "balance_loss_clip": 1.04727292, + "balance_loss_mlp": 1.0279634, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9615273923895398, + "language_loss": 0.7203176, + "learning_rate": 3.962949473297718e-06, + "loss": 0.74235338, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 2.8033559322357178 + }, + { + "auxiliary_loss_clip": 0.0113288, + "auxiliary_loss_mlp": 0.01051744, + "balance_loss_clip": 1.0428493, + "balance_loss_mlp": 1.0296073, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.760045796406188, + "language_loss": 0.89281595, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91466224, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.7107181549072266 + }, + { + "auxiliary_loss_clip": 0.01170523, + "auxiliary_loss_mlp": 0.01063596, + "balance_loss_clip": 1.0496192, + "balance_loss_mlp": 1.04130375, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 4.930845004187584, + "language_loss": 0.73683619, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75917745, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.5840766429901123 + }, + { + "auxiliary_loss_clip": 0.01174492, + "auxiliary_loss_mlp": 0.00751873, + "balance_loss_clip": 1.05197608, + "balance_loss_mlp": 1.00017405, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6543864209063444, + "language_loss": 0.77066147, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.7899251, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 2.554311990737915 + }, + { + "auxiliary_loss_clip": 0.01173936, + "auxiliary_loss_mlp": 0.01054865, + "balance_loss_clip": 1.0518862, + "balance_loss_mlp": 1.03265643, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.1708623081082643, + "language_loss": 0.70811296, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73040092, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.6405935287475586 + }, + { + "auxiliary_loss_clip": 0.01174784, + "auxiliary_loss_mlp": 0.01056738, + "balance_loss_clip": 1.05153418, + "balance_loss_mlp": 1.03421962, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 2.1802586737363807, + "language_loss": 0.86797428, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89028955, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 2.5286917686462402 + }, + { + "auxiliary_loss_clip": 0.01065177, + "auxiliary_loss_mlp": 0.01068071, + "balance_loss_clip": 1.0403744, + "balance_loss_mlp": 1.04434848, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.762513810158524, + "language_loss": 0.82905644, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85038888, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.702133893966675 + }, + { + "auxiliary_loss_clip": 0.01153885, + "auxiliary_loss_mlp": 0.01056688, + "balance_loss_clip": 1.05007052, + "balance_loss_mlp": 1.03372848, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 2.1906554072494004, + "language_loss": 0.70644963, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72855538, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 2.5599889755249023 + }, + { + "auxiliary_loss_clip": 0.01147169, + "auxiliary_loss_mlp": 0.01051872, + "balance_loss_clip": 1.04566407, + "balance_loss_mlp": 1.03139174, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.8720146833873155, + "language_loss": 0.79883063, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82082105, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 2.5889151096343994 + }, + { + "auxiliary_loss_clip": 0.01113881, + "auxiliary_loss_mlp": 0.01055245, + "balance_loss_clip": 1.04616189, + "balance_loss_mlp": 1.03205884, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.4240421748100727, + "language_loss": 0.8275758, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84926713, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 2.7189061641693115 + }, + { + "auxiliary_loss_clip": 0.01156347, + "auxiliary_loss_mlp": 0.01060434, + "balance_loss_clip": 1.05056775, + "balance_loss_mlp": 1.03946471, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.398462793971311, + "language_loss": 0.79077458, + "learning_rate": 3.962199576140195e-06, + "loss": 0.81294239, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 2.5768203735351562 + }, + { + "auxiliary_loss_clip": 0.01142999, + "auxiliary_loss_mlp": 0.00751934, + "balance_loss_clip": 1.04665923, + "balance_loss_mlp": 1.00013602, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.8132377194246705, + "language_loss": 0.93208075, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95103014, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.553614854812622 + }, + { + "auxiliary_loss_clip": 0.01131441, + "auxiliary_loss_mlp": 0.01051997, + "balance_loss_clip": 1.04559159, + "balance_loss_mlp": 1.02830982, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 6.833911798792164, + "language_loss": 0.7428596, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76469398, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 2.5682015419006348 + }, + { + "auxiliary_loss_clip": 0.01045003, + "auxiliary_loss_mlp": 0.01004377, + "balance_loss_clip": 1.02120972, + "balance_loss_mlp": 1.00094402, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7248257057386254, + "language_loss": 0.58287036, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60336417, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 3.15031361579895 + }, + { + "auxiliary_loss_clip": 0.01126905, + "auxiliary_loss_mlp": 0.01063153, + "balance_loss_clip": 1.04034948, + "balance_loss_mlp": 1.04011023, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 6.114857738461515, + "language_loss": 0.69249582, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71439642, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 2.64797306060791 + }, + { + "auxiliary_loss_clip": 0.01120923, + "auxiliary_loss_mlp": 0.01070998, + "balance_loss_clip": 1.04411697, + "balance_loss_mlp": 1.04877782, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.4911909575435423, + "language_loss": 0.86069554, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88261473, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.6012027263641357 + }, + { + "auxiliary_loss_clip": 0.01127489, + "auxiliary_loss_mlp": 0.01067217, + "balance_loss_clip": 1.04541445, + "balance_loss_mlp": 1.04109836, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.8986620269302807, + "language_loss": 0.72417378, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74612087, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 2.6201109886169434 + }, + { + "auxiliary_loss_clip": 0.01121959, + "auxiliary_loss_mlp": 0.01058499, + "balance_loss_clip": 1.04501355, + "balance_loss_mlp": 1.0352298, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.387685663124287, + "language_loss": 0.81370682, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83551139, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 2.595269203186035 + }, + { + "auxiliary_loss_clip": 0.0112899, + "auxiliary_loss_mlp": 0.01055795, + "balance_loss_clip": 1.04632974, + "balance_loss_mlp": 1.03421772, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 1.918031215455942, + "language_loss": 0.76101679, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78286457, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 2.6529479026794434 + }, + { + "auxiliary_loss_clip": 0.0104777, + "auxiliary_loss_mlp": 0.01001705, + "balance_loss_clip": 1.01816285, + "balance_loss_mlp": 0.99841499, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7330134110024364, + "language_loss": 0.57640046, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59689522, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 2.9607183933258057 + }, + { + "auxiliary_loss_clip": 0.01154466, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.04976094, + "balance_loss_mlp": 1.03395247, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 1.8131909922271008, + "language_loss": 0.84984529, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87195557, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.5096189975738525 + }, + { + "auxiliary_loss_clip": 0.01155348, + "auxiliary_loss_mlp": 0.0106794, + "balance_loss_clip": 1.05126512, + "balance_loss_mlp": 1.04386008, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.5270087878388097, + "language_loss": 0.84154415, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86377704, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.5395259857177734 + }, + { + "auxiliary_loss_clip": 0.01135677, + "auxiliary_loss_mlp": 0.01061946, + "balance_loss_clip": 1.04635501, + "balance_loss_mlp": 1.03868794, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.8775500816190005, + "language_loss": 0.85239077, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87436706, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.52587628364563 + }, + { + "auxiliary_loss_clip": 0.01132027, + "auxiliary_loss_mlp": 0.01053664, + "balance_loss_clip": 1.04710507, + "balance_loss_mlp": 1.03134823, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.6404591071670507, + "language_loss": 0.85351527, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87537211, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.658299207687378 + }, + { + "auxiliary_loss_clip": 0.01146133, + "auxiliary_loss_mlp": 0.01057058, + "balance_loss_clip": 1.05347967, + "balance_loss_mlp": 1.03525472, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.4818624906152427, + "language_loss": 0.86867249, + "learning_rate": 3.961137220422749e-06, + "loss": 0.89070445, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.5761964321136475 + }, + { + "auxiliary_loss_clip": 0.01161045, + "auxiliary_loss_mlp": 0.01057023, + "balance_loss_clip": 1.05316436, + "balance_loss_mlp": 1.03617382, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.701716683443983, + "language_loss": 0.86674643, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8889271, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.6330182552337646 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01057727, + "balance_loss_clip": 1.05034506, + "balance_loss_mlp": 1.03697276, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.72191094917208, + "language_loss": 0.89991683, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92171073, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 4.158310651779175 + }, + { + "auxiliary_loss_clip": 0.01149165, + "auxiliary_loss_mlp": 0.01055662, + "balance_loss_clip": 1.04832828, + "balance_loss_mlp": 1.03278565, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.0321657492452605, + "language_loss": 0.85544592, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87749422, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.671285390853882 + }, + { + "auxiliary_loss_clip": 0.01143256, + "auxiliary_loss_mlp": 0.0105933, + "balance_loss_clip": 1.04531646, + "balance_loss_mlp": 1.03592968, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.7561037998071394, + "language_loss": 0.81001449, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83204031, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.6819236278533936 + }, + { + "auxiliary_loss_clip": 0.01165304, + "auxiliary_loss_mlp": 0.01069176, + "balance_loss_clip": 1.05015409, + "balance_loss_mlp": 1.04650283, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 3.01150856072293, + "language_loss": 0.78117895, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80352372, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.5994839668273926 + }, + { + "auxiliary_loss_clip": 0.01150237, + "auxiliary_loss_mlp": 0.0106024, + "balance_loss_clip": 1.04728925, + "balance_loss_mlp": 1.03766155, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.9931210964158443, + "language_loss": 0.86294144, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88504624, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 4.0415332317352295 + }, + { + "auxiliary_loss_clip": 0.01153588, + "auxiliary_loss_mlp": 0.01053894, + "balance_loss_clip": 1.04904211, + "balance_loss_mlp": 1.02944386, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.1394403864496083, + "language_loss": 0.73499078, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75706565, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 4.127172946929932 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01057387, + "balance_loss_clip": 1.05199409, + "balance_loss_mlp": 1.03467727, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 3.079334894631646, + "language_loss": 0.85248405, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87456369, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 4.226459264755249 + }, + { + "auxiliary_loss_clip": 0.01103995, + "auxiliary_loss_mlp": 0.01063505, + "balance_loss_clip": 1.04833043, + "balance_loss_mlp": 1.03994966, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 1.6520405063648327, + "language_loss": 0.83557779, + "learning_rate": 3.960446580030599e-06, + "loss": 0.85725278, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.719783306121826 + }, + { + "auxiliary_loss_clip": 0.01170576, + "auxiliary_loss_mlp": 0.01061867, + "balance_loss_clip": 1.04980087, + "balance_loss_mlp": 1.03860915, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 1.5567939889852322, + "language_loss": 0.81069744, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83302188, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 2.5831844806671143 + }, + { + "auxiliary_loss_clip": 0.01141303, + "auxiliary_loss_mlp": 0.00752193, + "balance_loss_clip": 1.04645407, + "balance_loss_mlp": 1.0002557, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.122028403270503, + "language_loss": 0.7419098, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76084483, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.5712316036224365 + }, + { + "auxiliary_loss_clip": 0.01121938, + "auxiliary_loss_mlp": 0.01059239, + "balance_loss_clip": 1.0465095, + "balance_loss_mlp": 1.03562331, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 1.8792624131205233, + "language_loss": 0.8608126, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88262439, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 2.586648464202881 + }, + { + "auxiliary_loss_clip": 0.01151304, + "auxiliary_loss_mlp": 0.01049231, + "balance_loss_clip": 1.05006576, + "balance_loss_mlp": 1.02578259, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.429442206680656, + "language_loss": 0.74414855, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76615387, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 2.737748384475708 + }, + { + "auxiliary_loss_clip": 0.01174156, + "auxiliary_loss_mlp": 0.01052953, + "balance_loss_clip": 1.05135965, + "balance_loss_mlp": 1.03094673, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.9086932585704253, + "language_loss": 0.77177334, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79404444, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.6243114471435547 + }, + { + "auxiliary_loss_clip": 0.01163936, + "auxiliary_loss_mlp": 0.01054146, + "balance_loss_clip": 1.04933894, + "balance_loss_mlp": 1.02979183, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 3.2168482033267285, + "language_loss": 0.783916, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80609679, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 2.580636501312256 + }, + { + "auxiliary_loss_clip": 0.0113126, + "auxiliary_loss_mlp": 0.01049449, + "balance_loss_clip": 1.04528677, + "balance_loss_mlp": 1.02670348, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.623033851320513, + "language_loss": 0.77046037, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79226738, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 2.640974283218384 + }, + { + "auxiliary_loss_clip": 0.0117761, + "auxiliary_loss_mlp": 0.0075215, + "balance_loss_clip": 1.0502671, + "balance_loss_mlp": 1.00021017, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.8231696893615286, + "language_loss": 0.82753152, + "learning_rate": 3.959827622252211e-06, + "loss": 0.84682912, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.5647451877593994 + }, + { + "auxiliary_loss_clip": 0.0111194, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.04219162, + "balance_loss_mlp": 1.04090953, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.21844610741953, + "language_loss": 0.8427512, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86451644, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 2.6653573513031006 + }, + { + "auxiliary_loss_clip": 0.0112213, + "auxiliary_loss_mlp": 0.01055368, + "balance_loss_clip": 1.0448668, + "balance_loss_mlp": 1.0318954, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 2.0320999354549394, + "language_loss": 0.8111062, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83288115, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 2.6076488494873047 + }, + { + "auxiliary_loss_clip": 0.01145821, + "auxiliary_loss_mlp": 0.01057741, + "balance_loss_clip": 1.04836369, + "balance_loss_mlp": 1.0336132, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 2.14895493790663, + "language_loss": 0.84058583, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.86262143, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.73958158493042 + }, + { + "auxiliary_loss_clip": 0.01139928, + "auxiliary_loss_mlp": 0.01058265, + "balance_loss_clip": 1.05167747, + "balance_loss_mlp": 1.03447139, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 2.6215467142983986, + "language_loss": 0.90332532, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92530727, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 2.6180505752563477 + }, + { + "auxiliary_loss_clip": 0.01143163, + "auxiliary_loss_mlp": 0.01065785, + "balance_loss_clip": 1.04814065, + "balance_loss_mlp": 1.03911781, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.1254343262960287, + "language_loss": 0.75861621, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78070569, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 2.696988821029663 + }, + { + "auxiliary_loss_clip": 0.01158992, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.04721224, + "balance_loss_mlp": 1.02915478, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.745200438711925, + "language_loss": 0.81422776, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83633077, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 2.531090259552002 + }, + { + "auxiliary_loss_clip": 0.01171002, + "auxiliary_loss_mlp": 0.01059922, + "balance_loss_clip": 1.04873562, + "balance_loss_mlp": 1.03706992, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.0295065207252194, + "language_loss": 0.89121437, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91352367, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 2.5284759998321533 + }, + { + "auxiliary_loss_clip": 0.01145925, + "auxiliary_loss_mlp": 0.01060683, + "balance_loss_clip": 1.0457449, + "balance_loss_mlp": 1.03661454, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.478869435293402, + "language_loss": 0.80935717, + "learning_rate": 3.959203908195741e-06, + "loss": 0.83142328, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 2.5829126834869385 + }, + { + "auxiliary_loss_clip": 0.0104362, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.01765156, + "balance_loss_mlp": 1.02496648, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.7562992189278476, + "language_loss": 0.57456577, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59528577, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 3.218860626220703 + }, + { + "auxiliary_loss_clip": 0.01147681, + "auxiliary_loss_mlp": 0.01055265, + "balance_loss_clip": 1.04931235, + "balance_loss_mlp": 1.03190005, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 3.0018217270761633, + "language_loss": 0.6712184, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69324791, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.551048517227173 + }, + { + "auxiliary_loss_clip": 0.01132339, + "auxiliary_loss_mlp": 0.01049163, + "balance_loss_clip": 1.04633069, + "balance_loss_mlp": 1.02493978, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 2.097176042014011, + "language_loss": 0.83968842, + "learning_rate": 3.958968789505198e-06, + "loss": 0.86150342, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 2.622739315032959 + }, + { + "auxiliary_loss_clip": 0.0106559, + "auxiliary_loss_mlp": 0.01001958, + "balance_loss_clip": 1.01574302, + "balance_loss_mlp": 0.99869138, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8802655229208836, + "language_loss": 0.61910415, + "learning_rate": 3.9588902680358e-06, + "loss": 0.63977963, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 3.0512614250183105 + }, + { + "auxiliary_loss_clip": 0.01154968, + "auxiliary_loss_mlp": 0.01060419, + "balance_loss_clip": 1.05212331, + "balance_loss_mlp": 1.03813899, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.5260347385504498, + "language_loss": 0.82723176, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84938562, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 2.610701322555542 + }, + { + "auxiliary_loss_clip": 0.01121532, + "auxiliary_loss_mlp": 0.01063889, + "balance_loss_clip": 1.04259992, + "balance_loss_mlp": 1.03976095, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 2.4488953009124645, + "language_loss": 0.72446305, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74631727, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 2.945585250854492 + }, + { + "auxiliary_loss_clip": 0.0115307, + "auxiliary_loss_mlp": 0.01056921, + "balance_loss_clip": 1.04743397, + "balance_loss_mlp": 1.03149319, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.545538962325421, + "language_loss": 0.77346766, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79556757, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.6784393787384033 + }, + { + "auxiliary_loss_clip": 0.01128996, + "auxiliary_loss_mlp": 0.01054734, + "balance_loss_clip": 1.04716587, + "balance_loss_mlp": 1.03104675, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 4.115412979769657, + "language_loss": 0.75418937, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.77602667, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 2.574162244796753 + }, + { + "auxiliary_loss_clip": 0.01148682, + "auxiliary_loss_mlp": 0.01058081, + "balance_loss_clip": 1.04721451, + "balance_loss_mlp": 1.03339314, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.4523508447388997, + "language_loss": 0.84388125, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86594886, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.5804061889648438 + }, + { + "auxiliary_loss_clip": 0.01127928, + "auxiliary_loss_mlp": 0.01056289, + "balance_loss_clip": 1.04214239, + "balance_loss_mlp": 1.03278065, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 2.1826739164114652, + "language_loss": 0.67454803, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69639021, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.6650216579437256 + }, + { + "auxiliary_loss_clip": 0.01106025, + "auxiliary_loss_mlp": 0.01058851, + "balance_loss_clip": 1.0407443, + "balance_loss_mlp": 1.03555703, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.3056558851960545, + "language_loss": 0.83262706, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85427582, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 2.6601295471191406 + }, + { + "auxiliary_loss_clip": 0.01163476, + "auxiliary_loss_mlp": 0.01050056, + "balance_loss_clip": 1.05156016, + "balance_loss_mlp": 1.02783573, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7296250738914933, + "language_loss": 0.75670373, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77883899, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.6359429359436035 + }, + { + "auxiliary_loss_clip": 0.01136665, + "auxiliary_loss_mlp": 0.01076134, + "balance_loss_clip": 1.04779851, + "balance_loss_mlp": 1.04916894, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.4800778936840695, + "language_loss": 0.83423197, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85635996, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.649104595184326 + }, + { + "auxiliary_loss_clip": 0.01046473, + "auxiliary_loss_mlp": 0.00750297, + "balance_loss_clip": 1.00950408, + "balance_loss_mlp": 1.00032508, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7460860775431577, + "language_loss": 0.6186052, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63657284, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 3.2216365337371826 + }, + { + "auxiliary_loss_clip": 0.01045998, + "auxiliary_loss_mlp": 0.01010253, + "balance_loss_clip": 1.01566267, + "balance_loss_mlp": 1.00646234, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.909891522519421, + "language_loss": 0.58899164, + "learning_rate": 3.958021629962681e-06, + "loss": 0.60955405, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 3.24188494682312 + }, + { + "auxiliary_loss_clip": 0.01131094, + "auxiliary_loss_mlp": 0.01057114, + "balance_loss_clip": 1.04496431, + "balance_loss_mlp": 1.03362966, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 1.7829917143558103, + "language_loss": 0.87412345, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89600551, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 4.07952356338501 + }, + { + "auxiliary_loss_clip": 0.01135855, + "auxiliary_loss_mlp": 0.01060874, + "balance_loss_clip": 1.04764223, + "balance_loss_mlp": 1.0371747, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 1.9499981274472524, + "language_loss": 0.81437957, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83634681, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.6003053188323975 + }, + { + "auxiliary_loss_clip": 0.0105027, + "auxiliary_loss_mlp": 0.0101651, + "balance_loss_clip": 1.01351416, + "balance_loss_mlp": 1.01369691, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8785707411076596, + "language_loss": 0.59569812, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61636597, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.1207149028778076 + }, + { + "auxiliary_loss_clip": 0.01158224, + "auxiliary_loss_mlp": 0.01064739, + "balance_loss_clip": 1.0486697, + "balance_loss_mlp": 1.04213738, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5999902128057957, + "language_loss": 0.84221494, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86444461, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.6491000652313232 + }, + { + "auxiliary_loss_clip": 0.01095989, + "auxiliary_loss_mlp": 0.01072134, + "balance_loss_clip": 1.04426479, + "balance_loss_mlp": 1.0470643, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.7464745521006189, + "language_loss": 0.78081769, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80249894, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 2.7582614421844482 + }, + { + "auxiliary_loss_clip": 0.01148228, + "auxiliary_loss_mlp": 0.01057274, + "balance_loss_clip": 1.04872191, + "balance_loss_mlp": 1.03405201, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.030179907959521, + "language_loss": 0.79984212, + "learning_rate": 3.957544040455379e-06, + "loss": 0.82189715, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 4.16639256477356 + }, + { + "auxiliary_loss_clip": 0.01125347, + "auxiliary_loss_mlp": 0.01061879, + "balance_loss_clip": 1.04522014, + "balance_loss_mlp": 1.03966999, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 2.342452091425716, + "language_loss": 0.76706004, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78893226, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 4.152467489242554 + }, + { + "auxiliary_loss_clip": 0.01123692, + "auxiliary_loss_mlp": 0.01059788, + "balance_loss_clip": 1.04595792, + "balance_loss_mlp": 1.0358628, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.6692655507829508, + "language_loss": 0.80803055, + "learning_rate": 3.95738425007858e-06, + "loss": 0.82986534, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.0115857, + "auxiliary_loss_mlp": 0.01045331, + "balance_loss_clip": 1.04565883, + "balance_loss_mlp": 1.022681, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.3201650633094646, + "language_loss": 0.61466849, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63670743, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.6694562435150146 + }, + { + "auxiliary_loss_clip": 0.01146721, + "auxiliary_loss_mlp": 0.01056903, + "balance_loss_clip": 1.04944825, + "balance_loss_mlp": 1.0356245, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.0578458723157538, + "language_loss": 0.85194635, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87398255, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 2.6048853397369385 + }, + { + "auxiliary_loss_clip": 0.01150392, + "auxiliary_loss_mlp": 0.01047051, + "balance_loss_clip": 1.05094337, + "balance_loss_mlp": 1.02585614, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 2.0527551285006522, + "language_loss": 0.76216525, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78413963, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 2.6084394454956055 + }, + { + "auxiliary_loss_clip": 0.01131282, + "auxiliary_loss_mlp": 0.01055114, + "balance_loss_clip": 1.04521918, + "balance_loss_mlp": 1.03298879, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 1.9882599383359474, + "language_loss": 0.79907393, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82093787, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.587935447692871 + }, + { + "auxiliary_loss_clip": 0.01141351, + "auxiliary_loss_mlp": 0.01064612, + "balance_loss_clip": 1.04520106, + "balance_loss_mlp": 1.04205728, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 2.1506721988624347, + "language_loss": 0.75558841, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77764797, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 2.5916402339935303 + }, + { + "auxiliary_loss_clip": 0.01132668, + "auxiliary_loss_mlp": 0.00752068, + "balance_loss_clip": 1.04594636, + "balance_loss_mlp": 1.00016356, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 1.860399710404926, + "language_loss": 0.7792244, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79807174, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 2.6186792850494385 + }, + { + "auxiliary_loss_clip": 0.01142439, + "auxiliary_loss_mlp": 0.01054497, + "balance_loss_clip": 1.04718566, + "balance_loss_mlp": 1.03289676, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 1.799000619586819, + "language_loss": 0.82483298, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84680235, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.6108648777008057 + }, + { + "auxiliary_loss_clip": 0.01173563, + "auxiliary_loss_mlp": 0.01050894, + "balance_loss_clip": 1.0505091, + "balance_loss_mlp": 1.02748179, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 2.054136118515801, + "language_loss": 0.76555526, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78779984, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.4960830211639404 + }, + { + "auxiliary_loss_clip": 0.01100531, + "auxiliary_loss_mlp": 0.01060318, + "balance_loss_clip": 1.04052627, + "balance_loss_mlp": 1.03561795, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.254497107109793, + "language_loss": 0.85695457, + "learning_rate": 3.956661519635756e-06, + "loss": 0.87856305, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 2.587963104248047 + }, + { + "auxiliary_loss_clip": 0.01105365, + "auxiliary_loss_mlp": 0.0105623, + "balance_loss_clip": 1.0448854, + "balance_loss_mlp": 1.03180385, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.5778020046133587, + "language_loss": 0.76478833, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78640425, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.6897544860839844 + }, + { + "auxiliary_loss_clip": 0.01119977, + "auxiliary_loss_mlp": 0.01056625, + "balance_loss_clip": 1.04606867, + "balance_loss_mlp": 1.03374863, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.5993914482271188, + "language_loss": 0.7900039, + "learning_rate": 3.956500096627561e-06, + "loss": 0.8117699, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 2.69705867767334 + }, + { + "auxiliary_loss_clip": 0.01132687, + "auxiliary_loss_mlp": 0.01074033, + "balance_loss_clip": 1.05149245, + "balance_loss_mlp": 1.04977357, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.6948456316661913, + "language_loss": 0.87666541, + "learning_rate": 3.956419273835913e-06, + "loss": 0.8987326, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 2.781961679458618 + }, + { + "auxiliary_loss_clip": 0.0114442, + "auxiliary_loss_mlp": 0.01062273, + "balance_loss_clip": 1.04661727, + "balance_loss_mlp": 1.03728688, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.7228303571515053, + "language_loss": 0.82255977, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84462667, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 2.7010340690612793 + }, + { + "auxiliary_loss_clip": 0.01139964, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_clip": 1.04679835, + "balance_loss_mlp": 1.03055823, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.0636581126912685, + "language_loss": 0.80797607, + "learning_rate": 3.95625740569284e-06, + "loss": 0.82989526, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 2.6203365325927734 + }, + { + "auxiliary_loss_clip": 0.01166164, + "auxiliary_loss_mlp": 0.01064803, + "balance_loss_clip": 1.04763615, + "balance_loss_mlp": 1.0414381, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 1.9005137128962322, + "language_loss": 0.86353922, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88584894, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 2.558635711669922 + }, + { + "auxiliary_loss_clip": 0.01036952, + "auxiliary_loss_mlp": 0.01002471, + "balance_loss_clip": 1.01184702, + "balance_loss_mlp": 0.99901432, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9750128383198621, + "language_loss": 0.65839636, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67879063, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.0896952152252197 + }, + { + "auxiliary_loss_clip": 0.01130158, + "auxiliary_loss_mlp": 0.01048869, + "balance_loss_clip": 1.04403496, + "balance_loss_mlp": 1.02722025, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 1.9365184482520348, + "language_loss": 0.79334199, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81513214, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 2.699967622756958 + }, + { + "auxiliary_loss_clip": 0.01165793, + "auxiliary_loss_mlp": 0.01060377, + "balance_loss_clip": 1.04632413, + "balance_loss_mlp": 1.03782248, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 2.0028319140905744, + "language_loss": 0.78026444, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80252612, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 2.558110237121582 + }, + { + "auxiliary_loss_clip": 0.0110745, + "auxiliary_loss_mlp": 0.01063872, + "balance_loss_clip": 1.04159129, + "balance_loss_mlp": 1.03786087, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 1.8279203985878392, + "language_loss": 0.72951019, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75122339, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.634735107421875 + }, + { + "auxiliary_loss_clip": 0.01138219, + "auxiliary_loss_mlp": 0.01055445, + "balance_loss_clip": 1.04427481, + "balance_loss_mlp": 1.03277111, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 1.7946441234291006, + "language_loss": 0.77420044, + "learning_rate": 3.955770021006627e-06, + "loss": 0.79613715, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.6994311809539795 + }, + { + "auxiliary_loss_clip": 0.01129232, + "auxiliary_loss_mlp": 0.01055722, + "balance_loss_clip": 1.04839849, + "balance_loss_mlp": 1.03331089, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 1.9655168168526207, + "language_loss": 0.87111151, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89296108, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 2.616598129272461 + }, + { + "auxiliary_loss_clip": 0.01155099, + "auxiliary_loss_mlp": 0.01059123, + "balance_loss_clip": 1.04647017, + "balance_loss_mlp": 1.03513849, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.804874127058825, + "language_loss": 0.66845965, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69060194, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 2.5583932399749756 + }, + { + "auxiliary_loss_clip": 0.01159756, + "auxiliary_loss_mlp": 0.01051917, + "balance_loss_clip": 1.05016613, + "balance_loss_mlp": 1.0281347, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8175743136161309, + "language_loss": 0.70395029, + "learning_rate": 3.95552532742147e-06, + "loss": 0.72606701, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 2.603327751159668 + }, + { + "auxiliary_loss_clip": 0.01123523, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.04497719, + "balance_loss_mlp": 1.03406787, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.6677123481364873, + "language_loss": 0.80881262, + "learning_rate": 3.955443614581525e-06, + "loss": 0.8306036, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.712507724761963 + }, + { + "auxiliary_loss_clip": 0.01140732, + "auxiliary_loss_mlp": 0.01062518, + "balance_loss_clip": 1.04571354, + "balance_loss_mlp": 1.03751945, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.9282345877463207, + "language_loss": 0.71857274, + "learning_rate": 3.955361827590961e-06, + "loss": 0.74060524, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.641545057296753 + }, + { + "auxiliary_loss_clip": 0.01028438, + "auxiliary_loss_mlp": 0.01011672, + "balance_loss_clip": 1.01619458, + "balance_loss_mlp": 1.00816751, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8116661854920807, + "language_loss": 0.5549047, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57530576, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 2.9939234256744385 + }, + { + "auxiliary_loss_clip": 0.01108824, + "auxiliary_loss_mlp": 0.01055952, + "balance_loss_clip": 1.04285669, + "balance_loss_mlp": 1.03296876, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.8252127422653852, + "language_loss": 0.81173974, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83338749, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.71448016166687 + }, + { + "auxiliary_loss_clip": 0.01112241, + "auxiliary_loss_mlp": 0.01065481, + "balance_loss_clip": 1.04186547, + "balance_loss_mlp": 1.04229534, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.4781535584622203, + "language_loss": 0.81612766, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83790493, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.699157238006592 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.00752127, + "balance_loss_clip": 1.04635763, + "balance_loss_mlp": 1.00013781, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.5279197530655992, + "language_loss": 0.6488899, + "learning_rate": 3.955033938184601e-06, + "loss": 0.66748273, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 2.951622486114502 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.01058631, + "balance_loss_clip": 1.04419756, + "balance_loss_mlp": 1.03586197, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.622441981793349, + "language_loss": 0.83050913, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85237825, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.7235465049743652 + }, + { + "auxiliary_loss_clip": 0.01146837, + "auxiliary_loss_mlp": 0.01059509, + "balance_loss_clip": 1.04502487, + "balance_loss_mlp": 1.03665638, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.3610232606461423, + "language_loss": 0.73806608, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76012951, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.579730272293091 + }, + { + "auxiliary_loss_clip": 0.01152926, + "auxiliary_loss_mlp": 0.01058465, + "balance_loss_clip": 1.04601741, + "balance_loss_mlp": 1.03533792, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.729787227329357, + "language_loss": 0.73823595, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76034981, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.673491954803467 + }, + { + "auxiliary_loss_clip": 0.01161482, + "auxiliary_loss_mlp": 0.01058933, + "balance_loss_clip": 1.05149293, + "balance_loss_mlp": 1.03697467, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.7760624541321972, + "language_loss": 0.69514954, + "learning_rate": 3.954704862616971e-06, + "loss": 0.7173537, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.5712766647338867 + }, + { + "auxiliary_loss_clip": 0.01156317, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.04802608, + "balance_loss_mlp": 1.03575158, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.159044653742487, + "language_loss": 0.82824731, + "learning_rate": 3.954622408410747e-06, + "loss": 0.85038292, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 4.020762205123901 + }, + { + "auxiliary_loss_clip": 0.01132592, + "auxiliary_loss_mlp": 0.01059702, + "balance_loss_clip": 1.04402065, + "balance_loss_mlp": 1.03527641, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 1.8923399826199452, + "language_loss": 0.85012412, + "learning_rate": 3.954539880085045e-06, + "loss": 0.87204707, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.6371326446533203 + }, + { + "auxiliary_loss_clip": 0.01150054, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.04985642, + "balance_loss_mlp": 1.02963781, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 2.873846147200193, + "language_loss": 0.6882211, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71026421, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 2.7044081687927246 + }, + { + "auxiliary_loss_clip": 0.01149122, + "auxiliary_loss_mlp": 0.00752134, + "balance_loss_clip": 1.04475451, + "balance_loss_mlp": 1.00022066, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.9146375453030657, + "language_loss": 0.74806428, + "learning_rate": 3.954374601087729e-06, + "loss": 0.76707685, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.586347818374634 + }, + { + "auxiliary_loss_clip": 0.0116097, + "auxiliary_loss_mlp": 0.01056595, + "balance_loss_clip": 1.05116677, + "balance_loss_mlp": 1.03219259, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.9677352321498511, + "language_loss": 0.69040787, + "learning_rate": 3.954291850422382e-06, + "loss": 0.71258354, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 4.13450026512146 + }, + { + "auxiliary_loss_clip": 0.01138227, + "auxiliary_loss_mlp": 0.01061068, + "balance_loss_clip": 1.04793453, + "balance_loss_mlp": 1.03806019, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.284331726133663, + "language_loss": 0.8431375, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86513042, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 5.719265937805176 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_clip": 1.04661369, + "balance_loss_mlp": 1.03353631, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 1.99740908229271, + "language_loss": 0.80234754, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82424223, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.5754165649414062 + }, + { + "auxiliary_loss_clip": 0.01162708, + "auxiliary_loss_mlp": 0.01059842, + "balance_loss_clip": 1.04887819, + "balance_loss_mlp": 1.03644085, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.677995923660753, + "language_loss": 0.82410669, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84633219, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.5687520503997803 + }, + { + "auxiliary_loss_clip": 0.01114974, + "auxiliary_loss_mlp": 0.01056239, + "balance_loss_clip": 1.04368186, + "balance_loss_mlp": 1.03180146, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.912394608092038, + "language_loss": 0.62555337, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64726549, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 2.6619374752044678 + }, + { + "auxiliary_loss_clip": 0.01172397, + "auxiliary_loss_mlp": 0.0105489, + "balance_loss_clip": 1.05081749, + "balance_loss_mlp": 1.03009439, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.3687417679476357, + "language_loss": 0.71181226, + "learning_rate": 3.953876985554364e-06, + "loss": 0.7340852, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.5697946548461914 + }, + { + "auxiliary_loss_clip": 0.01156537, + "auxiliary_loss_mlp": 0.01054335, + "balance_loss_clip": 1.04830289, + "balance_loss_mlp": 1.03280616, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.098054166789861, + "language_loss": 0.80056655, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82267523, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 2.657893419265747 + }, + { + "auxiliary_loss_clip": 0.01142898, + "auxiliary_loss_mlp": 0.01046968, + "balance_loss_clip": 1.0456059, + "balance_loss_mlp": 1.02400815, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 2.143082877097634, + "language_loss": 0.74377304, + "learning_rate": 3.953710520946634e-06, + "loss": 0.76567173, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.6625964641571045 + }, + { + "auxiliary_loss_clip": 0.01154812, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.05017471, + "balance_loss_mlp": 1.02640653, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 2.1783862513414425, + "language_loss": 0.7567938, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77883106, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 2.5681357383728027 + }, + { + "auxiliary_loss_clip": 0.01119008, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.04133654, + "balance_loss_mlp": 1.02511871, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 1.757576795552208, + "language_loss": 0.866575, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88823986, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.6420540809631348 + }, + { + "auxiliary_loss_clip": 0.01105288, + "auxiliary_loss_mlp": 0.01062991, + "balance_loss_clip": 1.04494202, + "balance_loss_mlp": 1.03742075, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.5287106338466248, + "language_loss": 0.71324939, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73493218, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 2.857781410217285 + }, + { + "auxiliary_loss_clip": 0.01133886, + "auxiliary_loss_mlp": 0.01056435, + "balance_loss_clip": 1.0472796, + "balance_loss_mlp": 1.03420293, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 2.0385145605402855, + "language_loss": 0.84547174, + "learning_rate": 3.953376702737693e-06, + "loss": 0.8673749, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 2.601144790649414 + }, + { + "auxiliary_loss_clip": 0.01145258, + "auxiliary_loss_mlp": 0.01052145, + "balance_loss_clip": 1.05061197, + "balance_loss_mlp": 1.02856529, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 8.098507410631884, + "language_loss": 0.66829139, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69026542, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 2.60559344291687 + }, + { + "auxiliary_loss_clip": 0.01101203, + "auxiliary_loss_mlp": 0.01061127, + "balance_loss_clip": 1.04038334, + "balance_loss_mlp": 1.03817916, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.7289063114948031, + "language_loss": 0.81172991, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83335316, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 2.6812691688537598 + }, + { + "auxiliary_loss_clip": 0.0116579, + "auxiliary_loss_mlp": 0.01069099, + "balance_loss_clip": 1.05296719, + "balance_loss_mlp": 1.04623413, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 3.1302406321525598, + "language_loss": 0.81021243, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83256131, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 2.573866128921509 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01053389, + "balance_loss_clip": 1.04732037, + "balance_loss_mlp": 1.02942824, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 1.72939200627105, + "language_loss": 0.84389675, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86572778, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 2.66778564453125 + }, + { + "auxiliary_loss_clip": 0.01054261, + "auxiliary_loss_mlp": 0.0074998, + "balance_loss_clip": 1.01754045, + "balance_loss_mlp": 0.99994326, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.8115893618525486, + "language_loss": 0.54618156, + "learning_rate": 3.952957763374992e-06, + "loss": 0.564224, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.086244821548462 + }, + { + "auxiliary_loss_clip": 0.01011662, + "auxiliary_loss_mlp": 0.01008608, + "balance_loss_clip": 1.01103747, + "balance_loss_mlp": 1.0055325, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7637194369531104, + "language_loss": 0.58189797, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60210073, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 3.363748550415039 + }, + { + "auxiliary_loss_clip": 0.01133393, + "auxiliary_loss_mlp": 0.01064122, + "balance_loss_clip": 1.04661739, + "balance_loss_mlp": 1.03887343, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.8885225022047798, + "language_loss": 0.69026119, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71223634, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 2.6233925819396973 + }, + { + "auxiliary_loss_clip": 0.01133788, + "auxiliary_loss_mlp": 0.0105965, + "balance_loss_clip": 1.04544425, + "balance_loss_mlp": 1.03417516, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 2.5774356988573324, + "language_loss": 0.80675697, + "learning_rate": 3.952705511055698e-06, + "loss": 0.82869136, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 2.7354419231414795 + }, + { + "auxiliary_loss_clip": 0.01150795, + "auxiliary_loss_mlp": 0.01053157, + "balance_loss_clip": 1.05183601, + "balance_loss_mlp": 1.03203356, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.6875568052162826, + "language_loss": 0.92858982, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95062935, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 2.678264856338501 + }, + { + "auxiliary_loss_clip": 0.01156227, + "auxiliary_loss_mlp": 0.01054708, + "balance_loss_clip": 1.05048251, + "balance_loss_mlp": 1.03143811, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 2.008175144318696, + "language_loss": 0.88825214, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.91036147, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 2.7915239334106445 + }, + { + "auxiliary_loss_clip": 0.01134576, + "auxiliary_loss_mlp": 0.01061406, + "balance_loss_clip": 1.0466907, + "balance_loss_mlp": 1.03650331, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 1.964642484374834, + "language_loss": 0.76992536, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79188514, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.597893714904785 + }, + { + "auxiliary_loss_clip": 0.0110875, + "auxiliary_loss_mlp": 0.01073358, + "balance_loss_clip": 1.04093421, + "balance_loss_mlp": 1.04838324, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 2.3726127075846684, + "language_loss": 0.7780261, + "learning_rate": 3.952368137989871e-06, + "loss": 0.79984713, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.617095708847046 + }, + { + "auxiliary_loss_clip": 0.01130685, + "auxiliary_loss_mlp": 0.01057998, + "balance_loss_clip": 1.04603755, + "balance_loss_mlp": 1.03417957, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.8777350930661394, + "language_loss": 0.8557182, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.87760508, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 2.678297996520996 + }, + { + "auxiliary_loss_clip": 0.01159227, + "auxiliary_loss_mlp": 0.010584, + "balance_loss_clip": 1.05026948, + "balance_loss_mlp": 1.03528571, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.211474209275523, + "language_loss": 0.80456173, + "learning_rate": 3.952199007240184e-06, + "loss": 0.826738, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.505467414855957 + }, + { + "auxiliary_loss_clip": 0.01156083, + "auxiliary_loss_mlp": 0.01048802, + "balance_loss_clip": 1.04678941, + "balance_loss_mlp": 1.0270462, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 4.090127858117508, + "language_loss": 0.85397512, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87602401, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.538219690322876 + }, + { + "auxiliary_loss_clip": 0.0115987, + "auxiliary_loss_mlp": 0.01059977, + "balance_loss_clip": 1.04933453, + "balance_loss_mlp": 1.03791106, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 2.22470317363761, + "language_loss": 0.85394675, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87614518, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.554896116256714 + }, + { + "auxiliary_loss_clip": 0.01144875, + "auxiliary_loss_mlp": 0.00752096, + "balance_loss_clip": 1.04740465, + "balance_loss_mlp": 1.00008488, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.9879733544861464, + "language_loss": 0.83198142, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85095114, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.576404094696045 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01056236, + "balance_loss_clip": 1.0470047, + "balance_loss_mlp": 1.03452754, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.8671090963055206, + "language_loss": 0.84123075, + "learning_rate": 3.951859857435534e-06, + "loss": 0.8632893, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.5599472522735596 + }, + { + "auxiliary_loss_clip": 0.01150471, + "auxiliary_loss_mlp": 0.01054315, + "balance_loss_clip": 1.04485166, + "balance_loss_mlp": 1.03177226, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.554816219235143, + "language_loss": 0.75839233, + "learning_rate": 3.951774884939523e-06, + "loss": 0.78044015, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.596428394317627 + }, + { + "auxiliary_loss_clip": 0.01113672, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.05090141, + "balance_loss_mlp": 1.02945733, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.6574833792921384, + "language_loss": 0.7786786, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80034089, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.689486265182495 + }, + { + "auxiliary_loss_clip": 0.01146849, + "auxiliary_loss_mlp": 0.01051538, + "balance_loss_clip": 1.05004084, + "balance_loss_mlp": 1.02714777, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 2.3004475472368457, + "language_loss": 0.86399829, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88598216, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.608246326446533 + }, + { + "auxiliary_loss_clip": 0.01151829, + "auxiliary_loss_mlp": 0.01053025, + "balance_loss_clip": 1.05297077, + "balance_loss_mlp": 1.03090024, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.04528528549594, + "language_loss": 0.82988387, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85193247, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.5942513942718506 + }, + { + "auxiliary_loss_clip": 0.01129258, + "auxiliary_loss_mlp": 0.01054962, + "balance_loss_clip": 1.0472219, + "balance_loss_mlp": 1.03373027, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.4922517323421145, + "language_loss": 0.78597188, + "learning_rate": 3.951434254872751e-06, + "loss": 0.80781406, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.631326675415039 + }, + { + "auxiliary_loss_clip": 0.01148223, + "auxiliary_loss_mlp": 0.0105472, + "balance_loss_clip": 1.04522324, + "balance_loss_mlp": 1.0321542, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.1842756463642714, + "language_loss": 0.73322624, + "learning_rate": 3.951348912351521e-06, + "loss": 0.7552557, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.5578055381774902 + }, + { + "auxiliary_loss_clip": 0.01138871, + "auxiliary_loss_mlp": 0.01068818, + "balance_loss_clip": 1.04476404, + "balance_loss_mlp": 1.04511964, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.741702929336541, + "language_loss": 0.72703141, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74910831, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 4.077885150909424 + }, + { + "auxiliary_loss_clip": 0.0113045, + "auxiliary_loss_mlp": 0.01067073, + "balance_loss_clip": 1.04563916, + "balance_loss_mlp": 1.04140687, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 2.5364620798214395, + "language_loss": 0.77984232, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80181754, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.6490256786346436 + }, + { + "auxiliary_loss_clip": 0.01139682, + "auxiliary_loss_mlp": 0.01058323, + "balance_loss_clip": 1.04558992, + "balance_loss_mlp": 1.03559017, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.035972793722003, + "language_loss": 0.69769466, + "learning_rate": 3.951092440828715e-06, + "loss": 0.71967465, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.610805034637451 + }, + { + "auxiliary_loss_clip": 0.01165581, + "auxiliary_loss_mlp": 0.01058623, + "balance_loss_clip": 1.04845321, + "balance_loss_mlp": 1.0358181, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.080136925757945, + "language_loss": 0.77424538, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79648745, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.5263280868530273 + }, + { + "auxiliary_loss_clip": 0.01122183, + "auxiliary_loss_mlp": 0.01047387, + "balance_loss_clip": 1.04783607, + "balance_loss_mlp": 1.02550018, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 2.230053481631954, + "language_loss": 0.72667986, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74837554, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 4.1720592975616455 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_clip": 1.04516327, + "balance_loss_mlp": 1.02222395, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.767653644303018, + "language_loss": 0.88743901, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90938503, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 5.545159101486206 + }, + { + "auxiliary_loss_clip": 0.0115159, + "auxiliary_loss_mlp": 0.01041068, + "balance_loss_clip": 1.04637206, + "balance_loss_mlp": 1.01877594, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 1.834947860897161, + "language_loss": 0.80634081, + "learning_rate": 3.950749443014801e-06, + "loss": 0.82826746, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.5600028038024902 + }, + { + "auxiliary_loss_clip": 0.01151652, + "auxiliary_loss_mlp": 0.0105701, + "balance_loss_clip": 1.04630709, + "balance_loss_mlp": 1.03295326, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 9.447632363262393, + "language_loss": 0.85453236, + "learning_rate": 3.95066350862165e-06, + "loss": 0.87661904, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 2.510509490966797 + }, + { + "auxiliary_loss_clip": 0.01125678, + "auxiliary_loss_mlp": 0.01052971, + "balance_loss_clip": 1.04640365, + "balance_loss_mlp": 1.03090572, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.6702870255229518, + "language_loss": 0.8097074, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83149385, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.7186849117279053 + }, + { + "auxiliary_loss_clip": 0.01148623, + "auxiliary_loss_mlp": 0.01070109, + "balance_loss_clip": 1.04513144, + "balance_loss_mlp": 1.04774559, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9897920504567672, + "language_loss": 0.82472199, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84690928, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 2.514021873474121 + }, + { + "auxiliary_loss_clip": 0.01136305, + "auxiliary_loss_mlp": 0.00752056, + "balance_loss_clip": 1.04317737, + "balance_loss_mlp": 1.0001111, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7698601399676417, + "language_loss": 0.68337595, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.7022596, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 2.616781711578369 + }, + { + "auxiliary_loss_clip": 0.01039707, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.01522946, + "balance_loss_mlp": 1.00420856, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 1.2560470298675848, + "language_loss": 0.60818923, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62866151, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 3.096405029296875 + }, + { + "auxiliary_loss_clip": 0.01122244, + "auxiliary_loss_mlp": 0.01054846, + "balance_loss_clip": 1.04544485, + "balance_loss_mlp": 1.03059912, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 1.7176397601495466, + "language_loss": 0.733814, + "learning_rate": 3.950232727180833e-06, + "loss": 0.7555849, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 2.6952991485595703 + }, + { + "auxiliary_loss_clip": 0.01138662, + "auxiliary_loss_mlp": 0.01058775, + "balance_loss_clip": 1.0491786, + "balance_loss_mlp": 1.03769839, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 2.225023565893935, + "language_loss": 0.8414802, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86345458, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 2.656973123550415 + }, + { + "auxiliary_loss_clip": 0.01044061, + "auxiliary_loss_mlp": 0.01005264, + "balance_loss_clip": 1.01016235, + "balance_loss_mlp": 1.00226009, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7298277819513458, + "language_loss": 0.55737877, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57787204, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 3.0507149696350098 + }, + { + "auxiliary_loss_clip": 0.01146167, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.04351473, + "balance_loss_mlp": 1.02365506, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.413203732790574, + "language_loss": 0.9001382, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92205012, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 2.6571414470672607 + }, + { + "auxiliary_loss_clip": 0.01015504, + "auxiliary_loss_mlp": 0.0074992, + "balance_loss_clip": 1.01915026, + "balance_loss_mlp": 0.99999809, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.7938861316436955, + "language_loss": 0.63725555, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65490985, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 3.359930992126465 + }, + { + "auxiliary_loss_clip": 0.01148568, + "auxiliary_loss_mlp": 0.01056884, + "balance_loss_clip": 1.04698753, + "balance_loss_mlp": 1.03332853, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9384093525533161, + "language_loss": 0.8795898, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90164429, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 2.5808753967285156 + }, + { + "auxiliary_loss_clip": 0.01144038, + "auxiliary_loss_mlp": 0.01057026, + "balance_loss_clip": 1.04985976, + "balance_loss_mlp": 1.03506804, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9496674325153758, + "language_loss": 0.82033277, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84234339, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.5789248943328857 + }, + { + "auxiliary_loss_clip": 0.01152255, + "auxiliary_loss_mlp": 0.00751828, + "balance_loss_clip": 1.04753697, + "balance_loss_mlp": 1.00008321, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 1.9096368900963805, + "language_loss": 0.79484856, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81388938, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.601391315460205 + }, + { + "auxiliary_loss_clip": 0.01163614, + "auxiliary_loss_mlp": 0.01055039, + "balance_loss_clip": 1.05173349, + "balance_loss_mlp": 1.03410554, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.6398603898495987, + "language_loss": 0.81093001, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83311659, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 2.506157398223877 + }, + { + "auxiliary_loss_clip": 0.01158247, + "auxiliary_loss_mlp": 0.01049867, + "balance_loss_clip": 1.04580641, + "balance_loss_mlp": 1.02811098, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9866211122304964, + "language_loss": 0.80803806, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83011913, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.560345411300659 + }, + { + "auxiliary_loss_clip": 0.01152561, + "auxiliary_loss_mlp": 0.01061452, + "balance_loss_clip": 1.04989886, + "balance_loss_mlp": 1.03933883, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.6754881030956492, + "language_loss": 0.88842553, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91056567, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 2.5287230014801025 + }, + { + "auxiliary_loss_clip": 0.01142152, + "auxiliary_loss_mlp": 0.01052913, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.0294168, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.0475834525471233, + "language_loss": 0.85426134, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.876212, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.5836966037750244 + }, + { + "auxiliary_loss_clip": 0.01059681, + "auxiliary_loss_mlp": 0.01010044, + "balance_loss_clip": 1.01705968, + "balance_loss_mlp": 1.00700402, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.8998106194854959, + "language_loss": 0.60826695, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62896419, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 3.1342709064483643 + }, + { + "auxiliary_loss_clip": 0.01132471, + "auxiliary_loss_mlp": 0.01054912, + "balance_loss_clip": 1.04368675, + "balance_loss_mlp": 1.03184485, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.925100060572068, + "language_loss": 0.85226321, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87413704, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.5964138507843018 + }, + { + "auxiliary_loss_clip": 0.01126973, + "auxiliary_loss_mlp": 0.01057217, + "balance_loss_clip": 1.04546785, + "balance_loss_mlp": 1.03330314, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.919915027544868, + "language_loss": 0.80084133, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82268322, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.581061840057373 + }, + { + "auxiliary_loss_clip": 0.01153902, + "auxiliary_loss_mlp": 0.01057843, + "balance_loss_clip": 1.04966164, + "balance_loss_mlp": 1.03441834, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 3.3424155816695644, + "language_loss": 0.83463776, + "learning_rate": 3.948929291548443e-06, + "loss": 0.8567552, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 2.6302788257598877 + }, + { + "auxiliary_loss_clip": 0.01134299, + "auxiliary_loss_mlp": 0.01061717, + "balance_loss_clip": 1.04414499, + "balance_loss_mlp": 1.03681421, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.1317083280540348, + "language_loss": 0.88742435, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.90938449, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 2.6047115325927734 + }, + { + "auxiliary_loss_clip": 0.0116043, + "auxiliary_loss_mlp": 0.01053908, + "balance_loss_clip": 1.05226398, + "balance_loss_mlp": 1.03049541, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.754805165008505, + "language_loss": 0.70166373, + "learning_rate": 3.948754243526191e-06, + "loss": 0.7238071, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.5738167762756348 + }, + { + "auxiliary_loss_clip": 0.01127647, + "auxiliary_loss_mlp": 0.01053292, + "balance_loss_clip": 1.04935026, + "balance_loss_mlp": 1.03022492, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.2186322835866217, + "language_loss": 0.78904277, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81085217, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.6321911811828613 + }, + { + "auxiliary_loss_clip": 0.01159472, + "auxiliary_loss_mlp": 0.01066771, + "balance_loss_clip": 1.05581427, + "balance_loss_mlp": 1.04394269, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.7228257956769422, + "language_loss": 0.69853485, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.7207973, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.5608503818511963 + }, + { + "auxiliary_loss_clip": 0.01092606, + "auxiliary_loss_mlp": 0.01070637, + "balance_loss_clip": 1.04533339, + "balance_loss_mlp": 1.04530525, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.9158926558663043, + "language_loss": 0.78741574, + "learning_rate": 3.948491117273956e-06, + "loss": 0.80904818, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.795877456665039 + }, + { + "auxiliary_loss_clip": 0.01130438, + "auxiliary_loss_mlp": 0.01061769, + "balance_loss_clip": 1.04583871, + "balance_loss_mlp": 1.03684211, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 3.752167414395232, + "language_loss": 0.77153003, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79345214, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 2.68310546875 + }, + { + "auxiliary_loss_clip": 0.01167758, + "auxiliary_loss_mlp": 0.01055431, + "balance_loss_clip": 1.05132294, + "balance_loss_mlp": 1.03138673, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.6814837295753633, + "language_loss": 0.77777976, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80001163, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.6021392345428467 + }, + { + "auxiliary_loss_clip": 0.0117059, + "auxiliary_loss_mlp": 0.01069409, + "balance_loss_clip": 1.0507772, + "balance_loss_mlp": 1.04526949, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.0957668162720506, + "language_loss": 0.85471129, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87711132, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.574596881866455 + }, + { + "auxiliary_loss_clip": 0.01157843, + "auxiliary_loss_mlp": 0.01055637, + "balance_loss_clip": 1.04634798, + "balance_loss_mlp": 1.03310633, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.6152762801704572, + "language_loss": 0.76886308, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79099786, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.540492296218872 + }, + { + "auxiliary_loss_clip": 0.01049294, + "auxiliary_loss_mlp": 0.01009172, + "balance_loss_clip": 1.01630545, + "balance_loss_mlp": 1.00625169, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7857897113333195, + "language_loss": 0.6073525, + "learning_rate": 3.948051095825149e-06, + "loss": 0.6279372, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.2112395763397217 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01063225, + "balance_loss_clip": 1.04484117, + "balance_loss_mlp": 1.03933537, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 1.9860307199978562, + "language_loss": 0.76819742, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79009116, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.600525140762329 + }, + { + "auxiliary_loss_clip": 0.01108516, + "auxiliary_loss_mlp": 0.01060001, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.03606427, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.1833165370054877, + "language_loss": 0.73102194, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75270712, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 4.065597057342529 + }, + { + "auxiliary_loss_clip": 0.01154195, + "auxiliary_loss_mlp": 0.00751913, + "balance_loss_clip": 1.04773021, + "balance_loss_mlp": 1.00004816, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.0869641732304167, + "language_loss": 0.7918036, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81086469, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.5923962593078613 + }, + { + "auxiliary_loss_clip": 0.01160588, + "auxiliary_loss_mlp": 0.01066587, + "balance_loss_clip": 1.04692841, + "balance_loss_mlp": 1.04568982, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.1891697656094187, + "language_loss": 0.81563783, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83790958, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.559743881225586 + }, + { + "auxiliary_loss_clip": 0.01154771, + "auxiliary_loss_mlp": 0.01055416, + "balance_loss_clip": 1.04970074, + "balance_loss_mlp": 1.03375626, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.0781431363571663, + "language_loss": 0.86308354, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88518542, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.6200942993164062 + }, + { + "auxiliary_loss_clip": 0.01143853, + "auxiliary_loss_mlp": 0.01055802, + "balance_loss_clip": 1.0479728, + "balance_loss_mlp": 1.03393936, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 2.620981879450972, + "language_loss": 0.86483604, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88683259, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.5804507732391357 + }, + { + "auxiliary_loss_clip": 0.01137879, + "auxiliary_loss_mlp": 0.01054998, + "balance_loss_clip": 1.04730392, + "balance_loss_mlp": 1.0324558, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.70920423876092, + "language_loss": 0.89380038, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91572917, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.6010971069335938 + }, + { + "auxiliary_loss_clip": 0.01052167, + "auxiliary_loss_mlp": 0.01001681, + "balance_loss_clip": 1.0097549, + "balance_loss_mlp": 0.99862903, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7986825957599681, + "language_loss": 0.52953625, + "learning_rate": 3.947343220426312e-06, + "loss": 0.5500747, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 6.086484909057617 + }, + { + "auxiliary_loss_clip": 0.01164438, + "auxiliary_loss_mlp": 0.00751784, + "balance_loss_clip": 1.05077362, + "balance_loss_mlp": 0.99995244, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.6259193422793368, + "language_loss": 0.76999962, + "learning_rate": 3.947254403670641e-06, + "loss": 0.78916186, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 2.4880242347717285 + }, + { + "auxiliary_loss_clip": 0.01133431, + "auxiliary_loss_mlp": 0.01059381, + "balance_loss_clip": 1.04530048, + "balance_loss_mlp": 1.03338158, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.68669417486897, + "language_loss": 0.93661219, + "learning_rate": 3.947165513074889e-06, + "loss": 0.95854032, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 2.5922253131866455 + }, + { + "auxiliary_loss_clip": 0.01152696, + "auxiliary_loss_mlp": 0.01053935, + "balance_loss_clip": 1.04721928, + "balance_loss_mlp": 1.03141618, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 1.8836652349831406, + "language_loss": 0.87703109, + "learning_rate": 3.947076548642425e-06, + "loss": 0.89909744, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 2.4922828674316406 + }, + { + "auxiliary_loss_clip": 0.01106615, + "auxiliary_loss_mlp": 0.01060014, + "balance_loss_clip": 1.04264545, + "balance_loss_mlp": 1.03724551, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.8248570843235383, + "language_loss": 0.74247611, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76414239, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 2.6623809337615967 + }, + { + "auxiliary_loss_clip": 0.01041152, + "auxiliary_loss_mlp": 0.01010491, + "balance_loss_clip": 1.01642537, + "balance_loss_mlp": 1.0075587, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.7594208655970212, + "language_loss": 0.61099046, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63150686, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 3.18839955329895 + }, + { + "auxiliary_loss_clip": 0.01136987, + "auxiliary_loss_mlp": 0.0105439, + "balance_loss_clip": 1.04466999, + "balance_loss_mlp": 1.0313592, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 3.151619791389988, + "language_loss": 0.62001598, + "learning_rate": 3.946809212358516e-06, + "loss": 0.64192975, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 2.6736226081848145 + }, + { + "auxiliary_loss_clip": 0.01126967, + "auxiliary_loss_mlp": 0.01058426, + "balance_loss_clip": 1.05071855, + "balance_loss_mlp": 1.03500175, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 3.4665178359438986, + "language_loss": 0.81455851, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83641243, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 2.7496495246887207 + }, + { + "auxiliary_loss_clip": 0.01153391, + "auxiliary_loss_mlp": 0.01056088, + "balance_loss_clip": 1.04886127, + "balance_loss_mlp": 1.03365302, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 2.0762388462109174, + "language_loss": 0.72111756, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74321234, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 2.622558832168579 + }, + { + "auxiliary_loss_clip": 0.01132867, + "auxiliary_loss_mlp": 0.01060921, + "balance_loss_clip": 1.04548144, + "balance_loss_mlp": 1.03775835, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 2.284682704007163, + "language_loss": 0.86664999, + "learning_rate": 3.94654121166582e-06, + "loss": 0.88858783, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 2.615590810775757 + }, + { + "auxiliary_loss_clip": 0.01152154, + "auxiliary_loss_mlp": 0.01052317, + "balance_loss_clip": 1.04571629, + "balance_loss_mlp": 1.03188431, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.7489034723724453, + "language_loss": 0.87873209, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90077686, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.7349700927734375 + }, + { + "auxiliary_loss_clip": 0.01142791, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.04821408, + "balance_loss_mlp": 1.03251708, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 2.015329161283785, + "language_loss": 0.83181185, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85380369, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 2.6367452144622803 + }, + { + "auxiliary_loss_clip": 0.01146398, + "auxiliary_loss_mlp": 0.01051608, + "balance_loss_clip": 1.04927444, + "balance_loss_mlp": 1.02906609, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.4954777110765964, + "language_loss": 0.66691208, + "learning_rate": 3.946272546655801e-06, + "loss": 0.68889225, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 2.7111473083496094 + }, + { + "auxiliary_loss_clip": 0.01123548, + "auxiliary_loss_mlp": 0.01068734, + "balance_loss_clip": 1.04313326, + "balance_loss_mlp": 1.04550028, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.6958936379888407, + "language_loss": 0.7560395, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77796233, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 2.6094465255737305 + }, + { + "auxiliary_loss_clip": 0.01108956, + "auxiliary_loss_mlp": 0.010576, + "balance_loss_clip": 1.04293537, + "balance_loss_mlp": 1.03273296, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.8220260317328096, + "language_loss": 0.87210846, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89377403, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 2.6184518337249756 + }, + { + "auxiliary_loss_clip": 0.01111183, + "auxiliary_loss_mlp": 0.01058699, + "balance_loss_clip": 1.04323816, + "balance_loss_mlp": 1.03379607, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 1.804171697054664, + "language_loss": 0.78962731, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81132609, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 2.6807456016540527 + }, + { + "auxiliary_loss_clip": 0.0111194, + "auxiliary_loss_mlp": 0.01062681, + "balance_loss_clip": 1.04246461, + "balance_loss_mlp": 1.03932822, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7440004948106576, + "language_loss": 0.86558592, + "learning_rate": 3.945913293418447e-06, + "loss": 0.8873322, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.708857774734497 + }, + { + "auxiliary_loss_clip": 0.01145475, + "auxiliary_loss_mlp": 0.01055519, + "balance_loss_clip": 1.04699492, + "balance_loss_mlp": 1.03276157, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 2.39667259295631, + "language_loss": 0.8224237, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84443361, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 2.627596139907837 + }, + { + "auxiliary_loss_clip": 0.01167056, + "auxiliary_loss_mlp": 0.01053188, + "balance_loss_clip": 1.04828644, + "balance_loss_mlp": 1.02981114, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 2.2668579776616893, + "language_loss": 0.81052822, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83273059, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.5299808979034424 + }, + { + "auxiliary_loss_clip": 0.01130637, + "auxiliary_loss_mlp": 0.01048764, + "balance_loss_clip": 1.048087, + "balance_loss_mlp": 1.02631736, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 2.8380011140792583, + "language_loss": 0.76092374, + "learning_rate": 3.945643078691637e-06, + "loss": 0.7827177, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.674274206161499 + }, + { + "auxiliary_loss_clip": 0.01144981, + "auxiliary_loss_mlp": 0.01052789, + "balance_loss_clip": 1.05015123, + "balance_loss_mlp": 1.02999663, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.882655706746538, + "language_loss": 0.80533707, + "learning_rate": 3.945552859553516e-06, + "loss": 0.82731479, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.582495927810669 + }, + { + "auxiliary_loss_clip": 0.01155069, + "auxiliary_loss_mlp": 0.01056571, + "balance_loss_clip": 1.04874754, + "balance_loss_mlp": 1.03312206, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8771391662268544, + "language_loss": 0.77226877, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79438525, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 2.583169937133789 + }, + { + "auxiliary_loss_clip": 0.01162094, + "auxiliary_loss_mlp": 0.01053997, + "balance_loss_clip": 1.05071509, + "balance_loss_mlp": 1.03007174, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.4296541837403476, + "language_loss": 0.7751478, + "learning_rate": 3.945372199954019e-06, + "loss": 0.79730868, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.6634812355041504 + }, + { + "auxiliary_loss_clip": 0.01139019, + "auxiliary_loss_mlp": 0.01053345, + "balance_loss_clip": 1.04662728, + "balance_loss_mlp": 1.03139889, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.206281569691862, + "language_loss": 0.94825327, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97017694, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 2.6000239849090576 + }, + { + "auxiliary_loss_clip": 0.01003716, + "auxiliary_loss_mlp": 0.01007367, + "balance_loss_clip": 1.00847673, + "balance_loss_mlp": 1.00424421, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8971840645923125, + "language_loss": 0.55067229, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57078314, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.1299431324005127 + }, + { + "auxiliary_loss_clip": 0.01166117, + "auxiliary_loss_mlp": 0.01053323, + "balance_loss_clip": 1.04911947, + "balance_loss_mlp": 1.03017235, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.168975839691327, + "language_loss": 0.84263289, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86482728, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.574871778488159 + }, + { + "auxiliary_loss_clip": 0.01028056, + "auxiliary_loss_mlp": 0.01009042, + "balance_loss_clip": 1.0144043, + "balance_loss_mlp": 1.00539458, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7692331747152757, + "language_loss": 0.60452592, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62489688, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.173914909362793 + }, + { + "auxiliary_loss_clip": 0.01126322, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.04282975, + "balance_loss_mlp": 1.03264773, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.2490353740692512, + "language_loss": 0.85931659, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88114446, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.6224424839019775 + }, + { + "auxiliary_loss_clip": 0.01151391, + "auxiliary_loss_mlp": 0.01048598, + "balance_loss_clip": 1.04948962, + "balance_loss_mlp": 1.02667546, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.4698453723831557, + "language_loss": 0.72732389, + "learning_rate": 3.944828450816369e-06, + "loss": 0.74932384, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.6081345081329346 + }, + { + "auxiliary_loss_clip": 0.01139196, + "auxiliary_loss_mlp": 0.00751893, + "balance_loss_clip": 1.04942405, + "balance_loss_mlp": 0.99997669, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.6477608623577478, + "language_loss": 0.91046238, + "learning_rate": 3.944737567821709e-06, + "loss": 0.92937326, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.5991108417510986 + }, + { + "auxiliary_loss_clip": 0.01102654, + "auxiliary_loss_mlp": 0.01061209, + "balance_loss_clip": 1.04514277, + "balance_loss_mlp": 1.03718817, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 1.9044350440248672, + "language_loss": 0.87913513, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90077382, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.69521164894104 + }, + { + "auxiliary_loss_clip": 0.01149942, + "auxiliary_loss_mlp": 0.01057165, + "balance_loss_clip": 1.04693437, + "balance_loss_mlp": 1.03462207, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.8449143788599667, + "language_loss": 0.79570502, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81777608, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.5892691612243652 + }, + { + "auxiliary_loss_clip": 0.01139038, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.04964006, + "balance_loss_mlp": 1.03348947, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 1.9055182143458294, + "language_loss": 0.73626077, + "learning_rate": 3.944464476383668e-06, + "loss": 0.75822353, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.6469688415527344 + }, + { + "auxiliary_loss_clip": 0.01113403, + "auxiliary_loss_mlp": 0.01055275, + "balance_loss_clip": 1.04769325, + "balance_loss_mlp": 1.03231525, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.7313690727330635, + "language_loss": 0.86881268, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89049947, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.583256244659424 + }, + { + "auxiliary_loss_clip": 0.01151602, + "auxiliary_loss_mlp": 0.01062574, + "balance_loss_clip": 1.04885042, + "balance_loss_mlp": 1.04162908, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 1.5379523146040524, + "language_loss": 0.72406441, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74620616, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.715752124786377 + }, + { + "auxiliary_loss_clip": 0.01156227, + "auxiliary_loss_mlp": 0.01063641, + "balance_loss_clip": 1.04808664, + "balance_loss_mlp": 1.03970349, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 1.839809129916066, + "language_loss": 0.90692931, + "learning_rate": 3.944190721337053e-06, + "loss": 0.92912805, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 4.0717713832855225 + }, + { + "auxiliary_loss_clip": 0.0114943, + "auxiliary_loss_mlp": 0.01058355, + "balance_loss_clip": 1.0453794, + "balance_loss_mlp": 1.03606296, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 2.7393327828887015, + "language_loss": 0.75583887, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77791679, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.6910758018493652 + }, + { + "auxiliary_loss_clip": 0.01142734, + "auxiliary_loss_mlp": 0.01068001, + "balance_loss_clip": 1.04693103, + "balance_loss_mlp": 1.0439446, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 1.8789814886046823, + "language_loss": 0.85806185, + "learning_rate": 3.944007849347342e-06, + "loss": 0.88016915, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.6350624561309814 + }, + { + "auxiliary_loss_clip": 0.01100785, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_clip": 1.04126489, + "balance_loss_mlp": 1.04731131, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.111482039848324, + "language_loss": 0.82501733, + "learning_rate": 3.943916302775292e-06, + "loss": 0.84673488, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.717273712158203 + }, + { + "auxiliary_loss_clip": 0.01152682, + "auxiliary_loss_mlp": 0.01054392, + "balance_loss_clip": 1.05000734, + "balance_loss_mlp": 1.03090763, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 2.566192098884565, + "language_loss": 0.73245972, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75453055, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 4.182495594024658 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01048959, + "balance_loss_clip": 1.04976654, + "balance_loss_mlp": 1.02758491, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.170568339171937, + "language_loss": 0.92944342, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.95144296, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 5.614846467971802 + }, + { + "auxiliary_loss_clip": 0.01119401, + "auxiliary_loss_mlp": 0.0105326, + "balance_loss_clip": 1.04301286, + "balance_loss_mlp": 1.03018141, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 1.628692073870711, + "language_loss": 0.79422861, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81595516, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 2.6373701095581055 + }, + { + "auxiliary_loss_clip": 0.01112301, + "auxiliary_loss_mlp": 0.0106651, + "balance_loss_clip": 1.04530096, + "balance_loss_mlp": 1.0389843, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 1.8063860779579886, + "language_loss": 0.8076545, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82944262, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 2.6398305892944336 + }, + { + "auxiliary_loss_clip": 0.01041171, + "auxiliary_loss_mlp": 0.01021625, + "balance_loss_clip": 1.01755357, + "balance_loss_mlp": 1.01843059, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9518923129608025, + "language_loss": 0.6714232, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69205117, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 2.919466495513916 + }, + { + "auxiliary_loss_clip": 0.01153035, + "auxiliary_loss_mlp": 0.01056425, + "balance_loss_clip": 1.04592204, + "balance_loss_mlp": 1.03457439, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.787894718153272, + "language_loss": 0.77647913, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.79857379, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.5477964878082275 + }, + { + "auxiliary_loss_clip": 0.0113017, + "auxiliary_loss_mlp": 0.01057284, + "balance_loss_clip": 1.04716742, + "balance_loss_mlp": 1.03548086, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.7230993071194802, + "language_loss": 0.75022542, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77209997, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 2.8569653034210205 + }, + { + "auxiliary_loss_clip": 0.01118501, + "auxiliary_loss_mlp": 0.01058674, + "balance_loss_clip": 1.04592395, + "balance_loss_mlp": 1.03486764, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 1.897300332973555, + "language_loss": 0.74932945, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77110124, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 2.6351606845855713 + }, + { + "auxiliary_loss_clip": 0.01130811, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_clip": 1.04693699, + "balance_loss_mlp": 1.0325582, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 2.276826729869509, + "language_loss": 0.73657405, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.75844014, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 2.6255838871002197 + }, + { + "auxiliary_loss_clip": 0.01137336, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.04545951, + "balance_loss_mlp": 1.03289056, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.334673666522059, + "language_loss": 0.84743077, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86935186, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.50905704498291 + }, + { + "auxiliary_loss_clip": 0.01142308, + "auxiliary_loss_mlp": 0.01052736, + "balance_loss_clip": 1.04930711, + "balance_loss_mlp": 1.03014636, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 1.808503482571198, + "language_loss": 0.70691347, + "learning_rate": 3.942904426157406e-06, + "loss": 0.72886389, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.6188738346099854 + }, + { + "auxiliary_loss_clip": 0.01138528, + "auxiliary_loss_mlp": 0.01055168, + "balance_loss_clip": 1.04709518, + "balance_loss_mlp": 1.03071833, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.2366090772954714, + "language_loss": 0.81412506, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83606195, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 2.5618512630462646 + }, + { + "auxiliary_loss_clip": 0.01064637, + "auxiliary_loss_mlp": 0.01048164, + "balance_loss_clip": 1.0431025, + "balance_loss_mlp": 1.02608645, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 5.984463862488825, + "language_loss": 0.7591939, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78032196, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 2.852867841720581 + }, + { + "auxiliary_loss_clip": 0.01113611, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_clip": 1.04688907, + "balance_loss_mlp": 1.03091717, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.928107942699276, + "language_loss": 0.8261168, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84777153, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 3.510993480682373 + }, + { + "auxiliary_loss_clip": 0.01124625, + "auxiliary_loss_mlp": 0.01055311, + "balance_loss_clip": 1.04925203, + "balance_loss_mlp": 1.03441381, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.5102873909533012, + "language_loss": 0.83208704, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85388637, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 2.618687391281128 + }, + { + "auxiliary_loss_clip": 0.0114117, + "auxiliary_loss_mlp": 0.01055761, + "balance_loss_clip": 1.05057371, + "balance_loss_mlp": 1.03381455, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.152589385704083, + "language_loss": 0.76567185, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78764117, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 2.6043708324432373 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01048125, + "balance_loss_clip": 1.04610443, + "balance_loss_mlp": 1.02691817, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.8345612465742074, + "language_loss": 0.75115031, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77286065, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.6590003967285156 + }, + { + "auxiliary_loss_clip": 0.0115042, + "auxiliary_loss_mlp": 0.01054204, + "balance_loss_clip": 1.04809213, + "balance_loss_mlp": 1.0315069, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 2.2359012221995997, + "language_loss": 0.7872045, + "learning_rate": 3.94225586284712e-06, + "loss": 0.80925077, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 2.6918671131134033 + }, + { + "auxiliary_loss_clip": 0.01150077, + "auxiliary_loss_mlp": 0.01057768, + "balance_loss_clip": 1.04884064, + "balance_loss_mlp": 1.03584504, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 2.398898332768462, + "language_loss": 0.70648336, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72856188, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.578878164291382 + }, + { + "auxiliary_loss_clip": 0.01135024, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.04484999, + "balance_loss_mlp": 1.03207493, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 1.9552382350046467, + "language_loss": 0.81726527, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83918965, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 2.6594672203063965 + }, + { + "auxiliary_loss_clip": 0.01166218, + "auxiliary_loss_mlp": 0.01059183, + "balance_loss_clip": 1.0487529, + "balance_loss_mlp": 1.035115, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 2.136650699107753, + "language_loss": 0.75048006, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77273405, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 2.54264235496521 + }, + { + "auxiliary_loss_clip": 0.01127546, + "auxiliary_loss_mlp": 0.01056631, + "balance_loss_clip": 1.04697371, + "balance_loss_mlp": 1.03433919, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.6012152546538467, + "language_loss": 0.77136266, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79320449, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.6343204975128174 + }, + { + "auxiliary_loss_clip": 0.01139226, + "auxiliary_loss_mlp": 0.01056407, + "balance_loss_clip": 1.05249035, + "balance_loss_mlp": 1.03456807, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.0456761309890488, + "language_loss": 0.85886192, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88081831, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 2.6032967567443848 + }, + { + "auxiliary_loss_clip": 0.01140979, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.04798865, + "balance_loss_mlp": 1.03385222, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 2.8331832304272324, + "language_loss": 0.75477064, + "learning_rate": 3.941697079021942e-06, + "loss": 0.7767539, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 2.5836739540100098 + }, + { + "auxiliary_loss_clip": 0.01114471, + "auxiliary_loss_mlp": 0.01061742, + "balance_loss_clip": 1.04955363, + "balance_loss_mlp": 1.04113042, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.9083601079432502, + "language_loss": 0.87548041, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89724255, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 2.6536998748779297 + }, + { + "auxiliary_loss_clip": 0.0112317, + "auxiliary_loss_mlp": 0.01055395, + "balance_loss_clip": 1.04559731, + "balance_loss_mlp": 1.03157711, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 1.897213403687128, + "language_loss": 0.75669169, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77847737, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.601436138153076 + }, + { + "auxiliary_loss_clip": 0.01150351, + "auxiliary_loss_mlp": 0.01054043, + "balance_loss_clip": 1.04913294, + "balance_loss_mlp": 1.03349113, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 1.8195065029165003, + "language_loss": 0.78733939, + "learning_rate": 3.941416693065451e-06, + "loss": 0.80938339, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.7097220420837402 + }, + { + "auxiliary_loss_clip": 0.01161749, + "auxiliary_loss_mlp": 0.01065239, + "balance_loss_clip": 1.04810226, + "balance_loss_mlp": 1.04287529, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 1.8761161301767124, + "language_loss": 0.82535279, + "learning_rate": 3.941323083837794e-06, + "loss": 0.84762263, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.5764379501342773 + }, + { + "auxiliary_loss_clip": 0.01140961, + "auxiliary_loss_mlp": 0.01064036, + "balance_loss_clip": 1.04875255, + "balance_loss_mlp": 1.04198229, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.5481317635464236, + "language_loss": 0.70296204, + "learning_rate": 3.941229400994971e-06, + "loss": 0.725012, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 2.788945436477661 + }, + { + "auxiliary_loss_clip": 0.01144013, + "auxiliary_loss_mlp": 0.01065793, + "balance_loss_clip": 1.05225646, + "balance_loss_mlp": 1.04285741, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.394032497530021, + "language_loss": 0.84407151, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86616957, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.7157790660858154 + }, + { + "auxiliary_loss_clip": 0.01157163, + "auxiliary_loss_mlp": 0.01058218, + "balance_loss_clip": 1.04554224, + "balance_loss_mlp": 1.03553236, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.6902232404974828, + "language_loss": 0.71400642, + "learning_rate": 3.941041814478041e-06, + "loss": 0.73616028, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.5352706909179688 + }, + { + "auxiliary_loss_clip": 0.01136867, + "auxiliary_loss_mlp": 0.01058376, + "balance_loss_clip": 1.04463124, + "balance_loss_mlp": 1.03615558, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 2.0453539996203025, + "language_loss": 0.81673414, + "learning_rate": 3.940947910811047e-06, + "loss": 0.83868659, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.566387414932251 + }, + { + "auxiliary_loss_clip": 0.01133117, + "auxiliary_loss_mlp": 0.01061047, + "balance_loss_clip": 1.04911053, + "balance_loss_mlp": 1.03834927, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3168310470079416, + "language_loss": 0.92314649, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94508821, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.5866806507110596 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01051313, + "balance_loss_clip": 1.04990935, + "balance_loss_mlp": 1.02984381, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 1.9050016245872772, + "language_loss": 0.7910589, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81307119, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.5713462829589844 + }, + { + "auxiliary_loss_clip": 0.01100746, + "auxiliary_loss_mlp": 0.01058982, + "balance_loss_clip": 1.04755282, + "balance_loss_mlp": 1.03568864, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.017675756925595, + "language_loss": 0.75901771, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78061497, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.68627667427063 + }, + { + "auxiliary_loss_clip": 0.01127923, + "auxiliary_loss_mlp": 0.01056085, + "balance_loss_clip": 1.05100083, + "balance_loss_mlp": 1.03177834, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 1.8442887243605013, + "language_loss": 0.83433974, + "learning_rate": 3.940571560169328e-06, + "loss": 0.85617983, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 4.350549697875977 + }, + { + "auxiliary_loss_clip": 0.01116945, + "auxiliary_loss_mlp": 0.01054192, + "balance_loss_clip": 1.04783165, + "balance_loss_mlp": 1.02974248, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 3.129351738614942, + "language_loss": 0.68635654, + "learning_rate": 3.940477288533302e-06, + "loss": 0.70806789, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.7525579929351807 + }, + { + "auxiliary_loss_clip": 0.01143644, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.04936242, + "balance_loss_mlp": 1.03975797, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 2.152954067171016, + "language_loss": 0.76576215, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78782201, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.6031482219696045 + }, + { + "auxiliary_loss_clip": 0.01167379, + "auxiliary_loss_mlp": 0.01063437, + "balance_loss_clip": 1.05151391, + "balance_loss_mlp": 1.04232526, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7702771093375878, + "language_loss": 0.7993052, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82161337, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.5520131587982178 + }, + { + "auxiliary_loss_clip": 0.01124319, + "auxiliary_loss_mlp": 0.01055253, + "balance_loss_clip": 1.042907, + "balance_loss_mlp": 1.03303218, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.496906730643159, + "language_loss": 0.78681093, + "learning_rate": 3.940194032140976e-06, + "loss": 0.80860662, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 2.955587863922119 + }, + { + "auxiliary_loss_clip": 0.01146563, + "auxiliary_loss_mlp": 0.01049981, + "balance_loss_clip": 1.04963505, + "balance_loss_mlp": 1.02809453, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 1.9489810726148504, + "language_loss": 0.91547441, + "learning_rate": 3.940099466194054e-06, + "loss": 0.9374398, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 4.108685493469238 + }, + { + "auxiliary_loss_clip": 0.01135531, + "auxiliary_loss_mlp": 0.01053005, + "balance_loss_clip": 1.04641724, + "balance_loss_mlp": 1.02879357, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.1133561700343533, + "language_loss": 0.77285182, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79473722, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 4.214943885803223 + }, + { + "auxiliary_loss_clip": 0.01142846, + "auxiliary_loss_mlp": 0.01058831, + "balance_loss_clip": 1.0484941, + "balance_loss_mlp": 1.03494191, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.260845105353967, + "language_loss": 0.89151013, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91352683, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.594459295272827 + }, + { + "auxiliary_loss_clip": 0.01081849, + "auxiliary_loss_mlp": 0.00752095, + "balance_loss_clip": 1.03908253, + "balance_loss_mlp": 0.99994582, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.1782753609792262, + "language_loss": 0.78114951, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.7994889, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 2.7878236770629883 + }, + { + "auxiliary_loss_clip": 0.01040662, + "auxiliary_loss_mlp": 0.01011681, + "balance_loss_clip": 1.01885486, + "balance_loss_mlp": 1.00853395, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.7577118995502156, + "language_loss": 0.6051231, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62564653, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 3.3322367668151855 + }, + { + "auxiliary_loss_clip": 0.01139587, + "auxiliary_loss_mlp": 0.01048821, + "balance_loss_clip": 1.04702091, + "balance_loss_mlp": 1.02730393, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 1.6982918865328938, + "language_loss": 0.80122769, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82311177, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 2.605079174041748 + }, + { + "auxiliary_loss_clip": 0.01112494, + "auxiliary_loss_mlp": 0.01055893, + "balance_loss_clip": 1.04154098, + "balance_loss_mlp": 1.03097868, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7445758124446422, + "language_loss": 0.80209893, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.8237828, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 2.622464418411255 + }, + { + "auxiliary_loss_clip": 0.01145934, + "auxiliary_loss_mlp": 0.01055896, + "balance_loss_clip": 1.04737544, + "balance_loss_mlp": 1.03365135, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.7020931461485973, + "language_loss": 0.76523256, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78725088, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 2.5490105152130127 + }, + { + "auxiliary_loss_clip": 0.01164553, + "auxiliary_loss_mlp": 0.01058366, + "balance_loss_clip": 1.0508163, + "balance_loss_mlp": 1.0357517, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6197627587881118, + "language_loss": 0.77594572, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79817486, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.6015868186950684 + }, + { + "auxiliary_loss_clip": 0.00976297, + "auxiliary_loss_mlp": 0.01008128, + "balance_loss_clip": 1.01449633, + "balance_loss_mlp": 1.00490904, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6830667654924452, + "language_loss": 0.57931054, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59915483, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 3.6768410205841064 + }, + { + "auxiliary_loss_clip": 0.01118096, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.04466867, + "balance_loss_mlp": 1.02773404, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.4068416760162306, + "language_loss": 0.86375773, + "learning_rate": 3.939149761035749e-06, + "loss": 0.88542283, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 3.6531763076782227 + }, + { + "auxiliary_loss_clip": 0.01117256, + "auxiliary_loss_mlp": 0.00751858, + "balance_loss_clip": 1.0445075, + "balance_loss_mlp": 0.99989915, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.7103483738132736, + "language_loss": 0.61945069, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.63814181, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 2.786959648132324 + }, + { + "auxiliary_loss_clip": 0.01053989, + "auxiliary_loss_mlp": 0.01002595, + "balance_loss_clip": 1.02181315, + "balance_loss_mlp": 0.9994238, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8819916746077274, + "language_loss": 0.5700295, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.5905953, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.021591901779175 + }, + { + "auxiliary_loss_clip": 0.01125331, + "auxiliary_loss_mlp": 0.01059454, + "balance_loss_clip": 1.04649103, + "balance_loss_mlp": 1.03724527, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6761690901710917, + "language_loss": 0.88475543, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90660328, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 2.7500123977661133 + }, + { + "auxiliary_loss_clip": 0.01168275, + "auxiliary_loss_mlp": 0.01062908, + "balance_loss_clip": 1.0502739, + "balance_loss_mlp": 1.03883934, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 7.328964877145804, + "language_loss": 0.76059854, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78291041, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.5430219173431396 + }, + { + "auxiliary_loss_clip": 0.01104317, + "auxiliary_loss_mlp": 0.01064783, + "balance_loss_clip": 1.04412651, + "balance_loss_mlp": 1.03918898, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 3.82329974513788, + "language_loss": 0.82900536, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85069644, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 2.764180898666382 + }, + { + "auxiliary_loss_clip": 0.01142243, + "auxiliary_loss_mlp": 0.00751888, + "balance_loss_clip": 1.05150628, + "balance_loss_mlp": 0.99993753, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.367847455126678, + "language_loss": 0.76325959, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78220093, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.607632875442505 + }, + { + "auxiliary_loss_clip": 0.01054378, + "auxiliary_loss_mlp": 0.01002342, + "balance_loss_clip": 1.01292729, + "balance_loss_mlp": 0.99907523, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.9355414266079195, + "language_loss": 0.57438177, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59494895, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.1460394859313965 + }, + { + "auxiliary_loss_clip": 0.01126385, + "auxiliary_loss_mlp": 0.01068933, + "balance_loss_clip": 1.04488897, + "balance_loss_mlp": 1.04406643, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.4746305402412483, + "language_loss": 0.83709013, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8590433, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 2.635389566421509 + }, + { + "auxiliary_loss_clip": 0.0109176, + "auxiliary_loss_mlp": 0.00751823, + "balance_loss_clip": 1.04393458, + "balance_loss_mlp": 0.9999494, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 1.8126986879989837, + "language_loss": 0.87493587, + "learning_rate": 3.938288739241625e-06, + "loss": 0.8933717, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.7568938732147217 + }, + { + "auxiliary_loss_clip": 0.01130344, + "auxiliary_loss_mlp": 0.00751823, + "balance_loss_clip": 1.05961919, + "balance_loss_mlp": 0.99992085, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 1.946091698310913, + "language_loss": 0.83965695, + "learning_rate": 3.938192702604417e-06, + "loss": 0.85847861, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.751958131790161 + }, + { + "auxiliary_loss_clip": 0.01118921, + "auxiliary_loss_mlp": 0.00751764, + "balance_loss_clip": 1.04515088, + "balance_loss_mlp": 0.99993062, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 1.9539672643149624, + "language_loss": 0.66920614, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.68791294, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.6591362953186035 + }, + { + "auxiliary_loss_clip": 0.0114929, + "auxiliary_loss_mlp": 0.01048594, + "balance_loss_clip": 1.05164909, + "balance_loss_mlp": 1.02614737, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.1554395998612326, + "language_loss": 0.91718, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93915886, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 2.5540151596069336 + }, + { + "auxiliary_loss_clip": 0.01117037, + "auxiliary_loss_mlp": 0.01053444, + "balance_loss_clip": 1.04834783, + "balance_loss_mlp": 1.03139031, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 1.8579467263679657, + "language_loss": 0.79347163, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81517643, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.6804277896881104 + }, + { + "auxiliary_loss_clip": 0.0113843, + "auxiliary_loss_mlp": 0.01059465, + "balance_loss_clip": 1.04718947, + "balance_loss_mlp": 1.03731596, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 4.983893598782886, + "language_loss": 0.78839409, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81037307, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.5887255668640137 + }, + { + "auxiliary_loss_clip": 0.01145611, + "auxiliary_loss_mlp": 0.01060366, + "balance_loss_clip": 1.04967797, + "balance_loss_mlp": 1.03726304, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 2.3728064056853255, + "language_loss": 0.86048305, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88254273, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.631654977798462 + }, + { + "auxiliary_loss_clip": 0.01132868, + "auxiliary_loss_mlp": 0.01054234, + "balance_loss_clip": 1.04713321, + "balance_loss_mlp": 1.03100038, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 2.1853408420218914, + "language_loss": 1.00486231, + "learning_rate": 3.937614939483143e-06, + "loss": 1.02673328, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 2.6162402629852295 + }, + { + "auxiliary_loss_clip": 0.01150384, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.05147409, + "balance_loss_mlp": 1.03587055, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.3352662448001773, + "language_loss": 0.84987491, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87195688, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.5950727462768555 + }, + { + "auxiliary_loss_clip": 0.0116115, + "auxiliary_loss_mlp": 0.01056393, + "balance_loss_clip": 1.04678702, + "balance_loss_mlp": 1.03301609, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.7427185833302947, + "language_loss": 0.78915131, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81132668, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.5591514110565186 + }, + { + "auxiliary_loss_clip": 0.01154697, + "auxiliary_loss_mlp": 0.01044879, + "balance_loss_clip": 1.04991436, + "balance_loss_mlp": 1.02227712, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.349264660238439, + "language_loss": 0.82874167, + "learning_rate": 3.937325065966719e-06, + "loss": 0.85073745, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 2.5625176429748535 + }, + { + "auxiliary_loss_clip": 0.01159901, + "auxiliary_loss_mlp": 0.01063327, + "balance_loss_clip": 1.04762077, + "balance_loss_mlp": 1.04201198, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 1.8980169355648635, + "language_loss": 0.78289169, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80512404, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 2.5558149814605713 + }, + { + "auxiliary_loss_clip": 0.01162623, + "auxiliary_loss_mlp": 0.01064552, + "balance_loss_clip": 1.05036592, + "balance_loss_mlp": 1.04008985, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.657690805715739, + "language_loss": 0.75075203, + "learning_rate": 3.937131449631859e-06, + "loss": 0.77302384, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.555781364440918 + }, + { + "auxiliary_loss_clip": 0.01156147, + "auxiliary_loss_mlp": 0.00751763, + "balance_loss_clip": 1.05027866, + "balance_loss_mlp": 0.99996251, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.2693224888043164, + "language_loss": 0.7879585, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80703759, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.6305654048919678 + }, + { + "auxiliary_loss_clip": 0.01109878, + "auxiliary_loss_mlp": 0.01068031, + "balance_loss_clip": 1.04294527, + "balance_loss_mlp": 1.04510725, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.9451955163224075, + "language_loss": 0.70815456, + "learning_rate": 3.936937539472126e-06, + "loss": 0.72993362, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.6637933254241943 + }, + { + "auxiliary_loss_clip": 0.0113064, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_clip": 1.04713559, + "balance_loss_mlp": 1.02355623, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.1802957732956827, + "language_loss": 0.76264727, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.7844348, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.7076168060302734 + }, + { + "auxiliary_loss_clip": 0.01093136, + "auxiliary_loss_mlp": 0.01064832, + "balance_loss_clip": 1.0439384, + "balance_loss_mlp": 1.04193223, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.4845069840954401, + "language_loss": 0.85144794, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87302762, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 4.15510368347168 + }, + { + "auxiliary_loss_clip": 0.01093743, + "auxiliary_loss_mlp": 0.01052197, + "balance_loss_clip": 1.04101062, + "balance_loss_mlp": 1.02862906, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.6793103604262707, + "language_loss": 0.74928528, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77074468, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.695070743560791 + }, + { + "auxiliary_loss_clip": 0.01097851, + "auxiliary_loss_mlp": 0.01059889, + "balance_loss_clip": 1.03981352, + "balance_loss_mlp": 1.03675056, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 3.9358535232596625, + "language_loss": 0.81756401, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83914143, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 2.6508076190948486 + }, + { + "auxiliary_loss_clip": 0.0111416, + "auxiliary_loss_mlp": 0.01077961, + "balance_loss_clip": 1.04412246, + "balance_loss_mlp": 1.05158019, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.1282533181157812, + "language_loss": 0.7381525, + "learning_rate": 3.936451478782111e-06, + "loss": 0.76007366, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.596043825149536 + }, + { + "auxiliary_loss_clip": 0.01136654, + "auxiliary_loss_mlp": 0.01046758, + "balance_loss_clip": 1.04575706, + "balance_loss_mlp": 1.02608728, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 2.0688861280137565, + "language_loss": 0.81245136, + "learning_rate": 3.936354046338046e-06, + "loss": 0.8342855, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 2.6213786602020264 + }, + { + "auxiliary_loss_clip": 0.01122225, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_clip": 1.04408228, + "balance_loss_mlp": 1.03128684, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4879486659100976, + "language_loss": 0.85560393, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87737346, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 5.680144548416138 + }, + { + "auxiliary_loss_clip": 0.01134149, + "auxiliary_loss_mlp": 0.01065219, + "balance_loss_clip": 1.04840374, + "balance_loss_mlp": 1.04348707, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 1.7768707661559253, + "language_loss": 0.775401, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79739463, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 4.296800374984741 + }, + { + "auxiliary_loss_clip": 0.01159453, + "auxiliary_loss_mlp": 0.01051224, + "balance_loss_clip": 1.04921293, + "balance_loss_mlp": 1.03045785, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.46983843279311, + "language_loss": 0.73237193, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.75447875, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 2.630401611328125 + }, + { + "auxiliary_loss_clip": 0.01167939, + "auxiliary_loss_mlp": 0.0105117, + "balance_loss_clip": 1.05156958, + "balance_loss_mlp": 1.02976036, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 2.463597554657655, + "language_loss": 0.66363537, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68582642, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 2.696683883666992 + }, + { + "auxiliary_loss_clip": 0.0113456, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.04644847, + "balance_loss_mlp": 1.03871703, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.8439347011799678, + "language_loss": 0.81756842, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83952028, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 2.6301333904266357 + }, + { + "auxiliary_loss_clip": 0.01141932, + "auxiliary_loss_mlp": 0.01054209, + "balance_loss_clip": 1.0472858, + "balance_loss_mlp": 1.03214383, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.7007059191422442, + "language_loss": 0.90941709, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93137848, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.590785026550293 + }, + { + "auxiliary_loss_clip": 0.01116751, + "auxiliary_loss_mlp": 0.01055778, + "balance_loss_clip": 1.04873538, + "balance_loss_mlp": 1.03230524, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 1.9168130302592625, + "language_loss": 0.76359421, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78531945, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 2.899348497390747 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_clip": 1.05021548, + "balance_loss_mlp": 1.02777016, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.71509169563108, + "language_loss": 0.86132598, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88314044, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.758556604385376 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.00751759, + "balance_loss_clip": 1.04630184, + "balance_loss_mlp": 0.99993861, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 2.2425994315425792, + "language_loss": 0.80715811, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82615584, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 2.6746339797973633 + }, + { + "auxiliary_loss_clip": 0.01122348, + "auxiliary_loss_mlp": 0.01053372, + "balance_loss_clip": 1.04825056, + "balance_loss_mlp": 1.03349972, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.9670657615437226, + "language_loss": 0.78937781, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.81113499, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 2.7747929096221924 + }, + { + "auxiliary_loss_clip": 0.01140174, + "auxiliary_loss_mlp": 0.01051657, + "balance_loss_clip": 1.05048907, + "balance_loss_mlp": 1.03022337, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.586578556707318, + "language_loss": 0.7932508, + "learning_rate": 3.935277444103342e-06, + "loss": 0.8151691, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 2.7542760372161865 + }, + { + "auxiliary_loss_clip": 0.01160335, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_clip": 1.04946566, + "balance_loss_mlp": 1.03607368, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 1.8809489855168888, + "language_loss": 0.85003686, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87221479, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 2.6138699054718018 + }, + { + "auxiliary_loss_clip": 0.01098666, + "auxiliary_loss_mlp": 0.01054839, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.03102124, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 3.000566010440547, + "language_loss": 0.63415855, + "learning_rate": 3.935080744080564e-06, + "loss": 0.65569365, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.696457862854004 + }, + { + "auxiliary_loss_clip": 0.011314, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_clip": 1.04621625, + "balance_loss_mlp": 1.03172827, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 2.0144922473420674, + "language_loss": 0.74107456, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76292026, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.699829578399658 + }, + { + "auxiliary_loss_clip": 0.01120857, + "auxiliary_loss_mlp": 0.01048375, + "balance_loss_clip": 1.04530001, + "balance_loss_mlp": 1.02617836, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.6675353733297307, + "language_loss": 0.72943532, + "learning_rate": 3.934883750543966e-06, + "loss": 0.75112766, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 2.635830879211426 + }, + { + "auxiliary_loss_clip": 0.01115531, + "auxiliary_loss_mlp": 0.01055321, + "balance_loss_clip": 1.04510832, + "balance_loss_mlp": 1.03374422, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 4.312642636420752, + "language_loss": 0.82838112, + "learning_rate": 3.93478514371732e-06, + "loss": 0.85008967, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.6465516090393066 + }, + { + "auxiliary_loss_clip": 0.01128689, + "auxiliary_loss_mlp": 0.01055352, + "balance_loss_clip": 1.0508877, + "balance_loss_mlp": 1.03441882, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.25237993147563, + "language_loss": 0.84463382, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86647427, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.704810380935669 + }, + { + "auxiliary_loss_clip": 0.0113039, + "auxiliary_loss_mlp": 0.01049251, + "balance_loss_clip": 1.04966569, + "balance_loss_mlp": 1.02587438, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 3.904754360915119, + "language_loss": 0.71773916, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73953557, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.666837692260742 + }, + { + "auxiliary_loss_clip": 0.01144894, + "auxiliary_loss_mlp": 0.01060778, + "balance_loss_clip": 1.0471437, + "balance_loss_mlp": 1.03878379, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.0342624101756597, + "language_loss": 0.7305305, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.7525872, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 2.633188247680664 + }, + { + "auxiliary_loss_clip": 0.01104508, + "auxiliary_loss_mlp": 0.01055645, + "balance_loss_clip": 1.0481509, + "balance_loss_mlp": 1.03341198, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.7368682176971488, + "language_loss": 0.67048794, + "learning_rate": 3.934389982775706e-06, + "loss": 0.69208944, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.8176515102386475 + }, + { + "auxiliary_loss_clip": 0.01139766, + "auxiliary_loss_mlp": 0.01057598, + "balance_loss_clip": 1.05141532, + "balance_loss_mlp": 1.03597307, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.321393484963102, + "language_loss": 0.73140413, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75337774, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 2.6138243675231934 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.00751585, + "balance_loss_clip": 1.0471251, + "balance_loss_mlp": 0.99992323, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 2.2653944840786027, + "language_loss": 0.74231189, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76105314, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.6493313312530518 + }, + { + "auxiliary_loss_clip": 0.01163275, + "auxiliary_loss_mlp": 0.01055413, + "balance_loss_clip": 1.0523243, + "balance_loss_mlp": 1.03272736, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.41364295004601, + "language_loss": 0.82502466, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84721154, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 2.5071096420288086 + }, + { + "auxiliary_loss_clip": 0.01132083, + "auxiliary_loss_mlp": 0.01053639, + "balance_loss_clip": 1.04686832, + "balance_loss_mlp": 1.03197896, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 1.896365239060294, + "language_loss": 0.75846714, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78032434, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.6565463542938232 + }, + { + "auxiliary_loss_clip": 0.01120942, + "auxiliary_loss_mlp": 0.01049424, + "balance_loss_clip": 1.04224467, + "balance_loss_mlp": 1.02912223, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.5671317003467855, + "language_loss": 0.79334915, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81505275, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.678161859512329 + }, + { + "auxiliary_loss_clip": 0.0113066, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.04789209, + "balance_loss_mlp": 1.02381229, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.4565528504185763, + "language_loss": 0.79489553, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81665856, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.6999640464782715 + }, + { + "auxiliary_loss_clip": 0.01121532, + "auxiliary_loss_mlp": 0.01055158, + "balance_loss_clip": 1.04446089, + "balance_loss_mlp": 1.03426075, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.122110025406234, + "language_loss": 0.87668729, + "learning_rate": 3.933695627210554e-06, + "loss": 0.89845425, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.600175619125366 + }, + { + "auxiliary_loss_clip": 0.01109087, + "auxiliary_loss_mlp": 0.01054397, + "balance_loss_clip": 1.04193354, + "balance_loss_mlp": 1.03304696, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.7985657300687181, + "language_loss": 0.76208103, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78371584, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.769244432449341 + }, + { + "auxiliary_loss_clip": 0.01045463, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.01636243, + "balance_loss_mlp": 1.02592468, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8509981829476606, + "language_loss": 0.55012786, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57086968, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.135929584503174 + }, + { + "auxiliary_loss_clip": 0.01054439, + "auxiliary_loss_mlp": 0.0101795, + "balance_loss_clip": 1.01504421, + "balance_loss_mlp": 1.01492262, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7572481423757075, + "language_loss": 0.55370897, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57443285, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.129791259765625 + }, + { + "auxiliary_loss_clip": 0.0113993, + "auxiliary_loss_mlp": 0.0105601, + "balance_loss_clip": 1.04841316, + "balance_loss_mlp": 1.03352737, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.602837064726642, + "language_loss": 0.8424511, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86441052, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.6199798583984375 + }, + { + "auxiliary_loss_clip": 0.01112899, + "auxiliary_loss_mlp": 0.0105373, + "balance_loss_clip": 1.04817307, + "balance_loss_mlp": 1.03134274, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 1.6436981150849672, + "language_loss": 0.8882035, + "learning_rate": 3.933197459096614e-06, + "loss": 0.90986985, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 2.875441312789917 + }, + { + "auxiliary_loss_clip": 0.0103525, + "auxiliary_loss_mlp": 0.01002723, + "balance_loss_clip": 1.01608753, + "balance_loss_mlp": 0.99968356, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6853850735837881, + "language_loss": 0.55524671, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57562643, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.1611764430999756 + }, + { + "auxiliary_loss_clip": 0.01140567, + "auxiliary_loss_mlp": 0.01066047, + "balance_loss_clip": 1.05115247, + "balance_loss_mlp": 1.04364777, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 1.9595958271808718, + "language_loss": 0.9052034, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92726952, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 2.6580824851989746 + }, + { + "auxiliary_loss_clip": 0.01040826, + "auxiliary_loss_mlp": 0.01003042, + "balance_loss_clip": 1.0117979, + "balance_loss_mlp": 1.00009751, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7167497355040795, + "language_loss": 0.59866309, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61910188, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.125927448272705 + }, + { + "auxiliary_loss_clip": 0.01150969, + "auxiliary_loss_mlp": 0.01050634, + "balance_loss_clip": 1.04752445, + "balance_loss_mlp": 1.02888989, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 4.1166256788949624, + "language_loss": 0.80838984, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83040583, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 4.077435493469238 + }, + { + "auxiliary_loss_clip": 0.01123787, + "auxiliary_loss_mlp": 0.01055956, + "balance_loss_clip": 1.0506928, + "balance_loss_mlp": 1.03265107, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.3598557210379636, + "language_loss": 0.90926373, + "learning_rate": 3.932697458306779e-06, + "loss": 0.93106121, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 2.802109956741333 + }, + { + "auxiliary_loss_clip": 0.01114971, + "auxiliary_loss_mlp": 0.01057665, + "balance_loss_clip": 1.04834807, + "balance_loss_mlp": 1.03296506, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.139130450476117, + "language_loss": 0.63386989, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65559626, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 2.7118706703186035 + }, + { + "auxiliary_loss_clip": 0.01119751, + "auxiliary_loss_mlp": 0.01053567, + "balance_loss_clip": 1.04553354, + "balance_loss_mlp": 1.03270578, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.8183279761760232, + "language_loss": 0.72957158, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75130475, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 2.7756428718566895 + }, + { + "auxiliary_loss_clip": 0.01144072, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_clip": 1.04884851, + "balance_loss_mlp": 1.03104496, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.8150047342306683, + "language_loss": 0.78219754, + "learning_rate": 3.93239657834556e-06, + "loss": 0.80415875, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 2.607102632522583 + }, + { + "auxiliary_loss_clip": 0.0113763, + "auxiliary_loss_mlp": 0.01068552, + "balance_loss_clip": 1.04908442, + "balance_loss_mlp": 1.04714179, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 1.9630546938226145, + "language_loss": 0.71733767, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73939949, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 5.6621174812316895 + }, + { + "auxiliary_loss_clip": 0.01168523, + "auxiliary_loss_mlp": 0.00751481, + "balance_loss_clip": 1.05553198, + "balance_loss_mlp": 0.99991024, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.828165759201452, + "language_loss": 0.78873682, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80793691, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 4.139099836349487 + }, + { + "auxiliary_loss_clip": 0.0113391, + "auxiliary_loss_mlp": 0.01049388, + "balance_loss_clip": 1.04544663, + "balance_loss_mlp": 1.02790618, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.6476008873070183, + "language_loss": 0.88297844, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90481138, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 2.5934653282165527 + }, + { + "auxiliary_loss_clip": 0.01109001, + "auxiliary_loss_mlp": 0.01052469, + "balance_loss_clip": 1.04536796, + "balance_loss_mlp": 1.0320009, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 1.7212395234019615, + "language_loss": 0.90524578, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92686051, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 2.6486220359802246 + }, + { + "auxiliary_loss_clip": 0.01135316, + "auxiliary_loss_mlp": 0.01055659, + "balance_loss_clip": 1.04573154, + "balance_loss_mlp": 1.03600097, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 1.9390084301070512, + "language_loss": 0.85999978, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88190955, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 2.661095380783081 + }, + { + "auxiliary_loss_clip": 0.01100292, + "auxiliary_loss_mlp": 0.00751732, + "balance_loss_clip": 1.04570365, + "balance_loss_mlp": 0.99991035, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 2.6352555950859164, + "language_loss": 0.75050044, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76902062, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.709426164627075 + }, + { + "auxiliary_loss_clip": 0.01160779, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_clip": 1.0505476, + "balance_loss_mlp": 1.03036487, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 1.9919011773284327, + "language_loss": 0.75981164, + "learning_rate": 3.931691960597165e-06, + "loss": 0.78194374, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 2.587444543838501 + }, + { + "auxiliary_loss_clip": 0.01137465, + "auxiliary_loss_mlp": 0.01052435, + "balance_loss_clip": 1.0497452, + "balance_loss_mlp": 1.031847, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.5194752998058452, + "language_loss": 0.76181901, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78371799, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.623020887374878 + }, + { + "auxiliary_loss_clip": 0.0115218, + "auxiliary_loss_mlp": 0.01052273, + "balance_loss_clip": 1.05008399, + "balance_loss_mlp": 1.03236496, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.2868553033192818, + "language_loss": 0.86126626, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88331079, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 2.5093822479248047 + }, + { + "auxiliary_loss_clip": 0.01161865, + "auxiliary_loss_mlp": 0.01052161, + "balance_loss_clip": 1.0494957, + "balance_loss_mlp": 1.03098941, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 1.9316148896398655, + "language_loss": 0.77385533, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79599559, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.6000657081604004 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01051696, + "balance_loss_clip": 1.0549376, + "balance_loss_mlp": 1.03195465, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.682400605271752, + "language_loss": 0.77491003, + "learning_rate": 3.931287710300832e-06, + "loss": 0.7969476, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 2.6018638610839844 + }, + { + "auxiliary_loss_clip": 0.01118514, + "auxiliary_loss_mlp": 0.0075159, + "balance_loss_clip": 1.04504085, + "balance_loss_mlp": 0.99988139, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 2.2502090029158524, + "language_loss": 0.71592963, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73463064, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.6414754390716553 + }, + { + "auxiliary_loss_clip": 0.01148451, + "auxiliary_loss_mlp": 0.01047258, + "balance_loss_clip": 1.04909992, + "balance_loss_mlp": 1.02657509, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.8062746550053217, + "language_loss": 0.81151444, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83347154, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 2.6217164993286133 + }, + { + "auxiliary_loss_clip": 0.01145354, + "auxiliary_loss_mlp": 0.01053563, + "balance_loss_clip": 1.05120993, + "balance_loss_mlp": 1.03364325, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 2.426266975000258, + "language_loss": 0.88753808, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90952718, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.5609545707702637 + }, + { + "auxiliary_loss_clip": 0.01147928, + "auxiliary_loss_mlp": 0.01061546, + "balance_loss_clip": 1.0495075, + "balance_loss_mlp": 1.03940916, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.204302970153261, + "language_loss": 0.72002995, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74212474, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.6352319717407227 + }, + { + "auxiliary_loss_clip": 0.01049502, + "auxiliary_loss_mlp": 0.01007346, + "balance_loss_clip": 1.01123595, + "balance_loss_mlp": 1.00388932, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7751390577367512, + "language_loss": 0.53672445, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55729288, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.0425260066986084 + }, + { + "auxiliary_loss_clip": 0.01136151, + "auxiliary_loss_mlp": 0.01056202, + "balance_loss_clip": 1.04643989, + "balance_loss_mlp": 1.03343296, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 3.237221041432978, + "language_loss": 0.844944, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.86686748, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 2.6434309482574463 + }, + { + "auxiliary_loss_clip": 0.01128055, + "auxiliary_loss_mlp": 0.01064285, + "balance_loss_clip": 1.04775786, + "balance_loss_mlp": 1.04428172, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 1.8691388277943455, + "language_loss": 0.81422234, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.83614576, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.6281158924102783 + }, + { + "auxiliary_loss_clip": 0.01146183, + "auxiliary_loss_mlp": 0.01045838, + "balance_loss_clip": 1.04932165, + "balance_loss_mlp": 1.02410626, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 1.6819929910511775, + "language_loss": 0.83322734, + "learning_rate": 3.93047569469238e-06, + "loss": 0.8551476, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.670149087905884 + }, + { + "auxiliary_loss_clip": 0.0112072, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.04254293, + "balance_loss_mlp": 1.02162802, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 1.9416908359453189, + "language_loss": 0.82592368, + "learning_rate": 3.930373863283608e-06, + "loss": 0.84754777, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.6924593448638916 + }, + { + "auxiliary_loss_clip": 0.01111464, + "auxiliary_loss_mlp": 0.01062613, + "balance_loss_clip": 1.04312444, + "balance_loss_mlp": 1.03974843, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.4102966081419783, + "language_loss": 0.9148699, + "learning_rate": 3.930271958674866e-06, + "loss": 0.9366107, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 2.6962385177612305 + }, + { + "auxiliary_loss_clip": 0.01147223, + "auxiliary_loss_mlp": 0.01047192, + "balance_loss_clip": 1.04616666, + "balance_loss_mlp": 1.02594924, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.204729993265614, + "language_loss": 0.8159821, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83792627, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 2.641164541244507 + }, + { + "auxiliary_loss_clip": 0.01132823, + "auxiliary_loss_mlp": 0.01056475, + "balance_loss_clip": 1.04838586, + "balance_loss_mlp": 1.03536284, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 1.9321087466121016, + "language_loss": 0.75114316, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77303612, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.593873977661133 + }, + { + "auxiliary_loss_clip": 0.01156442, + "auxiliary_loss_mlp": 0.01047035, + "balance_loss_clip": 1.04858649, + "balance_loss_mlp": 1.02754474, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 1.7386267983811825, + "language_loss": 0.88922489, + "learning_rate": 3.929965805687474e-06, + "loss": 0.91125965, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.5578010082244873 + }, + { + "auxiliary_loss_clip": 0.01144677, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.05292308, + "balance_loss_mlp": 1.03851438, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.0242188184928986, + "language_loss": 0.8707, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89274168, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.62044358253479 + }, + { + "auxiliary_loss_clip": 0.01136528, + "auxiliary_loss_mlp": 0.01051018, + "balance_loss_clip": 1.04625547, + "balance_loss_mlp": 1.02756953, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 1.6806313002822366, + "language_loss": 0.6432898, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66516531, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.626988172531128 + }, + { + "auxiliary_loss_clip": 0.01095282, + "auxiliary_loss_mlp": 0.0104963, + "balance_loss_clip": 1.04786277, + "balance_loss_mlp": 1.02951896, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0575911466952452, + "language_loss": 0.73745811, + "learning_rate": 3.929658994039627e-06, + "loss": 0.7589072, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.6745104789733887 + }, + { + "auxiliary_loss_clip": 0.01092956, + "auxiliary_loss_mlp": 0.01057509, + "balance_loss_clip": 1.04780054, + "balance_loss_mlp": 1.03398931, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.2479110779036606, + "language_loss": 0.84970516, + "learning_rate": 3.929556577139446e-06, + "loss": 0.87120986, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.7028846740722656 + }, + { + "auxiliary_loss_clip": 0.01065668, + "auxiliary_loss_mlp": 0.00751492, + "balance_loss_clip": 1.03666687, + "balance_loss_mlp": 0.99985325, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.4930638491469208, + "language_loss": 0.81220949, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83038116, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.776864767074585 + }, + { + "auxiliary_loss_clip": 0.01159169, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.04998314, + "balance_loss_mlp": 1.03141046, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.4948021518872903, + "language_loss": 0.86766654, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88977075, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.618649482727051 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.00751408, + "balance_loss_clip": 1.05247116, + "balance_loss_mlp": 0.99983299, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.3949103858092196, + "language_loss": 0.68556553, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70448709, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.6154541969299316 + }, + { + "auxiliary_loss_clip": 0.01111976, + "auxiliary_loss_mlp": 0.01058166, + "balance_loss_clip": 1.04401445, + "balance_loss_mlp": 1.03563595, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.6242432047356545, + "language_loss": 0.77147353, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79317492, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.6824564933776855 + }, + { + "auxiliary_loss_clip": 0.01116397, + "auxiliary_loss_mlp": 0.01052073, + "balance_loss_clip": 1.04888153, + "balance_loss_mlp": 1.02918434, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.746196353119763, + "language_loss": 0.75601578, + "learning_rate": 3.929043395181631e-06, + "loss": 0.77770042, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 2.742192506790161 + }, + { + "auxiliary_loss_clip": 0.01092508, + "auxiliary_loss_mlp": 0.01044832, + "balance_loss_clip": 1.04721403, + "balance_loss_mlp": 1.02501929, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 1.9538187531059175, + "language_loss": 0.81966293, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84103632, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 2.7863075733184814 + }, + { + "auxiliary_loss_clip": 0.01161189, + "auxiliary_loss_mlp": 0.01057261, + "balance_loss_clip": 1.05068862, + "balance_loss_mlp": 1.03634, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 3.505388014473894, + "language_loss": 0.83207798, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85426253, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.576854944229126 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01056114, + "balance_loss_clip": 1.04938579, + "balance_loss_mlp": 1.03396499, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 1.7815957683554577, + "language_loss": 0.9243806, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94622266, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 4.2582738399505615 + }, + { + "auxiliary_loss_clip": 0.01117235, + "auxiliary_loss_mlp": 0.01060501, + "balance_loss_clip": 1.04467428, + "balance_loss_mlp": 1.04010475, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.591552041940324, + "language_loss": 0.75193477, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77371204, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 2.6721482276916504 + }, + { + "auxiliary_loss_clip": 0.01147035, + "auxiliary_loss_mlp": 0.01057238, + "balance_loss_clip": 1.05073214, + "balance_loss_mlp": 1.03783083, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 2.304766851505272, + "language_loss": 0.71772814, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73977077, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 2.6762566566467285 + }, + { + "auxiliary_loss_clip": 0.01126573, + "auxiliary_loss_mlp": 0.01046272, + "balance_loss_clip": 1.04756558, + "balance_loss_mlp": 1.02579188, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 1.91319551091126, + "language_loss": 0.77035749, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79208589, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 2.6117355823516846 + }, + { + "auxiliary_loss_clip": 0.01145856, + "auxiliary_loss_mlp": 0.01059675, + "balance_loss_clip": 1.04782367, + "balance_loss_mlp": 1.03867078, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.3695308954604712, + "language_loss": 0.88171375, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90376908, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 2.576058864593506 + }, + { + "auxiliary_loss_clip": 0.01124286, + "auxiliary_loss_mlp": 0.0104502, + "balance_loss_clip": 1.04830241, + "balance_loss_mlp": 1.02490938, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.7594166956023214, + "language_loss": 0.81183696, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83353001, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 5.756689548492432 + }, + { + "auxiliary_loss_clip": 0.01132077, + "auxiliary_loss_mlp": 0.01056825, + "balance_loss_clip": 1.04487741, + "balance_loss_mlp": 1.03475904, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 3.5180891077040504, + "language_loss": 0.70156074, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72344977, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 4.221498489379883 + }, + { + "auxiliary_loss_clip": 0.01132843, + "auxiliary_loss_mlp": 0.01046811, + "balance_loss_clip": 1.04707575, + "balance_loss_mlp": 1.02658105, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.5716769809275526, + "language_loss": 0.72382712, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74562365, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.597184419631958 + }, + { + "auxiliary_loss_clip": 0.01126261, + "auxiliary_loss_mlp": 0.00751553, + "balance_loss_clip": 1.04742002, + "balance_loss_mlp": 0.99983668, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.8249488870264474, + "language_loss": 0.7441045, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76288265, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 2.672107696533203 + }, + { + "auxiliary_loss_clip": 0.01155155, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.04809082, + "balance_loss_mlp": 1.02927041, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 1.916763245047427, + "language_loss": 0.79330587, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81536591, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 2.5776357650756836 + }, + { + "auxiliary_loss_clip": 0.01121507, + "auxiliary_loss_mlp": 0.01056256, + "balance_loss_clip": 1.04656827, + "balance_loss_mlp": 1.03470325, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 2.0872658643923474, + "language_loss": 0.76808548, + "learning_rate": 3.927700564817529e-06, + "loss": 0.78986323, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 2.6684417724609375 + }, + { + "auxiliary_loss_clip": 0.01042419, + "auxiliary_loss_mlp": 0.01015657, + "balance_loss_clip": 1.01427758, + "balance_loss_mlp": 1.0122714, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7932351565589102, + "language_loss": 0.55283153, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5734123, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 3.0369627475738525 + }, + { + "auxiliary_loss_clip": 0.0106592, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_clip": 1.04204869, + "balance_loss_mlp": 1.02842784, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 2.520478736392857, + "language_loss": 0.90567815, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92682451, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 2.996739387512207 + }, + { + "auxiliary_loss_clip": 0.0108735, + "auxiliary_loss_mlp": 0.01054139, + "balance_loss_clip": 1.04193521, + "balance_loss_mlp": 1.03368235, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 1.7375866445296808, + "language_loss": 0.85043216, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87184703, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 3.0434203147888184 + }, + { + "auxiliary_loss_clip": 0.01118665, + "auxiliary_loss_mlp": 0.01054503, + "balance_loss_clip": 1.04569328, + "balance_loss_mlp": 1.03498816, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.430105404514203, + "language_loss": 0.76070821, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78243989, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.665254592895508 + }, + { + "auxiliary_loss_clip": 0.01113338, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.0496366, + "balance_loss_mlp": 1.03471661, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 3.6736741184204824, + "language_loss": 0.68173188, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70343274, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 2.8523290157318115 + }, + { + "auxiliary_loss_clip": 0.01155207, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_clip": 1.04894805, + "balance_loss_mlp": 1.0252707, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 1.8519426063244782, + "language_loss": 0.84115672, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86316776, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 2.5816867351531982 + }, + { + "auxiliary_loss_clip": 0.01122565, + "auxiliary_loss_mlp": 0.01051322, + "balance_loss_clip": 1.04378653, + "balance_loss_mlp": 1.03113937, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 1.9191291510214699, + "language_loss": 0.64652538, + "learning_rate": 3.926972384863022e-06, + "loss": 0.66826427, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.5707736015319824 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01039031, + "balance_loss_clip": 1.0444963, + "balance_loss_mlp": 1.01929021, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 1.8478471280431388, + "language_loss": 0.88178587, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90336579, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.6628925800323486 + }, + { + "auxiliary_loss_clip": 0.0109594, + "auxiliary_loss_mlp": 0.01069287, + "balance_loss_clip": 1.04862118, + "balance_loss_mlp": 1.04626763, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.4567793653667938, + "language_loss": 0.73290801, + "learning_rate": 3.926763675749339e-06, + "loss": 0.75456023, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 2.770784854888916 + }, + { + "auxiliary_loss_clip": 0.01153363, + "auxiliary_loss_mlp": 0.01061937, + "balance_loss_clip": 1.04615688, + "balance_loss_mlp": 1.04124188, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 1.7923737144801934, + "language_loss": 0.79621017, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81836319, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.532121419906616 + }, + { + "auxiliary_loss_clip": 0.01135368, + "auxiliary_loss_mlp": 0.01055584, + "balance_loss_clip": 1.05060625, + "balance_loss_mlp": 1.03444862, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 3.852465303347869, + "language_loss": 0.79865742, + "learning_rate": 3.926554674383371e-06, + "loss": 0.82056689, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.7075586318969727 + }, + { + "auxiliary_loss_clip": 0.01052019, + "auxiliary_loss_mlp": 0.01010704, + "balance_loss_clip": 1.01464713, + "balance_loss_mlp": 1.00758111, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8890098364854156, + "language_loss": 0.63372171, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65434891, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.1778008937835693 + }, + { + "auxiliary_loss_clip": 0.01127605, + "auxiliary_loss_mlp": 0.01056775, + "balance_loss_clip": 1.04958272, + "balance_loss_mlp": 1.03492379, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6046704669016354, + "language_loss": 0.84928429, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87112808, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.703395366668701 + }, + { + "auxiliary_loss_clip": 0.01155339, + "auxiliary_loss_mlp": 0.00751201, + "balance_loss_clip": 1.04689133, + "balance_loss_mlp": 0.99981976, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.2355274400380454, + "language_loss": 0.80040693, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.81947231, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.6032748222351074 + }, + { + "auxiliary_loss_clip": 0.01091126, + "auxiliary_loss_mlp": 0.01050942, + "balance_loss_clip": 1.04068708, + "balance_loss_mlp": 1.02924633, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.9534222418547078, + "language_loss": 0.73547518, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75689584, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.693250894546509 + }, + { + "auxiliary_loss_clip": 0.01022995, + "auxiliary_loss_mlp": 0.01012359, + "balance_loss_clip": 1.02562249, + "balance_loss_mlp": 1.00931871, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 1.0705213784813297, + "language_loss": 0.63477349, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65512705, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 3.2377066612243652 + }, + { + "auxiliary_loss_clip": 0.01084981, + "auxiliary_loss_mlp": 0.01058323, + "balance_loss_clip": 1.03927732, + "balance_loss_mlp": 1.03766382, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6772856512848884, + "language_loss": 0.78253579, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80396879, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.77018141746521 + }, + { + "auxiliary_loss_clip": 0.01149431, + "auxiliary_loss_mlp": 0.01049608, + "balance_loss_clip": 1.05050051, + "balance_loss_mlp": 1.02923512, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 1.9328053435537136, + "language_loss": 0.8383649, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86035532, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.614778995513916 + }, + { + "auxiliary_loss_clip": 0.01131042, + "auxiliary_loss_mlp": 0.01050338, + "balance_loss_clip": 1.04329407, + "balance_loss_mlp": 1.02823663, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.7069216735042836, + "language_loss": 0.77742732, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79924107, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.639967918395996 + }, + { + "auxiliary_loss_clip": 0.01117428, + "auxiliary_loss_mlp": 0.01038976, + "balance_loss_clip": 1.04269481, + "balance_loss_mlp": 1.02124977, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.904629984547293, + "language_loss": 0.75149399, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77305806, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.6565701961517334 + }, + { + "auxiliary_loss_clip": 0.01120117, + "auxiliary_loss_mlp": 0.01051909, + "balance_loss_clip": 1.04154468, + "balance_loss_mlp": 1.03107142, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.417837147352324, + "language_loss": 0.92124832, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94296861, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.727501153945923 + }, + { + "auxiliary_loss_clip": 0.011354, + "auxiliary_loss_mlp": 0.01044862, + "balance_loss_clip": 1.04165244, + "balance_loss_mlp": 1.02403641, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.4306227723648584, + "language_loss": 0.77806079, + "learning_rate": 3.925399944279861e-06, + "loss": 0.79986334, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.631516456604004 + }, + { + "auxiliary_loss_clip": 0.01155407, + "auxiliary_loss_mlp": 0.01051492, + "balance_loss_clip": 1.04872108, + "balance_loss_mlp": 1.03160775, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.410034478256584, + "language_loss": 0.82074606, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84281504, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.52420711517334 + }, + { + "auxiliary_loss_clip": 0.01108175, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_clip": 1.04378247, + "balance_loss_mlp": 1.04569077, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.454127602291484, + "language_loss": 0.84519833, + "learning_rate": 3.92518904404875e-06, + "loss": 0.86694098, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 2.671741008758545 + }, + { + "auxiliary_loss_clip": 0.01012879, + "auxiliary_loss_mlp": 0.01017075, + "balance_loss_clip": 1.01702142, + "balance_loss_mlp": 1.01383257, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9315818547950256, + "language_loss": 0.61034763, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63064718, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 2.9354512691497803 + }, + { + "auxiliary_loss_clip": 0.01157748, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_clip": 1.05163026, + "balance_loss_mlp": 1.02619052, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 1.772497680118207, + "language_loss": 0.79216981, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81419492, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 2.5553267002105713 + }, + { + "auxiliary_loss_clip": 0.01131398, + "auxiliary_loss_mlp": 0.01049067, + "balance_loss_clip": 1.04928088, + "balance_loss_mlp": 1.02908778, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 2.19004721994548, + "language_loss": 0.76664817, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.78845286, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.564685583114624 + }, + { + "auxiliary_loss_clip": 0.0112873, + "auxiliary_loss_mlp": 0.01046334, + "balance_loss_clip": 1.04487121, + "balance_loss_mlp": 1.0269146, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 1.9868703768135714, + "language_loss": 0.79306787, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.8148185, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 2.708615303039551 + }, + { + "auxiliary_loss_clip": 0.01153312, + "auxiliary_loss_mlp": 0.00751278, + "balance_loss_clip": 1.04763961, + "balance_loss_mlp": 0.9998309, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.7436803512734482, + "language_loss": 0.78314823, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80219412, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.584294080734253 + }, + { + "auxiliary_loss_clip": 0.0114117, + "auxiliary_loss_mlp": 0.01049433, + "balance_loss_clip": 1.04430223, + "balance_loss_mlp": 1.02859545, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 2.077953905831914, + "language_loss": 0.7000469, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72195292, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 2.658643960952759 + }, + { + "auxiliary_loss_clip": 0.00990688, + "auxiliary_loss_mlp": 0.01009544, + "balance_loss_clip": 1.01764488, + "balance_loss_mlp": 1.00634956, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7694583545152828, + "language_loss": 0.61017191, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63017416, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 4.894258737564087 + }, + { + "auxiliary_loss_clip": 0.01139983, + "auxiliary_loss_mlp": 0.01051841, + "balance_loss_clip": 1.04930139, + "balance_loss_mlp": 1.03181422, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.363770610287759, + "language_loss": 0.93098527, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95290351, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 2.681274175643921 + }, + { + "auxiliary_loss_clip": 0.01136498, + "auxiliary_loss_mlp": 0.01058703, + "balance_loss_clip": 1.04898548, + "balance_loss_mlp": 1.03725684, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8452247931538222, + "language_loss": 0.72493815, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74689013, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 2.6239125728607178 + }, + { + "auxiliary_loss_clip": 0.01116977, + "auxiliary_loss_mlp": 0.01050947, + "balance_loss_clip": 1.04875803, + "balance_loss_mlp": 1.03093195, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 1.816117514017141, + "language_loss": 0.74276316, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76444238, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 4.282822132110596 + }, + { + "auxiliary_loss_clip": 0.01127138, + "auxiliary_loss_mlp": 0.01045291, + "balance_loss_clip": 1.04739058, + "balance_loss_mlp": 1.02583599, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 1.9983393225281472, + "language_loss": 0.86696637, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88869071, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 4.136961221694946 + }, + { + "auxiliary_loss_clip": 0.01108448, + "auxiliary_loss_mlp": 0.01065482, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.04375005, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 1.9773125288145188, + "language_loss": 0.86191815, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88365746, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 4.153559446334839 + }, + { + "auxiliary_loss_clip": 0.01138976, + "auxiliary_loss_mlp": 0.01047736, + "balance_loss_clip": 1.04638982, + "balance_loss_mlp": 1.02792406, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 1.8434416452374516, + "language_loss": 0.79244816, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81431532, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.5781023502349854 + }, + { + "auxiliary_loss_clip": 0.01148971, + "auxiliary_loss_mlp": 0.01058998, + "balance_loss_clip": 1.04802322, + "balance_loss_mlp": 1.03785062, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 2.3553750609261317, + "language_loss": 0.78694665, + "learning_rate": 3.923704567851557e-06, + "loss": 0.8090263, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 2.5782968997955322 + }, + { + "auxiliary_loss_clip": 0.01071905, + "auxiliary_loss_mlp": 0.01064812, + "balance_loss_clip": 1.04061174, + "balance_loss_mlp": 1.04507065, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 2.0124658477063146, + "language_loss": 0.8429935, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86436069, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 2.9362952709198 + }, + { + "auxiliary_loss_clip": 0.01144946, + "auxiliary_loss_mlp": 0.01051806, + "balance_loss_clip": 1.04953492, + "balance_loss_mlp": 1.03149247, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.030018845502923, + "language_loss": 0.81043196, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83239949, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 2.8339099884033203 + }, + { + "auxiliary_loss_clip": 0.01032426, + "auxiliary_loss_mlp": 0.01039506, + "balance_loss_clip": 1.01556242, + "balance_loss_mlp": 1.03647804, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.8258875119208937, + "language_loss": 0.6119113, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63263059, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 3.199350595474243 + }, + { + "auxiliary_loss_clip": 0.0113244, + "auxiliary_loss_mlp": 0.01069025, + "balance_loss_clip": 1.0469712, + "balance_loss_mlp": 1.04780567, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 1.7186239787015538, + "language_loss": 0.75038743, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77240211, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.621101140975952 + }, + { + "auxiliary_loss_clip": 0.01090259, + "auxiliary_loss_mlp": 0.00751469, + "balance_loss_clip": 1.04140544, + "balance_loss_mlp": 0.9998405, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.3642969838655223, + "language_loss": 0.72866082, + "learning_rate": 3.923170932221222e-06, + "loss": 0.74707806, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.7252700328826904 + }, + { + "auxiliary_loss_clip": 0.01115727, + "auxiliary_loss_mlp": 0.01048694, + "balance_loss_clip": 1.04750121, + "balance_loss_mlp": 1.02857137, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 3.3629436341216055, + "language_loss": 0.87040204, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89204627, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 2.7100327014923096 + }, + { + "auxiliary_loss_clip": 0.01108021, + "auxiliary_loss_mlp": 0.01060871, + "balance_loss_clip": 1.04696548, + "balance_loss_mlp": 1.03991425, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.5686803507102092, + "language_loss": 0.77426785, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79595679, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 2.6792845726013184 + }, + { + "auxiliary_loss_clip": 0.01155179, + "auxiliary_loss_mlp": 0.01056547, + "balance_loss_clip": 1.05051446, + "balance_loss_mlp": 1.03811681, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.7277282254529587, + "language_loss": 0.76883107, + "learning_rate": 3.922849875688626e-06, + "loss": 0.79094833, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 2.64823317527771 + }, + { + "auxiliary_loss_clip": 0.01121448, + "auxiliary_loss_mlp": 0.01048562, + "balance_loss_clip": 1.04547834, + "balance_loss_mlp": 1.02812958, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 2.512475003107236, + "language_loss": 0.71967983, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74137986, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.620863199234009 + }, + { + "auxiliary_loss_clip": 0.01125789, + "auxiliary_loss_mlp": 0.01065222, + "balance_loss_clip": 1.04726684, + "balance_loss_mlp": 1.04327559, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.7016495072578084, + "language_loss": 0.82580805, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84771818, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 2.5991404056549072 + }, + { + "auxiliary_loss_clip": 0.01021216, + "auxiliary_loss_mlp": 0.01025892, + "balance_loss_clip": 1.02330041, + "balance_loss_mlp": 1.02304327, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7792687095204016, + "language_loss": 0.61117852, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63164961, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.0551068782806396 + }, + { + "auxiliary_loss_clip": 0.01084205, + "auxiliary_loss_mlp": 0.00751289, + "balance_loss_clip": 1.04151261, + "balance_loss_mlp": 0.99984682, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.1457089853313747, + "language_loss": 0.8622632, + "learning_rate": 3.922420779525586e-06, + "loss": 0.8806181, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.7088513374328613 + }, + { + "auxiliary_loss_clip": 0.01092554, + "auxiliary_loss_mlp": 0.01060112, + "balance_loss_clip": 1.04292917, + "balance_loss_mlp": 1.03822494, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.224247928855581, + "language_loss": 0.66044164, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.68196827, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.68052339553833 + }, + { + "auxiliary_loss_clip": 0.01160978, + "auxiliary_loss_mlp": 0.01047568, + "balance_loss_clip": 1.05047262, + "balance_loss_mlp": 1.02932906, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.8702176178346794, + "language_loss": 0.75697505, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77906048, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.4938974380493164 + }, + { + "auxiliary_loss_clip": 0.01156005, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_clip": 1.04706585, + "balance_loss_mlp": 1.02742767, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 2.127940517289583, + "language_loss": 0.84582102, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86786121, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.5542280673980713 + }, + { + "auxiliary_loss_clip": 0.01129116, + "auxiliary_loss_mlp": 0.01041818, + "balance_loss_clip": 1.04468751, + "balance_loss_mlp": 1.02257729, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 1.9395707943718612, + "language_loss": 0.7638849, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78559422, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.62129282951355 + }, + { + "auxiliary_loss_clip": 0.01160527, + "auxiliary_loss_mlp": 0.01048539, + "balance_loss_clip": 1.05051458, + "balance_loss_mlp": 1.02869058, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.5901937048394172, + "language_loss": 0.79346037, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81555098, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 2.5304946899414062 + }, + { + "auxiliary_loss_clip": 0.01124403, + "auxiliary_loss_mlp": 0.01052613, + "balance_loss_clip": 1.04507244, + "balance_loss_mlp": 1.03200138, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 2.231732096099176, + "language_loss": 0.85910594, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88087606, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.6016485691070557 + }, + { + "auxiliary_loss_clip": 0.01132652, + "auxiliary_loss_mlp": 0.01056835, + "balance_loss_clip": 1.04887891, + "balance_loss_mlp": 1.03782105, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.9753903793935788, + "language_loss": 0.7600382, + "learning_rate": 3.921667054809449e-06, + "loss": 0.78193307, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 2.828080892562866 + }, + { + "auxiliary_loss_clip": 0.01130439, + "auxiliary_loss_mlp": 0.00751397, + "balance_loss_clip": 1.04503655, + "balance_loss_mlp": 0.99984145, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.0356004765082694, + "language_loss": 0.88350022, + "learning_rate": 3.921559088338068e-06, + "loss": 0.9023186, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.6216251850128174 + }, + { + "auxiliary_loss_clip": 0.01140864, + "auxiliary_loss_mlp": 0.01050053, + "balance_loss_clip": 1.04671025, + "balance_loss_mlp": 1.03196907, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.8006772846229135, + "language_loss": 0.67788869, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69979787, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.773658037185669 + }, + { + "auxiliary_loss_clip": 0.01131008, + "auxiliary_loss_mlp": 0.01045522, + "balance_loss_clip": 1.04821241, + "balance_loss_mlp": 1.02665162, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 1.8677424914732923, + "language_loss": 0.69202709, + "learning_rate": 3.921342936802265e-06, + "loss": 0.71379238, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.7809014320373535 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.01046726, + "balance_loss_clip": 1.04338777, + "balance_loss_mlp": 1.02903521, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 1.5072714748163845, + "language_loss": 0.82454854, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84635025, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.636929512023926 + }, + { + "auxiliary_loss_clip": 0.01111726, + "auxiliary_loss_mlp": 0.01060631, + "balance_loss_clip": 1.04015076, + "balance_loss_mlp": 1.04150963, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.0756949631158945, + "language_loss": 0.76550281, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78722644, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 2.6793453693389893 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01052051, + "balance_loss_clip": 1.04342091, + "balance_loss_mlp": 1.03408647, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 1.8372814786622274, + "language_loss": 0.68515825, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70677423, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 2.6512420177459717 + }, + { + "auxiliary_loss_clip": 0.01128731, + "auxiliary_loss_mlp": 0.01054616, + "balance_loss_clip": 1.04823446, + "balance_loss_mlp": 1.03523254, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.7443377242958285, + "language_loss": 0.85245991, + "learning_rate": 3.920909759473295e-06, + "loss": 0.87429339, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 2.6231188774108887 + }, + { + "auxiliary_loss_clip": 0.01027037, + "auxiliary_loss_mlp": 0.00749118, + "balance_loss_clip": 1.00997543, + "balance_loss_mlp": 1.0000025, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8122934631426665, + "language_loss": 0.65103114, + "learning_rate": 3.920801283028054e-06, + "loss": 0.66879272, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.192234754562378 + }, + { + "auxiliary_loss_clip": 0.01137555, + "auxiliary_loss_mlp": 0.01051636, + "balance_loss_clip": 1.0489378, + "balance_loss_mlp": 1.03346884, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.5324737360000533, + "language_loss": 0.71779239, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73968434, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 2.644144058227539 + }, + { + "auxiliary_loss_clip": 0.0115027, + "auxiliary_loss_mlp": 0.01055431, + "balance_loss_clip": 1.05156326, + "balance_loss_mlp": 1.03657162, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 2.293946195808744, + "language_loss": 0.76360899, + "learning_rate": 3.920584111630755e-06, + "loss": 0.78566605, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 2.5682694911956787 + }, + { + "auxiliary_loss_clip": 0.01103356, + "auxiliary_loss_mlp": 0.01055679, + "balance_loss_clip": 1.04407644, + "balance_loss_mlp": 1.03665364, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 1.7739122358757347, + "language_loss": 0.76414412, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78573447, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 2.71321964263916 + }, + { + "auxiliary_loss_clip": 0.01099285, + "auxiliary_loss_mlp": 0.01057777, + "balance_loss_clip": 1.03985131, + "balance_loss_mlp": 1.03823888, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 2.0882228784113854, + "language_loss": 0.72807765, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74964827, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 2.6553664207458496 + }, + { + "auxiliary_loss_clip": 0.01123395, + "auxiliary_loss_mlp": 0.00751302, + "balance_loss_clip": 1.04475927, + "balance_loss_mlp": 0.99984956, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.2262576735034405, + "language_loss": 0.79733711, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81608409, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 4.238492965698242 + }, + { + "auxiliary_loss_clip": 0.01088335, + "auxiliary_loss_mlp": 0.01052017, + "balance_loss_clip": 1.04103041, + "balance_loss_mlp": 1.03252637, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 1.975832380204128, + "language_loss": 0.85996938, + "learning_rate": 3.920148894924246e-06, + "loss": 0.88137293, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.81650447845459 + }, + { + "auxiliary_loss_clip": 0.01139567, + "auxiliary_loss_mlp": 0.00751108, + "balance_loss_clip": 1.04350471, + "balance_loss_mlp": 0.99984646, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 2.2737864763856965, + "language_loss": 0.77815855, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79706532, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.627211093902588 + }, + { + "auxiliary_loss_clip": 0.01134703, + "auxiliary_loss_mlp": 0.01049577, + "balance_loss_clip": 1.04562831, + "balance_loss_mlp": 1.02943039, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 2.013460445565856, + "language_loss": 0.80482262, + "learning_rate": 3.91993084968105e-06, + "loss": 0.8266654, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 2.6900408267974854 + }, + { + "auxiliary_loss_clip": 0.01144917, + "auxiliary_loss_mlp": 0.01048446, + "balance_loss_clip": 1.04860771, + "balance_loss_mlp": 1.02950335, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 2.3276499135880697, + "language_loss": 0.77907145, + "learning_rate": 3.919821717851428e-06, + "loss": 0.80100507, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 4.178247451782227 + }, + { + "auxiliary_loss_clip": 0.01126061, + "auxiliary_loss_mlp": 0.01041681, + "balance_loss_clip": 1.04499793, + "balance_loss_mlp": 1.02117693, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 1.7817940208107275, + "language_loss": 0.76935339, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79103082, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 5.653441905975342 + }, + { + "auxiliary_loss_clip": 0.01132444, + "auxiliary_loss_mlp": 0.01045489, + "balance_loss_clip": 1.04520154, + "balance_loss_mlp": 1.02654696, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 1.7426742604644028, + "language_loss": 0.70042336, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72220266, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 2.6535983085632324 + }, + { + "auxiliary_loss_clip": 0.01136283, + "auxiliary_loss_mlp": 0.01047166, + "balance_loss_clip": 1.04636812, + "balance_loss_mlp": 1.02737761, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.4734248404539305, + "language_loss": 0.81186599, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83370048, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 2.598134994506836 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.00751127, + "balance_loss_clip": 1.0456413, + "balance_loss_mlp": 0.99984312, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 1.7627051847006052, + "language_loss": 0.92409796, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94298947, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 2.5481529235839844 + }, + { + "auxiliary_loss_clip": 0.01114632, + "auxiliary_loss_mlp": 0.01053242, + "balance_loss_clip": 1.04300117, + "balance_loss_mlp": 1.03322709, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.4683401004388434, + "language_loss": 0.87772834, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89940715, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 2.6781749725341797 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.00751215, + "balance_loss_clip": 1.04443538, + "balance_loss_mlp": 0.99988377, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 1.9264754025176636, + "language_loss": 0.84535551, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86420047, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 2.6313281059265137 + }, + { + "auxiliary_loss_clip": 0.01094579, + "auxiliary_loss_mlp": 0.01058746, + "balance_loss_clip": 1.043154, + "balance_loss_mlp": 1.03869462, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.9621277547338976, + "language_loss": 0.83099753, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85253078, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.715209484100342 + }, + { + "auxiliary_loss_clip": 0.01156249, + "auxiliary_loss_mlp": 0.01048358, + "balance_loss_clip": 1.04815531, + "balance_loss_mlp": 1.02841437, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.520826321167469, + "language_loss": 0.74683201, + "learning_rate": 3.918946042768707e-06, + "loss": 0.7688781, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 2.7630505561828613 + }, + { + "auxiliary_loss_clip": 0.01140624, + "auxiliary_loss_mlp": 0.0105119, + "balance_loss_clip": 1.05445194, + "balance_loss_mlp": 1.03164029, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 2.5185169857893075, + "language_loss": 0.72951376, + "learning_rate": 3.918836255889908e-06, + "loss": 0.75143188, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 2.636016368865967 + }, + { + "auxiliary_loss_clip": 0.01141798, + "auxiliary_loss_mlp": 0.01048792, + "balance_loss_clip": 1.04725671, + "balance_loss_mlp": 1.0294559, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.0259892292449875, + "language_loss": 0.88071167, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90261751, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 2.7347569465637207 + }, + { + "auxiliary_loss_clip": 0.01141607, + "auxiliary_loss_mlp": 0.01051824, + "balance_loss_clip": 1.05137217, + "balance_loss_mlp": 1.03066421, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.7132003778054075, + "language_loss": 0.6654861, + "learning_rate": 3.918616463849087e-06, + "loss": 0.68742037, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.6866397857666016 + }, + { + "auxiliary_loss_clip": 0.01101663, + "auxiliary_loss_mlp": 0.01053484, + "balance_loss_clip": 1.04242074, + "balance_loss_mlp": 1.03253877, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.6911278822878346, + "language_loss": 0.80702293, + "learning_rate": 3.918506458695399e-06, + "loss": 0.82857442, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.8569748401641846 + }, + { + "auxiliary_loss_clip": 0.01038744, + "auxiliary_loss_mlp": 0.01024246, + "balance_loss_clip": 1.01099467, + "balance_loss_mlp": 1.02102709, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8015298215913679, + "language_loss": 0.66144234, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68207222, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 3.128481149673462 + }, + { + "auxiliary_loss_clip": 0.01127705, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_clip": 1.04458475, + "balance_loss_mlp": 1.02924514, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 2.063163406943905, + "language_loss": 0.79761612, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81938291, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 2.6171088218688965 + }, + { + "auxiliary_loss_clip": 0.01109909, + "auxiliary_loss_mlp": 0.00751059, + "balance_loss_clip": 1.04478383, + "balance_loss_mlp": 0.99986732, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.0272631247652155, + "language_loss": 0.72576916, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74437886, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.6616265773773193 + }, + { + "auxiliary_loss_clip": 0.01113695, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.04619443, + "balance_loss_mlp": 1.02093697, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 1.6104334827883227, + "language_loss": 0.72115785, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74270433, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.874715805053711 + }, + { + "auxiliary_loss_clip": 0.01100177, + "auxiliary_loss_mlp": 0.01040149, + "balance_loss_clip": 1.04246008, + "balance_loss_mlp": 1.01990771, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.0930012027266236, + "language_loss": 0.78267121, + "learning_rate": 3.917955341761128e-06, + "loss": 0.80407453, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.643472671508789 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_clip": 1.04713893, + "balance_loss_mlp": 1.03350627, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.276763869962252, + "language_loss": 0.75179273, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77341282, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.7298355102539062 + }, + { + "auxiliary_loss_clip": 0.0114229, + "auxiliary_loss_mlp": 0.01044999, + "balance_loss_clip": 1.04753685, + "balance_loss_mlp": 1.02607989, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.5080306007291366, + "language_loss": 0.75112504, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77299798, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 2.728236436843872 + }, + { + "auxiliary_loss_clip": 0.01154842, + "auxiliary_loss_mlp": 0.01050754, + "balance_loss_clip": 1.04698968, + "balance_loss_mlp": 1.03170466, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.123897404573277, + "language_loss": 0.74001282, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76206875, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 2.5763771533966064 + }, + { + "auxiliary_loss_clip": 0.01123027, + "auxiliary_loss_mlp": 0.01051075, + "balance_loss_clip": 1.05186844, + "balance_loss_mlp": 1.03108358, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.723208327938475, + "language_loss": 0.73400152, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75574255, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.709249973297119 + }, + { + "auxiliary_loss_clip": 0.01116941, + "auxiliary_loss_mlp": 0.01047175, + "balance_loss_clip": 1.04942703, + "balance_loss_mlp": 1.02793503, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.5850215184176992, + "language_loss": 0.98453212, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00617337, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.9224627017974854 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01047192, + "balance_loss_clip": 1.04767275, + "balance_loss_mlp": 1.02664042, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.76418668752044, + "language_loss": 0.86242521, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88422918, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.6257121562957764 + }, + { + "auxiliary_loss_clip": 0.01138907, + "auxiliary_loss_mlp": 0.01056687, + "balance_loss_clip": 1.05093598, + "balance_loss_mlp": 1.03522956, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 2.032259415367888, + "language_loss": 0.8510834, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87303931, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.6432645320892334 + }, + { + "auxiliary_loss_clip": 0.01116763, + "auxiliary_loss_mlp": 0.01046074, + "balance_loss_clip": 1.04340589, + "balance_loss_mlp": 1.02573705, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 2.0129333365100854, + "language_loss": 0.85520267, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87683105, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 2.6445059776306152 + }, + { + "auxiliary_loss_clip": 0.01104366, + "auxiliary_loss_mlp": 0.01051474, + "balance_loss_clip": 1.04649282, + "balance_loss_mlp": 1.03089809, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.257635278415041, + "language_loss": 0.76924336, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79080176, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.6447181701660156 + }, + { + "auxiliary_loss_clip": 0.01139669, + "auxiliary_loss_mlp": 0.01050219, + "balance_loss_clip": 1.04828811, + "balance_loss_mlp": 1.03057396, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 2.0098950319776874, + "language_loss": 0.83489019, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85678911, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 2.5703186988830566 + }, + { + "auxiliary_loss_clip": 0.01128131, + "auxiliary_loss_mlp": 0.01051872, + "balance_loss_clip": 1.0455395, + "balance_loss_mlp": 1.03184462, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.942069935027087, + "language_loss": 0.74466479, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76646477, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 2.5818891525268555 + }, + { + "auxiliary_loss_clip": 0.0112166, + "auxiliary_loss_mlp": 0.01051028, + "balance_loss_clip": 1.04526806, + "balance_loss_mlp": 1.03193069, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.0908455790894744, + "language_loss": 0.7189039, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74063081, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 2.651585578918457 + }, + { + "auxiliary_loss_clip": 0.01130821, + "auxiliary_loss_mlp": 0.01052627, + "balance_loss_clip": 1.04526281, + "balance_loss_mlp": 1.03128862, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.3371495296399196, + "language_loss": 0.71925366, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74108815, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 2.6643245220184326 + }, + { + "auxiliary_loss_clip": 0.01141984, + "auxiliary_loss_mlp": 0.01062939, + "balance_loss_clip": 1.04652381, + "balance_loss_mlp": 1.04129088, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 1.9152616089599284, + "language_loss": 0.81197774, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83402693, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.6223361492156982 + }, + { + "auxiliary_loss_clip": 0.01106545, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_clip": 1.04041994, + "balance_loss_mlp": 1.02935219, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 3.0287242868295836, + "language_loss": 0.75552982, + "learning_rate": 3.916291083698784e-06, + "loss": 0.77711582, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 2.8532073497772217 + }, + { + "auxiliary_loss_clip": 0.01022721, + "auxiliary_loss_mlp": 0.0100531, + "balance_loss_clip": 1.00942874, + "balance_loss_mlp": 1.0022105, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8613101540286864, + "language_loss": 0.55244422, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57272452, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 3.1620495319366455 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.010481, + "balance_loss_clip": 1.04604864, + "balance_loss_mlp": 1.02833486, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 1.8548508030187183, + "language_loss": 0.78181899, + "learning_rate": 3.916067946991971e-06, + "loss": 0.80343831, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.6663451194763184 + }, + { + "auxiliary_loss_clip": 0.01154522, + "auxiliary_loss_mlp": 0.01047406, + "balance_loss_clip": 1.0465163, + "balance_loss_mlp": 1.02699709, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.9064624090900566, + "language_loss": 0.78876281, + "learning_rate": 3.915956269650216e-06, + "loss": 0.81078207, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.567586660385132 + }, + { + "auxiliary_loss_clip": 0.01099464, + "auxiliary_loss_mlp": 0.01051882, + "balance_loss_clip": 1.03919911, + "balance_loss_mlp": 1.03222489, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 2.8432527460977046, + "language_loss": 0.82496631, + "learning_rate": 3.915844519655208e-06, + "loss": 0.84647983, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 4.244845628738403 + }, + { + "auxiliary_loss_clip": 0.01125781, + "auxiliary_loss_mlp": 0.01051601, + "balance_loss_clip": 1.0438652, + "balance_loss_mlp": 1.03312325, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.3053355257573935, + "language_loss": 0.88753307, + "learning_rate": 3.915732697011183e-06, + "loss": 0.90930688, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 2.5629022121429443 + }, + { + "auxiliary_loss_clip": 0.01128542, + "auxiliary_loss_mlp": 0.01056174, + "balance_loss_clip": 1.04788828, + "balance_loss_mlp": 1.03658843, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.0884012315467424, + "language_loss": 0.74186081, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.763708, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.64078426361084 + }, + { + "auxiliary_loss_clip": 0.01120829, + "auxiliary_loss_mlp": 0.010509, + "balance_loss_clip": 1.04742408, + "balance_loss_mlp": 1.03019345, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 2.1252013045225473, + "language_loss": 0.87935352, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90107083, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 2.6508305072784424 + }, + { + "auxiliary_loss_clip": 0.01143674, + "auxiliary_loss_mlp": 0.00751338, + "balance_loss_clip": 1.04738688, + "balance_loss_mlp": 0.99986303, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 1.7471705502051282, + "language_loss": 0.78748685, + "learning_rate": 3.915396793227428e-06, + "loss": 0.80643696, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 4.088564395904541 + }, + { + "auxiliary_loss_clip": 0.01142875, + "auxiliary_loss_mlp": 0.00751241, + "balance_loss_clip": 1.04826975, + "balance_loss_mlp": 0.99988496, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 1.608791758547119, + "language_loss": 0.73591638, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75485754, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 4.262790203094482 + }, + { + "auxiliary_loss_clip": 0.01157152, + "auxiliary_loss_mlp": 0.01065893, + "balance_loss_clip": 1.04996562, + "balance_loss_mlp": 1.04571092, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.3689621571693285, + "language_loss": 0.75289297, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77512336, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 2.6540000438690186 + }, + { + "auxiliary_loss_clip": 0.01127259, + "auxiliary_loss_mlp": 0.0105368, + "balance_loss_clip": 1.04575586, + "balance_loss_mlp": 1.03362954, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.6144220404928353, + "language_loss": 0.84811711, + "learning_rate": 3.915060235755344e-06, + "loss": 0.86992657, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 2.6172635555267334 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01055254, + "balance_loss_clip": 1.04630935, + "balance_loss_mlp": 1.0363239, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.284057739027511, + "language_loss": 0.74058282, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76243067, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 2.588137149810791 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01056152, + "balance_loss_clip": 1.04776692, + "balance_loss_mlp": 1.03452802, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 1.9332337881857744, + "language_loss": 0.77979851, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80144066, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.6155600547790527 + }, + { + "auxiliary_loss_clip": 0.01139479, + "auxiliary_loss_mlp": 0.01052978, + "balance_loss_clip": 1.04535389, + "balance_loss_mlp": 1.03212857, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.7079017041198366, + "language_loss": 0.72103465, + "learning_rate": 3.914723024709793e-06, + "loss": 0.7429592, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.584787368774414 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01059723, + "balance_loss_clip": 1.0492959, + "balance_loss_mlp": 1.03826487, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 1.6211113363273522, + "language_loss": 0.78419679, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80616295, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 2.6767361164093018 + }, + { + "auxiliary_loss_clip": 0.01047861, + "auxiliary_loss_mlp": 0.0074923, + "balance_loss_clip": 1.01183629, + "balance_loss_mlp": 1.00022399, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9295786867880529, + "language_loss": 0.58081293, + "learning_rate": 3.914497854306543e-06, + "loss": 0.59878385, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 2.9336462020874023 + }, + { + "auxiliary_loss_clip": 0.01128545, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.04682016, + "balance_loss_mlp": 1.03054416, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.9114398499122796, + "language_loss": 0.76806653, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78984559, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 2.6293795108795166 + }, + { + "auxiliary_loss_clip": 0.01120378, + "auxiliary_loss_mlp": 0.01056659, + "balance_loss_clip": 1.04552817, + "balance_loss_mlp": 1.03557098, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.37566186722127, + "language_loss": 0.83886611, + "learning_rate": 3.914272393511494e-06, + "loss": 0.86063647, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.6645102500915527 + }, + { + "auxiliary_loss_clip": 0.01152368, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_clip": 1.04701734, + "balance_loss_mlp": 1.0293349, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 1.9736831797011922, + "language_loss": 0.84022093, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86223388, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.569570779800415 + }, + { + "auxiliary_loss_clip": 0.01156528, + "auxiliary_loss_mlp": 0.01051478, + "balance_loss_clip": 1.05233598, + "balance_loss_mlp": 1.0295316, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.760689665676297, + "language_loss": 0.84458721, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86666727, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 2.5502095222473145 + }, + { + "auxiliary_loss_clip": 0.01116244, + "auxiliary_loss_mlp": 0.00751453, + "balance_loss_clip": 1.04437733, + "balance_loss_mlp": 0.99988604, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.6358727364795314, + "language_loss": 0.84218043, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.86085743, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.6594724655151367 + }, + { + "auxiliary_loss_clip": 0.0112304, + "auxiliary_loss_mlp": 0.01055451, + "balance_loss_clip": 1.04527867, + "balance_loss_mlp": 1.03457761, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 1.8618804858274398, + "language_loss": 0.95960128, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98138618, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.737799644470215 + }, + { + "auxiliary_loss_clip": 0.01122051, + "auxiliary_loss_mlp": 0.01044148, + "balance_loss_clip": 1.04621196, + "balance_loss_mlp": 1.02424049, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.382599616954748, + "language_loss": 0.80195713, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82361913, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.7218055725097656 + }, + { + "auxiliary_loss_clip": 0.01095515, + "auxiliary_loss_mlp": 0.01045713, + "balance_loss_clip": 1.04141617, + "balance_loss_mlp": 1.02442193, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 2.468297352357305, + "language_loss": 0.7716074, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79301965, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.658869504928589 + }, + { + "auxiliary_loss_clip": 0.01140132, + "auxiliary_loss_mlp": 0.01045221, + "balance_loss_clip": 1.04690576, + "balance_loss_mlp": 1.02438307, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.00502263100588, + "language_loss": 0.87010086, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89195436, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.584585428237915 + }, + { + "auxiliary_loss_clip": 0.01145208, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_clip": 1.04333413, + "balance_loss_mlp": 1.02277064, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.547401094617841, + "language_loss": 0.69204247, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71392536, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.4856936931610107 + }, + { + "auxiliary_loss_clip": 0.01129238, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.0464412, + "balance_loss_mlp": 1.02203226, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 2.5521406320901, + "language_loss": 0.80170435, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82343411, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 2.594747543334961 + }, + { + "auxiliary_loss_clip": 0.01128775, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.0437274, + "balance_loss_mlp": 1.02447295, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.164545823301385, + "language_loss": 0.69025135, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71199757, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.5814812183380127 + }, + { + "auxiliary_loss_clip": 0.011171, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.04894412, + "balance_loss_mlp": 1.02611423, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.6746688227245563, + "language_loss": 0.72525775, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74688923, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 2.695380687713623 + }, + { + "auxiliary_loss_clip": 0.01088759, + "auxiliary_loss_mlp": 0.01057758, + "balance_loss_clip": 1.04480743, + "balance_loss_mlp": 1.03619266, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8286798266424762, + "language_loss": 0.92348284, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94494802, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 2.8073554039001465 + }, + { + "auxiliary_loss_clip": 0.01113178, + "auxiliary_loss_mlp": 0.01053445, + "balance_loss_clip": 1.04284644, + "balance_loss_mlp": 1.03302443, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 2.024233401072499, + "language_loss": 0.77681434, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79848051, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 3.00435471534729 + }, + { + "auxiliary_loss_clip": 0.0114931, + "auxiliary_loss_mlp": 0.01041451, + "balance_loss_clip": 1.04727697, + "balance_loss_mlp": 1.02186477, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 1.869815628645263, + "language_loss": 0.80550379, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82741141, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.562781810760498 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.04368734, + "balance_loss_mlp": 1.03399372, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.8675700774079615, + "language_loss": 0.8508029, + "learning_rate": 3.912572184769108e-06, + "loss": 0.8725391, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 2.6099066734313965 + }, + { + "auxiliary_loss_clip": 0.01118714, + "auxiliary_loss_mlp": 0.01046972, + "balance_loss_clip": 1.04449058, + "balance_loss_mlp": 1.02656341, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2913040258473543, + "language_loss": 0.85537463, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87703151, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 2.6752209663391113 + }, + { + "auxiliary_loss_clip": 0.01145698, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.04131258, + "balance_loss_mlp": 1.03181219, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 1.9891440313938715, + "language_loss": 0.72446346, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74644113, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 2.68156099319458 + }, + { + "auxiliary_loss_clip": 0.01126493, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_clip": 1.04369581, + "balance_loss_mlp": 1.02452064, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 1.4844890726275426, + "language_loss": 0.76182032, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78353047, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 2.7406654357910156 + }, + { + "auxiliary_loss_clip": 0.01118274, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.04032683, + "balance_loss_mlp": 1.02492464, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.2302518084336356, + "language_loss": 0.89116341, + "learning_rate": 3.912116039223659e-06, + "loss": 0.91279399, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 2.5785915851593018 + }, + { + "auxiliary_loss_clip": 0.01119224, + "auxiliary_loss_mlp": 0.01051025, + "balance_loss_clip": 1.04173207, + "balance_loss_mlp": 1.03303623, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 1.610728029295055, + "language_loss": 0.75732946, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77903199, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 2.6738784313201904 + }, + { + "auxiliary_loss_clip": 0.01098729, + "auxiliary_loss_mlp": 0.01052457, + "balance_loss_clip": 1.0405407, + "balance_loss_mlp": 1.03166699, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.0346619304999156, + "language_loss": 0.77166808, + "learning_rate": 3.911887531387839e-06, + "loss": 0.79317993, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 2.6750004291534424 + }, + { + "auxiliary_loss_clip": 0.01135454, + "auxiliary_loss_mlp": 0.01050553, + "balance_loss_clip": 1.0429368, + "balance_loss_mlp": 1.03114522, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 1.960729106074255, + "language_loss": 0.79489374, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81675386, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.5848469734191895 + }, + { + "auxiliary_loss_clip": 0.01145387, + "auxiliary_loss_mlp": 0.01044099, + "balance_loss_clip": 1.04480302, + "balance_loss_mlp": 1.02385759, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 1.9465906808278588, + "language_loss": 0.75152296, + "learning_rate": 3.911658733556155e-06, + "loss": 0.77341783, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.5708227157592773 + }, + { + "auxiliary_loss_clip": 0.01150757, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_clip": 1.04846358, + "balance_loss_mlp": 1.02625775, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.7399340235136016, + "language_loss": 0.75509852, + "learning_rate": 3.911544225902707e-06, + "loss": 0.7770533, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.576974630355835 + }, + { + "auxiliary_loss_clip": 0.01129748, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.04118705, + "balance_loss_mlp": 1.02032554, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.7443549324901872, + "language_loss": 0.89050806, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91219491, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.5766761302948 + }, + { + "auxiliary_loss_clip": 0.01128146, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_clip": 1.04602575, + "balance_loss_mlp": 1.02824068, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.1539954040048785, + "language_loss": 0.65266621, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67441916, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 4.163141965866089 + }, + { + "auxiliary_loss_clip": 0.0112779, + "auxiliary_loss_mlp": 0.01047911, + "balance_loss_clip": 1.04505944, + "balance_loss_mlp": 1.02707303, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 2.31015183774416, + "language_loss": 0.76100278, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78275979, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.6228275299072266 + }, + { + "auxiliary_loss_clip": 0.01153456, + "auxiliary_loss_mlp": 0.01048141, + "balance_loss_clip": 1.04709041, + "balance_loss_mlp": 1.02813792, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 2.2012794008461496, + "language_loss": 0.71629548, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73831141, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.5326180458068848 + }, + { + "auxiliary_loss_clip": 0.01123701, + "auxiliary_loss_mlp": 0.01048984, + "balance_loss_clip": 1.04848886, + "balance_loss_mlp": 1.02850378, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.7624837733448235, + "language_loss": 0.83140218, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85312903, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 2.753838062286377 + }, + { + "auxiliary_loss_clip": 0.01128615, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_clip": 1.0453862, + "balance_loss_mlp": 1.03082025, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.401626129715418, + "language_loss": 0.80046117, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82226104, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 4.1118247509002686 + }, + { + "auxiliary_loss_clip": 0.0103493, + "auxiliary_loss_mlp": 0.00749199, + "balance_loss_clip": 1.00903499, + "balance_loss_mlp": 1.00024414, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8191272400403072, + "language_loss": 0.58665442, + "learning_rate": 3.910740642965518e-06, + "loss": 0.6044957, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 6.27490496635437 + }, + { + "auxiliary_loss_clip": 0.01099157, + "auxiliary_loss_mlp": 0.0105592, + "balance_loss_clip": 1.04156852, + "balance_loss_mlp": 1.03292465, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.02631209401562, + "language_loss": 0.8069219, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82847273, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 2.712829351425171 + }, + { + "auxiliary_loss_clip": 0.0112679, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.04413795, + "balance_loss_mlp": 1.0250442, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.7938004075975618, + "language_loss": 0.83220935, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85392511, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 2.6291651725769043 + }, + { + "auxiliary_loss_clip": 0.0110578, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_clip": 1.03925848, + "balance_loss_mlp": 1.0269115, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.8535678424160758, + "language_loss": 0.67592239, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69746578, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 2.6122281551361084 + }, + { + "auxiliary_loss_clip": 0.01109276, + "auxiliary_loss_mlp": 0.01044692, + "balance_loss_clip": 1.04022646, + "balance_loss_mlp": 1.02557063, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.6884690247321328, + "language_loss": 0.81708181, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83862144, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 2.6455206871032715 + }, + { + "auxiliary_loss_clip": 0.01122389, + "auxiliary_loss_mlp": 0.01039867, + "balance_loss_clip": 1.04024696, + "balance_loss_mlp": 1.019279, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 1.972224701396262, + "language_loss": 0.80149329, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82311583, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.5805463790893555 + }, + { + "auxiliary_loss_clip": 0.0109668, + "auxiliary_loss_mlp": 0.01048024, + "balance_loss_clip": 1.04426062, + "balance_loss_mlp": 1.02828264, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 2.0191743665581305, + "language_loss": 0.78343558, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80488265, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.720201253890991 + }, + { + "auxiliary_loss_clip": 0.01141361, + "auxiliary_loss_mlp": 0.01056631, + "balance_loss_clip": 1.04765606, + "balance_loss_mlp": 1.03561437, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 1.9423365937523527, + "language_loss": 0.67296267, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69494265, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.5751960277557373 + }, + { + "auxiliary_loss_clip": 0.01150136, + "auxiliary_loss_mlp": 0.01050428, + "balance_loss_clip": 1.04722714, + "balance_loss_mlp": 1.03013825, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 3.214233431602667, + "language_loss": 0.7248311, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74683672, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 2.523550271987915 + }, + { + "auxiliary_loss_clip": 0.01133546, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.04281175, + "balance_loss_mlp": 1.03467691, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6982539722114174, + "language_loss": 0.76593697, + "learning_rate": 3.909702248319597e-06, + "loss": 0.78781879, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 2.6157140731811523 + }, + { + "auxiliary_loss_clip": 0.01120761, + "auxiliary_loss_mlp": 0.01043409, + "balance_loss_clip": 1.04246044, + "balance_loss_mlp": 1.02564704, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 1.8660429742203728, + "language_loss": 0.85367841, + "learning_rate": 3.909586508997797e-06, + "loss": 0.8753202, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.644256353378296 + }, + { + "auxiliary_loss_clip": 0.0109815, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.04173887, + "balance_loss_mlp": 1.02320385, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 1.9579164934859892, + "language_loss": 0.75377816, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77519387, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.65946102142334 + }, + { + "auxiliary_loss_clip": 0.01114811, + "auxiliary_loss_mlp": 0.01051252, + "balance_loss_clip": 1.04205728, + "balance_loss_mlp": 1.03041458, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 2.264128739655149, + "language_loss": 0.80525237, + "learning_rate": 3.909354813123452e-06, + "loss": 0.826913, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.643892526626587 + }, + { + "auxiliary_loss_clip": 0.01149198, + "auxiliary_loss_mlp": 0.00751183, + "balance_loss_clip": 1.04766977, + "balance_loss_mlp": 0.99993986, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7275994507611423, + "language_loss": 0.80247724, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82148111, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.600283145904541 + }, + { + "auxiliary_loss_clip": 0.01139523, + "auxiliary_loss_mlp": 0.010517, + "balance_loss_clip": 1.0437851, + "balance_loss_mlp": 1.03069592, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.9273733201026833, + "language_loss": 0.74174953, + "learning_rate": 3.909122827637406e-06, + "loss": 0.7636618, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.573223114013672 + }, + { + "auxiliary_loss_clip": 0.01147503, + "auxiliary_loss_mlp": 0.00751187, + "balance_loss_clip": 1.04149795, + "balance_loss_mlp": 0.99992645, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.5572148907857486, + "language_loss": 0.74236298, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76134992, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 2.78781795501709 + }, + { + "auxiliary_loss_clip": 0.01123093, + "auxiliary_loss_mlp": 0.01040099, + "balance_loss_clip": 1.04182446, + "balance_loss_mlp": 1.02224159, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 1.8405577007645868, + "language_loss": 0.85128909, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87292099, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.675116539001465 + }, + { + "auxiliary_loss_clip": 0.01112716, + "auxiliary_loss_mlp": 0.0105335, + "balance_loss_clip": 1.04756904, + "balance_loss_mlp": 1.03427672, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 2.3159317395909094, + "language_loss": 0.77649003, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79815066, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.7715301513671875 + }, + { + "auxiliary_loss_clip": 0.01133938, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.04100466, + "balance_loss_mlp": 1.03075016, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.973914825855998, + "language_loss": 0.8327592, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85460174, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 2.602771282196045 + }, + { + "auxiliary_loss_clip": 0.0111988, + "auxiliary_loss_mlp": 0.01052487, + "balance_loss_clip": 1.04076457, + "balance_loss_mlp": 1.03191125, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.6919813324560697, + "language_loss": 0.78105319, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80277681, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 2.667665481567383 + }, + { + "auxiliary_loss_clip": 0.01113927, + "auxiliary_loss_mlp": 0.01054193, + "balance_loss_clip": 1.04111826, + "balance_loss_mlp": 1.03321254, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 1.9000878067835614, + "language_loss": 0.83423054, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85591185, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.6052322387695312 + }, + { + "auxiliary_loss_clip": 0.0111724, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_clip": 1.04594076, + "balance_loss_mlp": 1.04011035, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.2719198810110317, + "language_loss": 0.81055474, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83235699, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.6396069526672363 + }, + { + "auxiliary_loss_clip": 0.01124701, + "auxiliary_loss_mlp": 0.01050285, + "balance_loss_clip": 1.04145217, + "balance_loss_mlp": 1.02984118, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 1.98245917577508, + "language_loss": 0.85930824, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.8810581, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 2.581974744796753 + }, + { + "auxiliary_loss_clip": 0.0113205, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.04332662, + "balance_loss_mlp": 1.02523351, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 1.7732031474660406, + "language_loss": 0.85295498, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87471569, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 2.544799566268921 + }, + { + "auxiliary_loss_clip": 0.01099811, + "auxiliary_loss_mlp": 0.01054867, + "balance_loss_clip": 1.0410552, + "balance_loss_mlp": 1.03416061, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 1.74460236217383, + "language_loss": 0.78887415, + "learning_rate": 3.907958557264774e-06, + "loss": 0.81042099, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.6074867248535156 + }, + { + "auxiliary_loss_clip": 0.01102474, + "auxiliary_loss_mlp": 0.01053558, + "balance_loss_clip": 1.04319978, + "balance_loss_mlp": 1.03221977, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.0015589620284557, + "language_loss": 0.79088354, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81244391, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 2.655754804611206 + }, + { + "auxiliary_loss_clip": 0.01124393, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.04432297, + "balance_loss_mlp": 1.03061008, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.1877013108078507, + "language_loss": 0.92377174, + "learning_rate": 3.907724834849002e-06, + "loss": 0.94551313, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 2.6920969486236572 + }, + { + "auxiliary_loss_clip": 0.01124278, + "auxiliary_loss_mlp": 0.01042853, + "balance_loss_clip": 1.03913283, + "balance_loss_mlp": 1.02230096, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.9228400754202346, + "language_loss": 0.8087368, + "learning_rate": 3.907607865127225e-06, + "loss": 0.8304081, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 2.6692686080932617 + }, + { + "auxiliary_loss_clip": 0.01006643, + "auxiliary_loss_mlp": 0.01024461, + "balance_loss_clip": 1.00968981, + "balance_loss_mlp": 1.02199388, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 1.3712387734486118, + "language_loss": 0.63313687, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65344787, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 3.188079833984375 + }, + { + "auxiliary_loss_clip": 0.01091082, + "auxiliary_loss_mlp": 0.01054557, + "balance_loss_clip": 1.03773999, + "balance_loss_mlp": 1.03318262, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.8364203231234293, + "language_loss": 0.93206179, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95351815, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 2.748568058013916 + }, + { + "auxiliary_loss_clip": 0.0114172, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.04709244, + "balance_loss_mlp": 1.02613413, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.9148427543681998, + "language_loss": 0.8126654, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83452249, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.594198703765869 + }, + { + "auxiliary_loss_clip": 0.01084743, + "auxiliary_loss_mlp": 0.01054849, + "balance_loss_clip": 1.0376482, + "balance_loss_mlp": 1.03371286, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.5262374505195015, + "language_loss": 0.77368438, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79508036, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.7610857486724854 + }, + { + "auxiliary_loss_clip": 0.01131461, + "auxiliary_loss_mlp": 0.01050621, + "balance_loss_clip": 1.04434645, + "balance_loss_mlp": 1.03023648, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.1339516569907957, + "language_loss": 0.81142819, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83324903, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 2.580355405807495 + }, + { + "auxiliary_loss_clip": 0.011321, + "auxiliary_loss_mlp": 0.01048889, + "balance_loss_clip": 1.04443336, + "balance_loss_mlp": 1.02837348, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.6709044410091007, + "language_loss": 0.78239965, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80420965, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 2.67549991607666 + }, + { + "auxiliary_loss_clip": 0.0112993, + "auxiliary_loss_mlp": 0.01042888, + "balance_loss_clip": 1.05190778, + "balance_loss_mlp": 1.02350414, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 1.8458422103629872, + "language_loss": 0.74956501, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77129316, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.6462628841400146 + }, + { + "auxiliary_loss_clip": 0.01069989, + "auxiliary_loss_mlp": 0.01049632, + "balance_loss_clip": 1.0366708, + "balance_loss_mlp": 1.02856791, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 2.111201928779827, + "language_loss": 0.90599227, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92718852, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 4.157872438430786 + }, + { + "auxiliary_loss_clip": 0.01081247, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.04039526, + "balance_loss_mlp": 1.02686548, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.2972316483723554, + "language_loss": 0.83426565, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85555822, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 2.7592947483062744 + }, + { + "auxiliary_loss_clip": 0.01082157, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.03751731, + "balance_loss_mlp": 1.03458393, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 1.6966267903759886, + "language_loss": 0.73503315, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.75640535, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 2.6903140544891357 + }, + { + "auxiliary_loss_clip": 0.01075293, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.03887856, + "balance_loss_mlp": 1.01835132, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.7693688273305284, + "language_loss": 0.76202935, + "learning_rate": 3.906316424944469e-06, + "loss": 0.78314471, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 2.729417085647583 + }, + { + "auxiliary_loss_clip": 0.01133169, + "auxiliary_loss_mlp": 0.01044913, + "balance_loss_clip": 1.04229832, + "balance_loss_mlp": 1.02499318, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 2.120493128931353, + "language_loss": 0.82706326, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84884405, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 5.503296375274658 + }, + { + "auxiliary_loss_clip": 0.01118301, + "auxiliary_loss_mlp": 0.01042927, + "balance_loss_clip": 1.0418129, + "balance_loss_mlp": 1.02282858, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 1.6678766567889172, + "language_loss": 0.75378656, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77539885, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 4.174846410751343 + }, + { + "auxiliary_loss_clip": 0.01147255, + "auxiliary_loss_mlp": 0.01054775, + "balance_loss_clip": 1.05031145, + "balance_loss_mlp": 1.03492653, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.9636680687836563, + "language_loss": 0.83765554, + "learning_rate": 3.905962695693935e-06, + "loss": 0.85967582, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 2.6441659927368164 + }, + { + "auxiliary_loss_clip": 0.01132651, + "auxiliary_loss_mlp": 0.01050882, + "balance_loss_clip": 1.04313743, + "balance_loss_mlp": 1.03118885, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.3004628835298706, + "language_loss": 0.84622395, + "learning_rate": 3.9058446413892e-06, + "loss": 0.86805928, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.596553325653076 + }, + { + "auxiliary_loss_clip": 0.01134998, + "auxiliary_loss_mlp": 0.0104274, + "balance_loss_clip": 1.04448605, + "balance_loss_mlp": 1.0239768, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.6408102042675479, + "language_loss": 0.77063262, + "learning_rate": 3.905726514814646e-06, + "loss": 0.79240996, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 2.773258924484253 + }, + { + "auxiliary_loss_clip": 0.01137025, + "auxiliary_loss_mlp": 0.01049631, + "balance_loss_clip": 1.05280447, + "balance_loss_mlp": 1.02835178, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.596728472412601, + "language_loss": 0.79294151, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81480807, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.6226253509521484 + }, + { + "auxiliary_loss_clip": 0.01125389, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_clip": 1.04354191, + "balance_loss_mlp": 1.02448499, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 3.766538647635224, + "language_loss": 0.89895284, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92066246, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.616851329803467 + }, + { + "auxiliary_loss_clip": 0.01107475, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_clip": 1.04296803, + "balance_loss_mlp": 1.02732134, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.7339438870378487, + "language_loss": 0.80068105, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82222641, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.726633310317993 + }, + { + "auxiliary_loss_clip": 0.01145527, + "auxiliary_loss_mlp": 0.01046342, + "balance_loss_clip": 1.04515779, + "balance_loss_mlp": 1.02710176, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.7837580803131603, + "language_loss": 0.88161963, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90353835, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 2.5850603580474854 + }, + { + "auxiliary_loss_clip": 0.01106291, + "auxiliary_loss_mlp": 0.01043002, + "balance_loss_clip": 1.04019463, + "balance_loss_mlp": 1.02466762, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 6.782012760601069, + "language_loss": 0.86928773, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89078063, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.5894320011138916 + }, + { + "auxiliary_loss_clip": 0.01120878, + "auxiliary_loss_mlp": 0.01051146, + "balance_loss_clip": 1.04220402, + "balance_loss_mlp": 1.03064156, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 2.023349021969892, + "language_loss": 0.73592019, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75764048, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.6256332397460938 + }, + { + "auxiliary_loss_clip": 0.01035674, + "auxiliary_loss_mlp": 0.01000247, + "balance_loss_clip": 1.01105618, + "balance_loss_mlp": 0.99783862, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.7634712098580674, + "language_loss": 0.61750042, + "learning_rate": 3.904897605614418e-06, + "loss": 0.6378597, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.1397688388824463 + }, + { + "auxiliary_loss_clip": 0.011249, + "auxiliary_loss_mlp": 0.0104913, + "balance_loss_clip": 1.04557776, + "balance_loss_mlp": 1.02922165, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 1.9093643785566692, + "language_loss": 0.78077954, + "learning_rate": 3.904778901042793e-06, + "loss": 0.8025198, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.6002538204193115 + }, + { + "auxiliary_loss_clip": 0.01025224, + "auxiliary_loss_mlp": 0.01002003, + "balance_loss_clip": 1.01411021, + "balance_loss_mlp": 0.99958289, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.7430547525811224, + "language_loss": 0.59380722, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61407948, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.084470510482788 + }, + { + "auxiliary_loss_clip": 0.01134367, + "auxiliary_loss_mlp": 0.01047013, + "balance_loss_clip": 1.04579854, + "balance_loss_mlp": 1.02889347, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 2.5402485575293356, + "language_loss": 0.63318729, + "learning_rate": 3.904541275215825e-06, + "loss": 0.65500104, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.690868616104126 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01058074, + "balance_loss_clip": 1.04525328, + "balance_loss_mlp": 1.03717637, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 1.9440911487987085, + "language_loss": 0.80515116, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82699335, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.5775675773620605 + }, + { + "auxiliary_loss_clip": 0.01119445, + "auxiliary_loss_mlp": 0.01054226, + "balance_loss_clip": 1.04206049, + "balance_loss_mlp": 1.03523588, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.722500713241023, + "language_loss": 0.7593286, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78106534, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.5696728229522705 + }, + { + "auxiliary_loss_clip": 0.01092047, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.03935218, + "balance_loss_mlp": 1.03224587, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.5498436553543198, + "language_loss": 0.76745808, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.78889549, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 2.8954265117645264 + }, + { + "auxiliary_loss_clip": 0.01121354, + "auxiliary_loss_mlp": 0.01055667, + "balance_loss_clip": 1.04000473, + "balance_loss_mlp": 1.03659332, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.3360921978810976, + "language_loss": 0.8298474, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85161757, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 2.5663328170776367 + }, + { + "auxiliary_loss_clip": 0.01135137, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_clip": 1.04341233, + "balance_loss_mlp": 1.0284667, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.8761819659835781, + "language_loss": 0.75450176, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77632844, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.612351179122925 + }, + { + "auxiliary_loss_clip": 0.01129337, + "auxiliary_loss_mlp": 0.01054316, + "balance_loss_clip": 1.04371929, + "balance_loss_mlp": 1.03712654, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 1.8844045192568282, + "language_loss": 0.87551105, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89734763, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 2.592036485671997 + }, + { + "auxiliary_loss_clip": 0.01092155, + "auxiliary_loss_mlp": 0.01055535, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.03362453, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 2.4757244719927747, + "language_loss": 0.69529402, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71677089, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.70294451713562 + }, + { + "auxiliary_loss_clip": 0.01112, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.03701293, + "balance_loss_mlp": 1.03318882, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 1.955126833980333, + "language_loss": 0.81345642, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83511877, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 2.6407763957977295 + }, + { + "auxiliary_loss_clip": 0.01133437, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.04785264, + "balance_loss_mlp": 1.02539372, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 1.842380809227549, + "language_loss": 0.80343962, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82522631, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.6403613090515137 + }, + { + "auxiliary_loss_clip": 0.01041223, + "auxiliary_loss_mlp": 0.01008426, + "balance_loss_clip": 1.00634098, + "balance_loss_mlp": 1.00588691, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7048883997878921, + "language_loss": 0.57082754, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59132403, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 3.1314454078674316 + }, + { + "auxiliary_loss_clip": 0.01113317, + "auxiliary_loss_mlp": 0.01047977, + "balance_loss_clip": 1.04167259, + "balance_loss_mlp": 1.02867675, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 2.2719441195510486, + "language_loss": 0.93474638, + "learning_rate": 3.903229170377845e-06, + "loss": 0.95635927, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 2.63122296333313 + }, + { + "auxiliary_loss_clip": 0.01116323, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.03768063, + "balance_loss_mlp": 1.01486063, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.6803059346984661, + "language_loss": 0.77879667, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80028474, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 2.6472623348236084 + }, + { + "auxiliary_loss_clip": 0.01104406, + "auxiliary_loss_mlp": 0.01049159, + "balance_loss_clip": 1.04044437, + "balance_loss_mlp": 1.03173053, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 2.114393636156392, + "language_loss": 0.81374878, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83528441, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 2.688096046447754 + }, + { + "auxiliary_loss_clip": 0.0113837, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.04467952, + "balance_loss_mlp": 1.02683151, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.28166356680469, + "language_loss": 0.82847226, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85033274, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.5935933589935303 + }, + { + "auxiliary_loss_clip": 0.01098044, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.04403877, + "balance_loss_mlp": 1.02463531, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 1.7430634969133532, + "language_loss": 0.73748684, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75890833, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.763235569000244 + }, + { + "auxiliary_loss_clip": 0.01140178, + "auxiliary_loss_mlp": 0.01040954, + "balance_loss_clip": 1.04246271, + "balance_loss_mlp": 1.02278626, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 3.5506936094340418, + "language_loss": 0.79453307, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81634438, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.5310816764831543 + }, + { + "auxiliary_loss_clip": 0.01143076, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.04161429, + "balance_loss_mlp": 1.02173328, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 1.8134465902154824, + "language_loss": 0.75697851, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77881849, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.518357753753662 + }, + { + "auxiliary_loss_clip": 0.01080786, + "auxiliary_loss_mlp": 0.01047006, + "balance_loss_clip": 1.03660905, + "balance_loss_mlp": 1.02836132, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 1.9364953342170124, + "language_loss": 0.83095694, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85223484, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.665548086166382 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.00751179, + "balance_loss_clip": 1.04056275, + "balance_loss_mlp": 0.99993956, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.607085574940246, + "language_loss": 0.78631592, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80498803, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.693495988845825 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.01054052, + "balance_loss_clip": 1.04106331, + "balance_loss_mlp": 1.03328586, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.2557617791512143, + "language_loss": 0.76650161, + "learning_rate": 3.902149134427982e-06, + "loss": 0.78807688, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.766903877258301 + }, + { + "auxiliary_loss_clip": 0.01100733, + "auxiliary_loss_mlp": 0.0104721, + "balance_loss_clip": 1.03975034, + "balance_loss_mlp": 1.02750516, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 2.036444535199999, + "language_loss": 0.85505521, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87653458, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 2.7158093452453613 + }, + { + "auxiliary_loss_clip": 0.01109315, + "auxiliary_loss_mlp": 0.01048595, + "balance_loss_clip": 1.04140019, + "balance_loss_mlp": 1.02853227, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.9642258543207944, + "language_loss": 0.73570359, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.75728261, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 4.194753408432007 + }, + { + "auxiliary_loss_clip": 0.01131181, + "auxiliary_loss_mlp": 0.0105272, + "balance_loss_clip": 1.04509306, + "balance_loss_mlp": 1.03278863, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.9126073623631945, + "language_loss": 0.83789551, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85973454, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 2.5741961002349854 + }, + { + "auxiliary_loss_clip": 0.01125719, + "auxiliary_loss_mlp": 0.01050343, + "balance_loss_clip": 1.04346073, + "balance_loss_mlp": 1.03092396, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.5096138496560154, + "language_loss": 0.86397445, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88573503, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.01110661, + "auxiliary_loss_mlp": 0.00751028, + "balance_loss_clip": 1.04095697, + "balance_loss_mlp": 1.00007749, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.6071294972454984, + "language_loss": 0.70601237, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72462928, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.7300705909729004 + }, + { + "auxiliary_loss_clip": 0.01097058, + "auxiliary_loss_mlp": 0.0105553, + "balance_loss_clip": 1.03824306, + "balance_loss_mlp": 1.03448915, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 3.3962940825856536, + "language_loss": 0.86665535, + "learning_rate": 3.901425864420852e-06, + "loss": 0.88818121, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 4.0017311573028564 + }, + { + "auxiliary_loss_clip": 0.01130121, + "auxiliary_loss_mlp": 0.01048774, + "balance_loss_clip": 1.04206955, + "balance_loss_mlp": 1.03072596, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.9626924471727283, + "language_loss": 0.87523091, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89701992, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 5.470365285873413 + }, + { + "auxiliary_loss_clip": 0.01120293, + "auxiliary_loss_mlp": 0.00750984, + "balance_loss_clip": 1.04269052, + "balance_loss_mlp": 0.99995601, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.4320669771227434, + "language_loss": 0.87944824, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89816093, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 2.5412955284118652 + }, + { + "auxiliary_loss_clip": 0.0114249, + "auxiliary_loss_mlp": 0.01037016, + "balance_loss_clip": 1.04375529, + "balance_loss_mlp": 1.01845503, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.8209394284587814, + "language_loss": 0.76020944, + "learning_rate": 3.901063255975046e-06, + "loss": 0.78200454, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 2.5913734436035156 + }, + { + "auxiliary_loss_clip": 0.01081152, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_clip": 1.03518271, + "balance_loss_mlp": 1.02533913, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.3197061884557435, + "language_loss": 0.83123326, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85249364, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.754958152770996 + }, + { + "auxiliary_loss_clip": 0.01122198, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.04302216, + "balance_loss_mlp": 1.02907252, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.049426063560206, + "language_loss": 0.79345804, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81515884, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.575678586959839 + }, + { + "auxiliary_loss_clip": 0.01145231, + "auxiliary_loss_mlp": 0.01048803, + "balance_loss_clip": 1.04586995, + "balance_loss_mlp": 1.03000355, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.7376206433429109, + "language_loss": 0.79374868, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81568909, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.5303802490234375 + }, + { + "auxiliary_loss_clip": 0.01133409, + "auxiliary_loss_mlp": 0.00750988, + "balance_loss_clip": 1.04080296, + "balance_loss_mlp": 0.9999578, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 1.9044752129927986, + "language_loss": 0.76061356, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77945751, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 2.6060874462127686 + }, + { + "auxiliary_loss_clip": 0.01132085, + "auxiliary_loss_mlp": 0.00750961, + "balance_loss_clip": 1.04221916, + "balance_loss_mlp": 1.00001168, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.713859959026464, + "language_loss": 0.78187549, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80070591, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.593308210372925 + }, + { + "auxiliary_loss_clip": 0.01099099, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_clip": 1.04198647, + "balance_loss_mlp": 1.02700424, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.4822595652647492, + "language_loss": 0.69246006, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71390492, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 2.828145742416382 + }, + { + "auxiliary_loss_clip": 0.01005441, + "auxiliary_loss_mlp": 0.00749055, + "balance_loss_clip": 1.01124787, + "balance_loss_mlp": 1.00031984, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8376217045747936, + "language_loss": 0.62749141, + "learning_rate": 3.900214646718047e-06, + "loss": 0.6450364, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.3049323558807373 + }, + { + "auxiliary_loss_clip": 0.01119979, + "auxiliary_loss_mlp": 0.01039605, + "balance_loss_clip": 1.04083562, + "balance_loss_mlp": 1.01976824, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 3.1592800568062303, + "language_loss": 0.77055359, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79214948, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.6358468532562256 + }, + { + "auxiliary_loss_clip": 0.01095588, + "auxiliary_loss_mlp": 0.01046505, + "balance_loss_clip": 1.03912938, + "balance_loss_mlp": 1.02533317, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.2225603615719463, + "language_loss": 0.79290223, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81432307, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.6486072540283203 + }, + { + "auxiliary_loss_clip": 0.01111723, + "auxiliary_loss_mlp": 0.01047795, + "balance_loss_clip": 1.04104149, + "balance_loss_mlp": 1.02894783, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 1.9622763140504857, + "language_loss": 0.71128535, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73288059, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.6150569915771484 + }, + { + "auxiliary_loss_clip": 0.01078717, + "auxiliary_loss_mlp": 0.01040788, + "balance_loss_clip": 1.04141295, + "balance_loss_mlp": 1.02139235, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.184080443325723, + "language_loss": 0.72202599, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74322104, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.8105876445770264 + }, + { + "auxiliary_loss_clip": 0.01070363, + "auxiliary_loss_mlp": 0.01046894, + "balance_loss_clip": 1.03331935, + "balance_loss_mlp": 1.02703393, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 4.00196044865382, + "language_loss": 0.82123661, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84240925, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.702144145965576 + }, + { + "auxiliary_loss_clip": 0.01138367, + "auxiliary_loss_mlp": 0.01047776, + "balance_loss_clip": 1.04344213, + "balance_loss_mlp": 1.02764142, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 3.4226552677237425, + "language_loss": 0.80032647, + "learning_rate": 3.899484457098528e-06, + "loss": 0.8221879, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 2.5733277797698975 + }, + { + "auxiliary_loss_clip": 0.01134401, + "auxiliary_loss_mlp": 0.01040973, + "balance_loss_clip": 1.04701197, + "balance_loss_mlp": 1.02217364, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 5.439163007085178, + "language_loss": 0.82787448, + "learning_rate": 3.899362506701421e-06, + "loss": 0.84962821, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.602278232574463 + }, + { + "auxiliary_loss_clip": 0.01118405, + "auxiliary_loss_mlp": 0.01047133, + "balance_loss_clip": 1.04308283, + "balance_loss_mlp": 1.02734411, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 3.612109930327535, + "language_loss": 0.77266252, + "learning_rate": 3.899240484280298e-06, + "loss": 0.7943179, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 2.6435132026672363 + }, + { + "auxiliary_loss_clip": 0.00998754, + "auxiliary_loss_mlp": 0.00998669, + "balance_loss_clip": 1.00620139, + "balance_loss_mlp": 0.99582022, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.9058365107735243, + "language_loss": 0.59181523, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61178946, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.3662803173065186 + }, + { + "auxiliary_loss_clip": 0.01132779, + "auxiliary_loss_mlp": 0.01048225, + "balance_loss_clip": 1.04383528, + "balance_loss_mlp": 1.03010559, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 4.3615561001975545, + "language_loss": 0.82495481, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84676486, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 2.567323923110962 + }, + { + "auxiliary_loss_clip": 0.01137326, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.04540241, + "balance_loss_mlp": 1.02766669, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.644831428468561, + "language_loss": 0.78771484, + "learning_rate": 3.898873984919113e-06, + "loss": 0.80957228, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 2.6650590896606445 + }, + { + "auxiliary_loss_clip": 0.01111413, + "auxiliary_loss_mlp": 0.01046479, + "balance_loss_clip": 1.03951144, + "balance_loss_mlp": 1.02713132, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 3.1506534209784984, + "language_loss": 0.85359168, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.87517059, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 2.6190333366394043 + }, + { + "auxiliary_loss_clip": 0.01122031, + "auxiliary_loss_mlp": 0.01046561, + "balance_loss_clip": 1.04186249, + "balance_loss_mlp": 1.02829838, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.4084860482085393, + "language_loss": 0.86206383, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88374978, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 2.5937438011169434 + }, + { + "auxiliary_loss_clip": 0.01125068, + "auxiliary_loss_mlp": 0.01049739, + "balance_loss_clip": 1.04347718, + "balance_loss_mlp": 1.0304507, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 2.2341418362122867, + "language_loss": 0.68218637, + "learning_rate": 3.898506837508518e-06, + "loss": 0.70393449, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.669663906097412 + }, + { + "auxiliary_loss_clip": 0.01146274, + "auxiliary_loss_mlp": 0.00751224, + "balance_loss_clip": 1.04945219, + "balance_loss_mlp": 1.00000811, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.2889835460502512, + "language_loss": 0.83380699, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85278201, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 2.646411895751953 + }, + { + "auxiliary_loss_clip": 0.01153752, + "auxiliary_loss_mlp": 0.00751124, + "balance_loss_clip": 1.04982352, + "balance_loss_mlp": 1.0000397, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.5371984290643375, + "language_loss": 0.81869918, + "learning_rate": 3.898261712602539e-06, + "loss": 0.83774799, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.5633456707000732 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01052344, + "balance_loss_clip": 1.03897667, + "balance_loss_mlp": 1.03081512, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 1.863470176676088, + "language_loss": 0.7810697, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80268431, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.6352810859680176 + }, + { + "auxiliary_loss_clip": 0.01147701, + "auxiliary_loss_mlp": 0.01053092, + "balance_loss_clip": 1.04383242, + "balance_loss_mlp": 1.03312469, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.4562553789208548, + "language_loss": 0.82240772, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84441566, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.526829481124878 + }, + { + "auxiliary_loss_clip": 0.01120341, + "auxiliary_loss_mlp": 0.01050717, + "balance_loss_clip": 1.04379117, + "balance_loss_mlp": 1.03004599, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.969894963768051, + "language_loss": 0.71343398, + "learning_rate": 3.897893485388149e-06, + "loss": 0.7351445, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.746870756149292 + }, + { + "auxiliary_loss_clip": 0.01128937, + "auxiliary_loss_mlp": 0.01050789, + "balance_loss_clip": 1.04644156, + "balance_loss_mlp": 1.03200161, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.0367325511054077, + "language_loss": 0.71917075, + "learning_rate": 3.897770599040521e-06, + "loss": 0.74096799, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.6637444496154785 + }, + { + "auxiliary_loss_clip": 0.01146845, + "auxiliary_loss_mlp": 0.01045321, + "balance_loss_clip": 1.04714227, + "balance_loss_mlp": 1.02686691, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.9345205868711484, + "language_loss": 0.78950793, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81142956, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.5392801761627197 + }, + { + "auxiliary_loss_clip": 0.01138686, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.04547119, + "balance_loss_mlp": 1.02735662, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 1.8399167238088172, + "language_loss": 0.75967461, + "learning_rate": 3.897524610458975e-06, + "loss": 0.7815336, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.6219046115875244 + }, + { + "auxiliary_loss_clip": 0.01133338, + "auxiliary_loss_mlp": 0.01049246, + "balance_loss_clip": 1.04419911, + "balance_loss_mlp": 1.02928996, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 4.985536554540821, + "language_loss": 0.70139098, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72321689, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 2.56125545501709 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.04681075, + "balance_loss_mlp": 1.02643657, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 2.131187899290643, + "language_loss": 0.84139717, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86331761, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 2.5645554065704346 + }, + { + "auxiliary_loss_clip": 0.01133975, + "auxiliary_loss_mlp": 0.01056532, + "balance_loss_clip": 1.04193759, + "balance_loss_mlp": 1.03747034, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 1.6659394067397229, + "language_loss": 0.78768814, + "learning_rate": 3.897155087940906e-06, + "loss": 0.8095932, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 4.175720453262329 + }, + { + "auxiliary_loss_clip": 0.0109601, + "auxiliary_loss_mlp": 0.00751295, + "balance_loss_clip": 1.04154134, + "balance_loss_mlp": 1.00004864, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 1.6572721305521, + "language_loss": 0.80173624, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82020932, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 2.718461036682129 + }, + { + "auxiliary_loss_clip": 0.01126641, + "auxiliary_loss_mlp": 0.01048878, + "balance_loss_clip": 1.04305446, + "balance_loss_mlp": 1.02921999, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 1.957865037053456, + "language_loss": 0.83216476, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85391992, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 2.5687832832336426 + }, + { + "auxiliary_loss_clip": 0.01137359, + "auxiliary_loss_mlp": 0.01048948, + "balance_loss_clip": 1.04214764, + "balance_loss_mlp": 1.02887321, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.3621200043416475, + "language_loss": 0.76075655, + "learning_rate": 3.896784917960055e-06, + "loss": 0.7826196, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 2.637981414794922 + }, + { + "auxiliary_loss_clip": 0.01079677, + "auxiliary_loss_mlp": 0.01043251, + "balance_loss_clip": 1.03969109, + "balance_loss_mlp": 1.02362883, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.6826585841136876, + "language_loss": 0.86990637, + "learning_rate": 3.896661384107648e-06, + "loss": 0.89113563, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 4.1788341999053955 + }, + { + "auxiliary_loss_clip": 0.01149574, + "auxiliary_loss_mlp": 0.01047874, + "balance_loss_clip": 1.04323149, + "balance_loss_mlp": 1.02802563, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.140327165851341, + "language_loss": 0.80669606, + "learning_rate": 3.896537778333651e-06, + "loss": 0.8286705, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 4.127370357513428 + }, + { + "auxiliary_loss_clip": 0.0115213, + "auxiliary_loss_mlp": 0.01055317, + "balance_loss_clip": 1.04751956, + "balance_loss_mlp": 1.03575516, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.4245992600871986, + "language_loss": 0.74224645, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76432091, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 4.042129755020142 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01046485, + "balance_loss_clip": 1.03713489, + "balance_loss_mlp": 1.02519393, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 1.9886158650483443, + "language_loss": 0.82692033, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84836364, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.7135963439941406 + }, + { + "auxiliary_loss_clip": 0.01130438, + "auxiliary_loss_mlp": 0.01038902, + "balance_loss_clip": 1.04752791, + "balance_loss_mlp": 1.01978111, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6064633221144808, + "language_loss": 0.82329845, + "learning_rate": 3.896166529529008e-06, + "loss": 0.8449918, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 2.5498387813568115 + }, + { + "auxiliary_loss_clip": 0.01120107, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_clip": 1.04081869, + "balance_loss_mlp": 1.02236128, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.297394364672355, + "language_loss": 0.82763582, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84925753, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.651197910308838 + }, + { + "auxiliary_loss_clip": 0.01113639, + "auxiliary_loss_mlp": 0.01047367, + "balance_loss_clip": 1.04175532, + "balance_loss_mlp": 1.02770996, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.6429473727107053, + "language_loss": 0.72719067, + "learning_rate": 3.895918670803968e-06, + "loss": 0.74880069, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.6449365615844727 + }, + { + "auxiliary_loss_clip": 0.01149293, + "auxiliary_loss_mlp": 0.0075129, + "balance_loss_clip": 1.04407811, + "balance_loss_mlp": 1.00015175, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.35076275573195, + "language_loss": 0.81027043, + "learning_rate": 3.895794633598958e-06, + "loss": 0.8292762, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 2.5518362522125244 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.03888917, + "balance_loss_mlp": 1.02202129, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.2335714756444824, + "language_loss": 0.71364379, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.73502582, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 2.6867294311523438 + }, + { + "auxiliary_loss_clip": 0.01088645, + "auxiliary_loss_mlp": 0.01042508, + "balance_loss_clip": 1.0425297, + "balance_loss_mlp": 1.02163458, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 1.6256344447018223, + "language_loss": 0.74724936, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76856095, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 2.8398351669311523 + }, + { + "auxiliary_loss_clip": 0.01147213, + "auxiliary_loss_mlp": 0.01042088, + "balance_loss_clip": 1.04311407, + "balance_loss_mlp": 1.0240283, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.6969998472960697, + "language_loss": 0.83095849, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85285145, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.9363603591918945 + }, + { + "auxiliary_loss_clip": 0.01083296, + "auxiliary_loss_mlp": 0.0105106, + "balance_loss_clip": 1.03797817, + "balance_loss_mlp": 1.03148544, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.7004101333645152, + "language_loss": 0.83632636, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85766989, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.7075915336608887 + }, + { + "auxiliary_loss_clip": 0.01051885, + "auxiliary_loss_mlp": 0.01053389, + "balance_loss_clip": 1.03474689, + "balance_loss_mlp": 1.03126323, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 1.8904613419298713, + "language_loss": 0.80119234, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82224512, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.784144639968872 + }, + { + "auxiliary_loss_clip": 0.01150136, + "auxiliary_loss_mlp": 0.01043563, + "balance_loss_clip": 1.04710364, + "balance_loss_mlp": 1.02361965, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 1.9508871950795577, + "language_loss": 0.66570199, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.687639, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.6186461448669434 + }, + { + "auxiliary_loss_clip": 0.01121624, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.04467106, + "balance_loss_mlp": 1.02589715, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.686383806285863, + "language_loss": 0.67037296, + "learning_rate": 3.8949243605434e-06, + "loss": 0.6920355, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.658510446548462 + }, + { + "auxiliary_loss_clip": 0.01133711, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.04231477, + "balance_loss_mlp": 1.0233345, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9501496420969242, + "language_loss": 0.71833336, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74010396, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 2.6169536113739014 + }, + { + "auxiliary_loss_clip": 0.01105271, + "auxiliary_loss_mlp": 0.01042897, + "balance_loss_clip": 1.04502666, + "balance_loss_mlp": 1.02383494, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 2.301359832962828, + "language_loss": 0.75923526, + "learning_rate": 3.894675064326678e-06, + "loss": 0.7807169, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.656904458999634 + }, + { + "auxiliary_loss_clip": 0.01114252, + "auxiliary_loss_mlp": 0.0105501, + "balance_loss_clip": 1.04614592, + "balance_loss_mlp": 1.03436327, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.385352931137167, + "language_loss": 0.70510387, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72679657, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 2.693615674972534 + }, + { + "auxiliary_loss_clip": 0.01029728, + "auxiliary_loss_mlp": 0.0101379, + "balance_loss_clip": 1.01416373, + "balance_loss_mlp": 1.01077425, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8017999899321069, + "language_loss": 0.59020686, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61064208, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.377324104309082 + }, + { + "auxiliary_loss_clip": 0.01135479, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.04444742, + "balance_loss_mlp": 1.02804625, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.3523185769528867, + "language_loss": 0.80465549, + "learning_rate": 3.894300581166417e-06, + "loss": 0.82647693, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 2.57167911529541 + }, + { + "auxiliary_loss_clip": 0.01144787, + "auxiliary_loss_mlp": 0.01044911, + "balance_loss_clip": 1.04329419, + "balance_loss_mlp": 1.0240258, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.851386412823833, + "language_loss": 0.74743009, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76932704, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 2.6445693969726562 + }, + { + "auxiliary_loss_clip": 0.01109031, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.04087794, + "balance_loss_mlp": 1.02276278, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 1.6935790714109729, + "language_loss": 0.82315218, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84467268, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 2.638878345489502 + }, + { + "auxiliary_loss_clip": 0.01144513, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.04499567, + "balance_loss_mlp": 1.02216315, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.303506535141452, + "language_loss": 0.74780977, + "learning_rate": 3.893925451517562e-06, + "loss": 0.76966691, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 2.4267985820770264 + }, + { + "auxiliary_loss_clip": 0.01107911, + "auxiliary_loss_mlp": 0.01043159, + "balance_loss_clip": 1.04109025, + "balance_loss_mlp": 1.02445519, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.0210244306738265, + "language_loss": 0.84750271, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86901349, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.588315010070801 + }, + { + "auxiliary_loss_clip": 0.01136828, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.04661131, + "balance_loss_mlp": 1.0302633, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.7105252104555655, + "language_loss": 0.90019333, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92205518, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 2.495197057723999 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01042311, + "balance_loss_clip": 1.04333985, + "balance_loss_mlp": 1.02441752, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.9487250366087918, + "language_loss": 0.68533915, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70706081, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.486694812774658 + }, + { + "auxiliary_loss_clip": 0.0109668, + "auxiliary_loss_mlp": 0.01049165, + "balance_loss_clip": 1.03984535, + "balance_loss_mlp": 1.02899516, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 1.9421050866907121, + "language_loss": 0.78458434, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80604279, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.6307523250579834 + }, + { + "auxiliary_loss_clip": 0.01144371, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.04421258, + "balance_loss_mlp": 1.02072942, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 1.7098998427213357, + "language_loss": 0.85800552, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87984508, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.5494179725646973 + }, + { + "auxiliary_loss_clip": 0.01117191, + "auxiliary_loss_mlp": 0.01044944, + "balance_loss_clip": 1.04382539, + "balance_loss_mlp": 1.02466679, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 1.8191822517554168, + "language_loss": 0.82293969, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84456104, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.6324408054351807 + }, + { + "auxiliary_loss_clip": 0.01122486, + "auxiliary_loss_mlp": 0.01047162, + "balance_loss_clip": 1.04341471, + "balance_loss_mlp": 1.02744448, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.079233590466041, + "language_loss": 0.72685063, + "learning_rate": 3.893047635600818e-06, + "loss": 0.74854714, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.530611038208008 + }, + { + "auxiliary_loss_clip": 0.0113375, + "auxiliary_loss_mlp": 0.01042502, + "balance_loss_clip": 1.04376256, + "balance_loss_mlp": 1.02185488, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 1.9504620481128023, + "language_loss": 0.80115354, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82291603, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.5461320877075195 + }, + { + "auxiliary_loss_clip": 0.01009357, + "auxiliary_loss_mlp": 0.01000589, + "balance_loss_clip": 1.0130235, + "balance_loss_mlp": 0.99769229, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8343012836481031, + "language_loss": 0.58978903, + "learning_rate": 3.892796184920778e-06, + "loss": 0.60988849, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.196794271469116 + }, + { + "auxiliary_loss_clip": 0.01068319, + "auxiliary_loss_mlp": 0.01053352, + "balance_loss_clip": 1.0411272, + "balance_loss_mlp": 1.03409958, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 1.664690993937043, + "language_loss": 0.73995537, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76117206, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 2.6862871646881104 + }, + { + "auxiliary_loss_clip": 0.0113459, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_clip": 1.04555917, + "balance_loss_mlp": 1.0243578, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.8636709647379184, + "language_loss": 0.72644979, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74823153, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 2.5665299892425537 + }, + { + "auxiliary_loss_clip": 0.01139599, + "auxiliary_loss_mlp": 0.01056285, + "balance_loss_clip": 1.04890692, + "balance_loss_mlp": 1.03727138, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 1.907105477567966, + "language_loss": 0.74629211, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76825094, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 2.576101779937744 + }, + { + "auxiliary_loss_clip": 0.01107682, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_clip": 1.04495716, + "balance_loss_mlp": 1.02512264, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 1.9725328081910953, + "language_loss": 0.79123926, + "learning_rate": 3.892292422298637e-06, + "loss": 0.81276929, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.6037185192108154 + }, + { + "auxiliary_loss_clip": 0.01086222, + "auxiliary_loss_mlp": 0.01045203, + "balance_loss_clip": 1.03635561, + "balance_loss_mlp": 1.02590287, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.8254322806485452, + "language_loss": 0.85318208, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87449634, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 4.237675666809082 + }, + { + "auxiliary_loss_clip": 0.01027025, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.01829517, + "balance_loss_mlp": 1.0355643, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7633206438224365, + "language_loss": 0.54123926, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56189609, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 3.098670244216919 + }, + { + "auxiliary_loss_clip": 0.01146228, + "auxiliary_loss_mlp": 0.01044513, + "balance_loss_clip": 1.04496503, + "balance_loss_mlp": 1.02528453, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.731041409815781, + "language_loss": 0.72633946, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74824691, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 2.55000901222229 + }, + { + "auxiliary_loss_clip": 0.0111238, + "auxiliary_loss_mlp": 0.00751215, + "balance_loss_clip": 1.04134858, + "balance_loss_mlp": 1.00009704, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.6909326376187142, + "language_loss": 0.77908105, + "learning_rate": 3.891787511581859e-06, + "loss": 0.79771698, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 2.6452085971832275 + }, + { + "auxiliary_loss_clip": 0.01138513, + "auxiliary_loss_mlp": 0.01050324, + "balance_loss_clip": 1.04327261, + "balance_loss_mlp": 1.03165579, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 1.9566896314140412, + "language_loss": 0.74849701, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77038538, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 4.128314971923828 + }, + { + "auxiliary_loss_clip": 0.01149791, + "auxiliary_loss_mlp": 0.01046829, + "balance_loss_clip": 1.04460931, + "balance_loss_mlp": 1.02755284, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 1.940708193043909, + "language_loss": 0.79531652, + "learning_rate": 3.891534625783685e-06, + "loss": 0.81728268, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 4.124161720275879 + }, + { + "auxiliary_loss_clip": 0.01147829, + "auxiliary_loss_mlp": 0.01055812, + "balance_loss_clip": 1.04620504, + "balance_loss_mlp": 1.03735805, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.0981460538673016, + "language_loss": 0.82958943, + "learning_rate": 3.891408075291425e-06, + "loss": 0.8516258, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 2.5621228218078613 + }, + { + "auxiliary_loss_clip": 0.01100685, + "auxiliary_loss_mlp": 0.0104999, + "balance_loss_clip": 1.04323351, + "balance_loss_mlp": 1.03024864, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 1.5758135083007823, + "language_loss": 0.69588995, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71739668, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.7679827213287354 + }, + { + "auxiliary_loss_clip": 0.01148474, + "auxiliary_loss_mlp": 0.01053757, + "balance_loss_clip": 1.04618549, + "balance_loss_mlp": 1.03406358, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 1.6470277518827716, + "language_loss": 0.84551716, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86753953, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.5170483589172363 + }, + { + "auxiliary_loss_clip": 0.01149303, + "auxiliary_loss_mlp": 0.01048845, + "balance_loss_clip": 1.0460422, + "balance_loss_mlp": 1.02938962, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 2.0493342877489202, + "language_loss": 0.86869121, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89067268, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 2.5525736808776855 + }, + { + "auxiliary_loss_clip": 0.01118959, + "auxiliary_loss_mlp": 0.01046191, + "balance_loss_clip": 1.04549861, + "balance_loss_mlp": 1.02767801, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 4.997795254194676, + "language_loss": 0.72812921, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74978071, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.65250563621521 + }, + { + "auxiliary_loss_clip": 0.01096741, + "auxiliary_loss_mlp": 0.01051225, + "balance_loss_clip": 1.03898573, + "balance_loss_mlp": 1.03254509, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.0372081898165986, + "language_loss": 0.74165559, + "learning_rate": 3.890774247090444e-06, + "loss": 0.76313531, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 2.706841230392456 + }, + { + "auxiliary_loss_clip": 0.01138324, + "auxiliary_loss_mlp": 0.01049011, + "balance_loss_clip": 1.04681253, + "balance_loss_mlp": 1.02960372, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 2.028222276785716, + "language_loss": 0.78509557, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80696893, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.6129302978515625 + }, + { + "auxiliary_loss_clip": 0.01101106, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_clip": 1.04184103, + "balance_loss_mlp": 1.02553999, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.185132746862986, + "language_loss": 0.78890276, + "learning_rate": 3.890520213887941e-06, + "loss": 0.81035721, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.669673442840576 + }, + { + "auxiliary_loss_clip": 0.0110131, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.0406332, + "balance_loss_mlp": 1.02288878, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 1.8219101809535747, + "language_loss": 0.74519932, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76661885, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.5615501403808594 + }, + { + "auxiliary_loss_clip": 0.01118565, + "auxiliary_loss_mlp": 0.01043804, + "balance_loss_clip": 1.04036045, + "balance_loss_mlp": 1.02476621, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 1.780649065790205, + "language_loss": 0.83679795, + "learning_rate": 3.890265893930578e-06, + "loss": 0.85842168, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 2.61907958984375 + }, + { + "auxiliary_loss_clip": 0.01125273, + "auxiliary_loss_mlp": 0.01044506, + "balance_loss_clip": 1.04399061, + "balance_loss_mlp": 1.02701831, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.6827614999217981, + "language_loss": 0.85536557, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87706339, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.5894227027893066 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.00751046, + "balance_loss_clip": 1.03949285, + "balance_loss_mlp": 1.00014639, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 1.9531413463593557, + "language_loss": 0.82485884, + "learning_rate": 3.890011287256929e-06, + "loss": 0.84344327, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 2.598163366317749 + }, + { + "auxiliary_loss_clip": 0.01015351, + "auxiliary_loss_mlp": 0.00749227, + "balance_loss_clip": 1.01562476, + "balance_loss_mlp": 1.00046086, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7491706726820809, + "language_loss": 0.5802753, + "learning_rate": 3.889883876413563e-06, + "loss": 0.59792113, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.264343023300171 + }, + { + "auxiliary_loss_clip": 0.01028505, + "auxiliary_loss_mlp": 0.0100858, + "balance_loss_clip": 1.01298738, + "balance_loss_mlp": 1.0056237, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.8072291437077079, + "language_loss": 0.55375099, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57412195, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 3.2005691528320312 + }, + { + "auxiliary_loss_clip": 0.01103473, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.03884053, + "balance_loss_mlp": 1.02581072, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 3.536138544354058, + "language_loss": 0.74967086, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77115786, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 2.5827598571777344 + }, + { + "auxiliary_loss_clip": 0.0108515, + "auxiliary_loss_mlp": 0.01048466, + "balance_loss_clip": 1.03599024, + "balance_loss_mlp": 1.03119278, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 1.8396108619506737, + "language_loss": 0.79187626, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81321239, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 2.646653890609741 + }, + { + "auxiliary_loss_clip": 0.01112928, + "auxiliary_loss_mlp": 0.01050524, + "balance_loss_clip": 1.04052365, + "balance_loss_mlp": 1.03240407, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 2.9069156986284117, + "language_loss": 0.69435233, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71598685, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 2.6613314151763916 + }, + { + "auxiliary_loss_clip": 0.0113714, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_clip": 1.04454982, + "balance_loss_mlp": 1.02689493, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 1.9645491177432268, + "language_loss": 0.81073558, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83256102, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 2.568016529083252 + }, + { + "auxiliary_loss_clip": 0.01131253, + "auxiliary_loss_mlp": 0.01054213, + "balance_loss_clip": 1.04330373, + "balance_loss_mlp": 1.03586626, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 2.1684476013242344, + "language_loss": 0.87236369, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89421844, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 2.5128650665283203 + }, + { + "auxiliary_loss_clip": 0.01119624, + "auxiliary_loss_mlp": 0.01051151, + "balance_loss_clip": 1.04134488, + "balance_loss_mlp": 1.03148174, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.120366886552809, + "language_loss": 0.72599232, + "learning_rate": 3.888989994172501e-06, + "loss": 0.7477001, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.6213748455047607 + }, + { + "auxiliary_loss_clip": 0.01101097, + "auxiliary_loss_mlp": 0.01049061, + "balance_loss_clip": 1.03999937, + "balance_loss_mlp": 1.03038096, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.7522110826447332, + "language_loss": 0.8692385, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89074004, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 2.6400434970855713 + }, + { + "auxiliary_loss_clip": 0.01108054, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.04060626, + "balance_loss_mlp": 1.03388345, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 2.0306106367500054, + "language_loss": 0.77116907, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79276288, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.581921100616455 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.01042648, + "balance_loss_clip": 1.0397346, + "balance_loss_mlp": 1.02562547, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.150652270468457, + "language_loss": 0.78686124, + "learning_rate": 3.888605827226212e-06, + "loss": 0.80845356, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.5676989555358887 + }, + { + "auxiliary_loss_clip": 0.01038366, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.01398146, + "balance_loss_mlp": 1.0246613, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9756059322376835, + "language_loss": 0.68971533, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71037418, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 2.9486725330352783 + }, + { + "auxiliary_loss_clip": 0.01105767, + "auxiliary_loss_mlp": 0.01051348, + "balance_loss_clip": 1.04474306, + "balance_loss_mlp": 1.03390753, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 3.0307068166592086, + "language_loss": 0.67377353, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69534469, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.783345937728882 + }, + { + "auxiliary_loss_clip": 0.01132344, + "auxiliary_loss_mlp": 0.01057737, + "balance_loss_clip": 1.04240775, + "balance_loss_mlp": 1.03869963, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 1.971528004549876, + "language_loss": 0.8307209, + "learning_rate": 3.88822101573484e-06, + "loss": 0.85262173, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.520010471343994 + }, + { + "auxiliary_loss_clip": 0.01147567, + "auxiliary_loss_mlp": 0.01040148, + "balance_loss_clip": 1.0446403, + "balance_loss_mlp": 1.02095497, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0079853455263206, + "language_loss": 0.65769029, + "learning_rate": 3.888092602028167e-06, + "loss": 0.67956746, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.5711677074432373 + }, + { + "auxiliary_loss_clip": 0.01128047, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.04182148, + "balance_loss_mlp": 1.02776098, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.547397104183919, + "language_loss": 0.89336193, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91510683, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.5741310119628906 + }, + { + "auxiliary_loss_clip": 0.01123092, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_clip": 1.04241133, + "balance_loss_mlp": 1.02663589, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0306937514683834, + "language_loss": 0.73828012, + "learning_rate": 3.887835559829712e-06, + "loss": 0.75995779, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 2.6211116313934326 + }, + { + "auxiliary_loss_clip": 0.01131953, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_clip": 1.04255962, + "balance_loss_mlp": 1.02745378, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 1.9399540871811964, + "language_loss": 0.85432535, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87611014, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 2.52877140045166 + }, + { + "auxiliary_loss_clip": 0.01107403, + "auxiliary_loss_mlp": 0.01045977, + "balance_loss_clip": 1.03997326, + "balance_loss_mlp": 1.02599728, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.4825666983187773, + "language_loss": 0.81206787, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83360165, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 2.5977652072906494 + }, + { + "auxiliary_loss_clip": 0.01083565, + "auxiliary_loss_mlp": 0.01047184, + "balance_loss_clip": 1.03926647, + "balance_loss_mlp": 1.02805126, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.038814239532175, + "language_loss": 0.74155748, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76286501, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.6851682662963867 + }, + { + "auxiliary_loss_clip": 0.0110314, + "auxiliary_loss_mlp": 0.01045411, + "balance_loss_clip": 1.04196537, + "balance_loss_mlp": 1.02754116, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 3.945462304880495, + "language_loss": 0.80434948, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82583505, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.570192813873291 + }, + { + "auxiliary_loss_clip": 0.01085006, + "auxiliary_loss_mlp": 0.0105189, + "balance_loss_clip": 1.0382297, + "balance_loss_mlp": 1.03100491, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.5149688016310812, + "language_loss": 0.72266573, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74403471, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 2.6802749633789062 + }, + { + "auxiliary_loss_clip": 0.01097366, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.0394187, + "balance_loss_mlp": 1.02394295, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.742201341378275, + "language_loss": 0.65720296, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67861271, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 4.099465847015381 + }, + { + "auxiliary_loss_clip": 0.01140981, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.03992248, + "balance_loss_mlp": 1.02048659, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 19.713080229772295, + "language_loss": 0.81429088, + "learning_rate": 3.886933657403615e-06, + "loss": 0.83609378, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 2.4833762645721436 + }, + { + "auxiliary_loss_clip": 0.01117627, + "auxiliary_loss_mlp": 0.01046478, + "balance_loss_clip": 1.04137635, + "balance_loss_mlp": 1.02721357, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.7890722496876361, + "language_loss": 0.82167804, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84331906, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 2.615013599395752 + }, + { + "auxiliary_loss_clip": 0.0112726, + "auxiliary_loss_mlp": 0.01050454, + "balance_loss_clip": 1.04147804, + "balance_loss_mlp": 1.02991474, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.5340866432379396, + "language_loss": 0.86441505, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88619214, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 2.5702037811279297 + }, + { + "auxiliary_loss_clip": 0.01145381, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.043926, + "balance_loss_mlp": 1.02076757, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 2.836148266062916, + "language_loss": 0.7733146, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79517049, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 4.037550926208496 + }, + { + "auxiliary_loss_clip": 0.01122797, + "auxiliary_loss_mlp": 0.01049888, + "balance_loss_clip": 1.04231668, + "balance_loss_mlp": 1.02887177, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.8950374553126696, + "language_loss": 0.79158354, + "learning_rate": 3.886416710321491e-06, + "loss": 0.81331038, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 4.047831058502197 + }, + { + "auxiliary_loss_clip": 0.01122884, + "auxiliary_loss_mlp": 0.01047575, + "balance_loss_clip": 1.04393673, + "balance_loss_mlp": 1.02720273, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.1161447572121106, + "language_loss": 0.68266612, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70437074, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.6519389152526855 + }, + { + "auxiliary_loss_clip": 0.01124467, + "auxiliary_loss_mlp": 0.01046728, + "balance_loss_clip": 1.04289031, + "balance_loss_mlp": 1.02776194, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.5563653593340665, + "language_loss": 0.80976331, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83147526, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.5493640899658203 + }, + { + "auxiliary_loss_clip": 0.01083486, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.03637123, + "balance_loss_mlp": 1.02449155, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.6003425326357112, + "language_loss": 0.77797377, + "learning_rate": 3.886028248895093e-06, + "loss": 0.7992475, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 2.6458117961883545 + }, + { + "auxiliary_loss_clip": 0.01141241, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.04509664, + "balance_loss_mlp": 1.01913786, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.595903661591601, + "language_loss": 0.82881027, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85058343, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.5398690700531006 + }, + { + "auxiliary_loss_clip": 0.01146891, + "auxiliary_loss_mlp": 0.01053931, + "balance_loss_clip": 1.04456413, + "balance_loss_mlp": 1.03304601, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 2.1044897420017437, + "language_loss": 0.64849961, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67050779, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 2.558104991912842 + }, + { + "auxiliary_loss_clip": 0.01101775, + "auxiliary_loss_mlp": 0.01036922, + "balance_loss_clip": 1.03654146, + "balance_loss_mlp": 1.01800382, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.3681759796239117, + "language_loss": 0.72717166, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74855864, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.6503255367279053 + }, + { + "auxiliary_loss_clip": 0.01128544, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.04144216, + "balance_loss_mlp": 1.02585578, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6442049509532355, + "language_loss": 0.86980951, + "learning_rate": 3.88550929909221e-06, + "loss": 0.89153111, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.4985780715942383 + }, + { + "auxiliary_loss_clip": 0.01126736, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.04187369, + "balance_loss_mlp": 1.0275445, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.6626803665672714, + "language_loss": 0.78817683, + "learning_rate": 3.88537938288243e-06, + "loss": 0.80990076, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.519649028778076 + }, + { + "auxiliary_loss_clip": 0.01002896, + "auxiliary_loss_mlp": 0.01017722, + "balance_loss_clip": 1.01885426, + "balance_loss_mlp": 1.01554096, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7555183022012218, + "language_loss": 0.60601217, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62621838, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 3.3544509410858154 + }, + { + "auxiliary_loss_clip": 0.01139718, + "auxiliary_loss_mlp": 0.01052686, + "balance_loss_clip": 1.04549241, + "balance_loss_mlp": 1.03159738, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 1.8890065244805438, + "language_loss": 0.81156802, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83349204, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 2.553555965423584 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.0103943, + "balance_loss_clip": 1.04191065, + "balance_loss_mlp": 1.02178693, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 1.6823914517774399, + "language_loss": 0.76812565, + "learning_rate": 3.884989205310157e-06, + "loss": 0.78970593, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 2.569425582885742 + }, + { + "auxiliary_loss_clip": 0.01106921, + "auxiliary_loss_mlp": 0.01054101, + "balance_loss_clip": 1.04336274, + "balance_loss_mlp": 1.03592157, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.534354042085174, + "language_loss": 0.84346271, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86507297, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.7062032222747803 + }, + { + "auxiliary_loss_clip": 0.01135967, + "auxiliary_loss_mlp": 0.01048034, + "balance_loss_clip": 1.04513192, + "balance_loss_mlp": 1.02737498, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9315886311761599, + "language_loss": 0.82281172, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84465176, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 2.5714941024780273 + }, + { + "auxiliary_loss_clip": 0.01144452, + "auxiliary_loss_mlp": 0.01047931, + "balance_loss_clip": 1.04349029, + "balance_loss_mlp": 1.02785647, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.7544599896174946, + "language_loss": 0.85851359, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88043737, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.5666141510009766 + }, + { + "auxiliary_loss_clip": 0.01037292, + "auxiliary_loss_mlp": 0.01013885, + "balance_loss_clip": 1.0129559, + "balance_loss_mlp": 1.01133406, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7580518664144443, + "language_loss": 0.61796266, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63847446, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 3.2017147541046143 + }, + { + "auxiliary_loss_clip": 0.01136102, + "auxiliary_loss_mlp": 0.01050041, + "balance_loss_clip": 1.04626155, + "balance_loss_mlp": 1.03134918, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 1.6157922567579432, + "language_loss": 0.89102435, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91288579, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 2.6081864833831787 + }, + { + "auxiliary_loss_clip": 0.01107424, + "auxiliary_loss_mlp": 0.01059169, + "balance_loss_clip": 1.03754675, + "balance_loss_mlp": 1.03564918, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 1.9322105157595422, + "language_loss": 0.84427643, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86594248, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 2.5846071243286133 + }, + { + "auxiliary_loss_clip": 0.01146467, + "auxiliary_loss_mlp": 0.01052236, + "balance_loss_clip": 1.04435349, + "balance_loss_mlp": 1.03249502, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.2562052333711557, + "language_loss": 0.74413937, + "learning_rate": 3.884076289441196e-06, + "loss": 0.76612645, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.5674407482147217 + }, + { + "auxiliary_loss_clip": 0.01106402, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.04002643, + "balance_loss_mlp": 1.02982402, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 2.031886165880053, + "language_loss": 0.83005846, + "learning_rate": 3.88394558707144e-06, + "loss": 0.851623, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 2.627708673477173 + }, + { + "auxiliary_loss_clip": 0.01129979, + "auxiliary_loss_mlp": 0.00751367, + "balance_loss_clip": 1.04401517, + "balance_loss_mlp": 1.0000217, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.112133385657089, + "language_loss": 0.81301868, + "learning_rate": 3.883814813262277e-06, + "loss": 0.83183217, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.5707383155822754 + }, + { + "auxiliary_loss_clip": 0.0113707, + "auxiliary_loss_mlp": 0.01053248, + "balance_loss_clip": 1.0435549, + "balance_loss_mlp": 1.03139675, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.385630575631742, + "language_loss": 0.82668519, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84858841, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.5132453441619873 + }, + { + "auxiliary_loss_clip": 0.01104602, + "auxiliary_loss_mlp": 0.010504, + "balance_loss_clip": 1.04184747, + "balance_loss_mlp": 1.03214908, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 2.2815322315703046, + "language_loss": 0.73577535, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75732541, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 2.655020236968994 + }, + { + "auxiliary_loss_clip": 0.01121382, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.04251528, + "balance_loss_mlp": 1.03316522, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.2750487326937314, + "language_loss": 0.74556738, + "learning_rate": 3.883422063247961e-06, + "loss": 0.76730669, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 2.6335995197296143 + }, + { + "auxiliary_loss_clip": 0.01147971, + "auxiliary_loss_mlp": 0.0104983, + "balance_loss_clip": 1.04446626, + "balance_loss_mlp": 1.03027964, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 3.415867340749494, + "language_loss": 0.63273466, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65471262, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.607351064682007 + }, + { + "auxiliary_loss_clip": 0.01125214, + "auxiliary_loss_mlp": 0.0104435, + "balance_loss_clip": 1.04141903, + "balance_loss_mlp": 1.02499068, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.184816240179944, + "language_loss": 0.82982397, + "learning_rate": 3.883159872799043e-06, + "loss": 0.85151958, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.6488828659057617 + }, + { + "auxiliary_loss_clip": 0.0108563, + "auxiliary_loss_mlp": 0.010571, + "balance_loss_clip": 1.04324818, + "balance_loss_mlp": 1.03452182, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7928150325135193, + "language_loss": 0.88283068, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90425801, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.6820950508117676 + }, + { + "auxiliary_loss_clip": 0.01136703, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.04320109, + "balance_loss_mlp": 1.02773547, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 3.105976755447211, + "language_loss": 0.71099365, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73284733, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 2.522908926010132 + }, + { + "auxiliary_loss_clip": 0.0109793, + "auxiliary_loss_mlp": 0.01041251, + "balance_loss_clip": 1.04713559, + "balance_loss_mlp": 1.02165294, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 1.826518348480037, + "language_loss": 0.66410464, + "learning_rate": 3.882766051566027e-06, + "loss": 0.68549645, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 2.712451696395874 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.0105825, + "balance_loss_clip": 1.05334413, + "balance_loss_mlp": 1.03903317, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.816392600632902, + "language_loss": 0.76410347, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78582394, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.7129323482513428 + }, + { + "auxiliary_loss_clip": 0.01110515, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.03922307, + "balance_loss_mlp": 1.02375984, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 1.942755192556501, + "language_loss": 0.81720948, + "learning_rate": 3.882503147095667e-06, + "loss": 0.8387481, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.569995880126953 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.02337742, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 1.7313752744763564, + "language_loss": 0.76305711, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78482765, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.657545804977417 + }, + { + "auxiliary_loss_clip": 0.01114151, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_clip": 1.04376328, + "balance_loss_mlp": 1.02427781, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 4.939455342434234, + "language_loss": 0.80687588, + "learning_rate": 3.882239957086477e-06, + "loss": 0.82845896, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.604822874069214 + }, + { + "auxiliary_loss_clip": 0.01118758, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.04154372, + "balance_loss_mlp": 1.02982485, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 5.24655466492856, + "language_loss": 0.75565732, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77734756, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 2.7164764404296875 + }, + { + "auxiliary_loss_clip": 0.0113404, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_clip": 1.04231501, + "balance_loss_mlp": 1.03283489, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 1.9978070805197572, + "language_loss": 0.80500662, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82688576, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 2.63468599319458 + }, + { + "auxiliary_loss_clip": 0.01047021, + "auxiliary_loss_mlp": 0.01000272, + "balance_loss_clip": 1.02193713, + "balance_loss_mlp": 0.99793547, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.6933904124682659, + "language_loss": 0.60690928, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62738222, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 4.715377330780029 + }, + { + "auxiliary_loss_clip": 0.01144098, + "auxiliary_loss_mlp": 0.00751192, + "balance_loss_clip": 1.04417026, + "balance_loss_mlp": 0.9999243, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.824239422865466, + "language_loss": 0.77935743, + "learning_rate": 3.881712720611336e-06, + "loss": 0.7983104, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.5386946201324463 + }, + { + "auxiliary_loss_clip": 0.01128991, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.04159975, + "balance_loss_mlp": 1.02497935, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 1.7974970681666143, + "language_loss": 0.78611243, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80785894, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 2.568692207336426 + }, + { + "auxiliary_loss_clip": 0.01134573, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.04415703, + "balance_loss_mlp": 1.02356625, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.3301500136692828, + "language_loss": 0.82205033, + "learning_rate": 3.881448674225356e-06, + "loss": 0.84382445, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 2.523456573486328 + }, + { + "auxiliary_loss_clip": 0.01142689, + "auxiliary_loss_mlp": 0.0105599, + "balance_loss_clip": 1.04387641, + "balance_loss_mlp": 1.03279209, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.3575145055464346, + "language_loss": 0.69520456, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71719134, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 4.09121036529541 + }, + { + "auxiliary_loss_clip": 0.01137421, + "auxiliary_loss_mlp": 0.00751258, + "balance_loss_clip": 1.04327798, + "balance_loss_mlp": 0.99997365, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 12.19571904200818, + "language_loss": 0.8088547, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82774144, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 4.025057792663574 + }, + { + "auxiliary_loss_clip": 0.01138944, + "auxiliary_loss_mlp": 0.01046663, + "balance_loss_clip": 1.05283904, + "balance_loss_mlp": 1.0262301, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.3957555505698687, + "language_loss": 0.75272405, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77458012, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 4.098391771316528 + }, + { + "auxiliary_loss_clip": 0.01075534, + "auxiliary_loss_mlp": 0.01049151, + "balance_loss_clip": 1.03660476, + "balance_loss_mlp": 1.02881384, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 1.8148851347835115, + "language_loss": 0.76550651, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78675336, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.705648183822632 + }, + { + "auxiliary_loss_clip": 0.01084332, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_clip": 1.03611207, + "balance_loss_mlp": 1.02412295, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.6263580753492488, + "language_loss": 0.80050188, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82177609, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.7419023513793945 + }, + { + "auxiliary_loss_clip": 0.01153592, + "auxiliary_loss_mlp": 0.01057355, + "balance_loss_clip": 1.04770696, + "balance_loss_mlp": 1.03847206, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.710055735823191, + "language_loss": 0.83429724, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85640669, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.525146484375 + }, + { + "auxiliary_loss_clip": 0.01119719, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.04027271, + "balance_loss_mlp": 1.03087449, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.6123631528898763, + "language_loss": 0.73488021, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75657052, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 2.5931382179260254 + }, + { + "auxiliary_loss_clip": 0.01133383, + "auxiliary_loss_mlp": 0.01047437, + "balance_loss_clip": 1.0455507, + "balance_loss_mlp": 1.02941298, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.9128299290012114, + "language_loss": 0.84396636, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86577457, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 2.6150474548339844 + }, + { + "auxiliary_loss_clip": 0.01128934, + "auxiliary_loss_mlp": 0.01048932, + "balance_loss_clip": 1.04263139, + "balance_loss_mlp": 1.02778411, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.7838199051000978, + "language_loss": 0.7504102, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77218878, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.7036163806915283 + }, + { + "auxiliary_loss_clip": 0.01125627, + "auxiliary_loss_mlp": 0.01048836, + "balance_loss_clip": 1.04516292, + "balance_loss_mlp": 1.02960765, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.5698199340167216, + "language_loss": 0.74663186, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76837647, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.713799238204956 + }, + { + "auxiliary_loss_clip": 0.0110538, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.04312801, + "balance_loss_mlp": 1.02573776, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.0505838432134857, + "language_loss": 0.86263347, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88416374, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 2.7169086933135986 + }, + { + "auxiliary_loss_clip": 0.01099559, + "auxiliary_loss_mlp": 0.01049435, + "balance_loss_clip": 1.03918803, + "balance_loss_mlp": 1.02835917, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 1.896195366065552, + "language_loss": 0.68183661, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70332658, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 2.8095622062683105 + }, + { + "auxiliary_loss_clip": 0.0108845, + "auxiliary_loss_mlp": 0.01058708, + "balance_loss_clip": 1.0429039, + "balance_loss_mlp": 1.03626084, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 2.6459438227300085, + "language_loss": 0.8652668, + "learning_rate": 3.879725418400005e-06, + "loss": 0.88673842, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.7297399044036865 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.00751159, + "balance_loss_clip": 1.03818154, + "balance_loss_mlp": 1.00006652, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.9544863803878638, + "language_loss": 0.74380332, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76237047, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 2.6289539337158203 + }, + { + "auxiliary_loss_clip": 0.01018652, + "auxiliary_loss_mlp": 0.01015593, + "balance_loss_clip": 1.01449108, + "balance_loss_mlp": 1.01317334, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7071580356875061, + "language_loss": 0.51593745, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53627992, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.2828550338745117 + }, + { + "auxiliary_loss_clip": 0.01133333, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.04160404, + "balance_loss_mlp": 1.02407193, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 1.824587960956965, + "language_loss": 0.71109468, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73286378, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 2.615081310272217 + }, + { + "auxiliary_loss_clip": 0.01133629, + "auxiliary_loss_mlp": 0.0104167, + "balance_loss_clip": 1.04387033, + "balance_loss_mlp": 1.02325225, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.527494968491616, + "language_loss": 0.79871422, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82046717, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.561521530151367 + }, + { + "auxiliary_loss_clip": 0.01132538, + "auxiliary_loss_mlp": 0.01044415, + "balance_loss_clip": 1.0416863, + "balance_loss_mlp": 1.02554429, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.381194591425762, + "language_loss": 0.78394282, + "learning_rate": 3.879059419522011e-06, + "loss": 0.8057124, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.6250860691070557 + }, + { + "auxiliary_loss_clip": 0.0110127, + "auxiliary_loss_mlp": 0.01045009, + "balance_loss_clip": 1.04171026, + "balance_loss_mlp": 1.02793872, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.0562579978796927, + "language_loss": 0.80291295, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82437575, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.7935752868652344 + }, + { + "auxiliary_loss_clip": 0.01129365, + "auxiliary_loss_mlp": 0.01044056, + "balance_loss_clip": 1.0404501, + "balance_loss_mlp": 1.02526844, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 2.122152336894989, + "language_loss": 0.78295624, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80469048, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 2.617780923843384 + }, + { + "auxiliary_loss_clip": 0.01128553, + "auxiliary_loss_mlp": 0.01060261, + "balance_loss_clip": 1.04403305, + "balance_loss_mlp": 1.0414381, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.8815056938169326, + "language_loss": 0.78410089, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80598897, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.65006160736084 + }, + { + "auxiliary_loss_clip": 0.01088935, + "auxiliary_loss_mlp": 0.01048296, + "balance_loss_clip": 1.0457077, + "balance_loss_mlp": 1.02948451, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.9506641384836345, + "language_loss": 0.69009751, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71146977, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.768434762954712 + }, + { + "auxiliary_loss_clip": 0.01112525, + "auxiliary_loss_mlp": 0.0104654, + "balance_loss_clip": 1.04209232, + "balance_loss_mlp": 1.02706146, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7450486787115211, + "language_loss": 0.86608821, + "learning_rate": 3.878391639291116e-06, + "loss": 0.88767886, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.6169240474700928 + }, + { + "auxiliary_loss_clip": 0.0114247, + "auxiliary_loss_mlp": 0.01045283, + "balance_loss_clip": 1.04319334, + "balance_loss_mlp": 1.02606678, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 1.8751398075962287, + "language_loss": 0.75167692, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77355444, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.6049022674560547 + }, + { + "auxiliary_loss_clip": 0.01108887, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.04368186, + "balance_loss_mlp": 1.02095985, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.3454699091754296, + "language_loss": 0.82804483, + "learning_rate": 3.878124028561692e-06, + "loss": 0.84952509, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.655402183532715 + }, + { + "auxiliary_loss_clip": 0.01113508, + "auxiliary_loss_mlp": 0.00750957, + "balance_loss_clip": 1.04151666, + "balance_loss_mlp": 1.00003648, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 1.9157070985647944, + "language_loss": 0.86044419, + "learning_rate": 3.877990116366466e-06, + "loss": 0.87908888, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 2.666609764099121 + }, + { + "auxiliary_loss_clip": 0.01038113, + "auxiliary_loss_mlp": 0.01011929, + "balance_loss_clip": 1.01661515, + "balance_loss_mlp": 1.00948477, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7570762366648637, + "language_loss": 0.65616965, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67667007, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 3.2583627700805664 + }, + { + "auxiliary_loss_clip": 0.01126244, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.04160023, + "balance_loss_mlp": 1.02158713, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 1.8067577682602314, + "language_loss": 0.78768742, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80934048, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 2.599285364151001 + }, + { + "auxiliary_loss_clip": 0.01134185, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.04535866, + "balance_loss_mlp": 1.02064919, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.7787423671496374, + "language_loss": 0.7782746, + "learning_rate": 3.877587952519672e-06, + "loss": 0.79999983, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 2.6639091968536377 + }, + { + "auxiliary_loss_clip": 0.01046464, + "auxiliary_loss_mlp": 0.0104681, + "balance_loss_clip": 1.03238106, + "balance_loss_mlp": 1.02838027, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 1.7169480675514799, + "language_loss": 0.88066757, + "learning_rate": 3.877453755500647e-06, + "loss": 0.9016003, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.7669105529785156 + }, + { + "auxiliary_loss_clip": 0.01045167, + "auxiliary_loss_mlp": 0.01004183, + "balance_loss_clip": 1.01202214, + "balance_loss_mlp": 1.00177467, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8724540360627634, + "language_loss": 0.59004271, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61053622, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 3.164172649383545 + }, + { + "auxiliary_loss_clip": 0.0114833, + "auxiliary_loss_mlp": 0.00751091, + "balance_loss_clip": 1.04576302, + "balance_loss_mlp": 1.00007629, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.5968960727766448, + "language_loss": 0.796606, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81560022, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 2.5613691806793213 + }, + { + "auxiliary_loss_clip": 0.01106718, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.04080963, + "balance_loss_mlp": 1.02015555, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 3.302652448477566, + "language_loss": 0.77572471, + "learning_rate": 3.877050737304533e-06, + "loss": 0.79717839, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.7970802783966064 + }, + { + "auxiliary_loss_clip": 0.01106178, + "auxiliary_loss_mlp": 0.01041478, + "balance_loss_clip": 1.04046249, + "balance_loss_mlp": 1.0219996, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 2.2222006345302727, + "language_loss": 0.68119347, + "learning_rate": 3.876916255543129e-06, + "loss": 0.70267004, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 2.6665947437286377 + }, + { + "auxiliary_loss_clip": 0.01143266, + "auxiliary_loss_mlp": 0.01047811, + "balance_loss_clip": 1.04385936, + "balance_loss_mlp": 1.02855909, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.7279087685523173, + "language_loss": 0.84095919, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86286998, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 2.556117296218872 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01044679, + "balance_loss_clip": 1.04460287, + "balance_loss_mlp": 1.02572453, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 3.3929240959543825, + "language_loss": 0.81861669, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84053838, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 2.605992078781128 + }, + { + "auxiliary_loss_clip": 0.01108853, + "auxiliary_loss_mlp": 0.00751004, + "balance_loss_clip": 1.04547334, + "balance_loss_mlp": 1.00001633, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 8.605311377674779, + "language_loss": 0.86974514, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88834369, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 4.260493755340576 + }, + { + "auxiliary_loss_clip": 0.01144611, + "auxiliary_loss_mlp": 0.01047524, + "balance_loss_clip": 1.04575169, + "balance_loss_mlp": 1.028355, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.357337632998073, + "language_loss": 0.80443251, + "learning_rate": 3.876377616820024e-06, + "loss": 0.82635391, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 2.58880615234375 + }, + { + "auxiliary_loss_clip": 0.01101083, + "auxiliary_loss_mlp": 0.01046047, + "balance_loss_clip": 1.04004216, + "balance_loss_mlp": 1.02745032, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 4.714315523071912, + "language_loss": 0.85305327, + "learning_rate": 3.876242779245409e-06, + "loss": 0.87452459, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 2.581031560897827 + }, + { + "auxiliary_loss_clip": 0.01132307, + "auxiliary_loss_mlp": 0.01048845, + "balance_loss_clip": 1.04266, + "balance_loss_mlp": 1.02974784, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.129883619475135, + "language_loss": 0.77320272, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79501426, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.628251314163208 + }, + { + "auxiliary_loss_clip": 0.01142226, + "auxiliary_loss_mlp": 0.00751101, + "balance_loss_clip": 1.04508138, + "balance_loss_mlp": 1.00011098, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.8426019633779676, + "language_loss": 0.77106988, + "learning_rate": 3.875972890659349e-06, + "loss": 0.79000318, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 4.066301107406616 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.04325485, + "balance_loss_mlp": 1.02701426, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 1.81931225373322, + "language_loss": 0.80120313, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82287931, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 4.1386635303497314 + }, + { + "auxiliary_loss_clip": 0.01021241, + "auxiliary_loss_mlp": 0.01007079, + "balance_loss_clip": 1.01184845, + "balance_loss_mlp": 1.0047307, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8498403526138708, + "language_loss": 0.59098727, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61127049, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 4.667682886123657 + }, + { + "auxiliary_loss_clip": 0.01092191, + "auxiliary_loss_mlp": 0.01047838, + "balance_loss_clip": 1.03899574, + "balance_loss_mlp": 1.02820432, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.2948424158220337, + "language_loss": 0.65295905, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67435932, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.757176160812378 + }, + { + "auxiliary_loss_clip": 0.01067809, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.03475189, + "balance_loss_mlp": 1.02453494, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.5180695416518828, + "language_loss": 0.70600617, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72712159, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.6889607906341553 + }, + { + "auxiliary_loss_clip": 0.01088769, + "auxiliary_loss_mlp": 0.01056865, + "balance_loss_clip": 1.03498769, + "balance_loss_mlp": 1.03504944, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 1.8360052354650114, + "language_loss": 0.86148113, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88293743, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 2.6602156162261963 + }, + { + "auxiliary_loss_clip": 0.01094528, + "auxiliary_loss_mlp": 0.01047152, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.02909207, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.5815056724211822, + "language_loss": 0.66873991, + "learning_rate": 3.875161517775226e-06, + "loss": 0.6901567, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.7757983207702637 + }, + { + "auxiliary_loss_clip": 0.01103011, + "auxiliary_loss_mlp": 0.01052591, + "balance_loss_clip": 1.03934383, + "balance_loss_mlp": 1.03211045, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.8889950244840295, + "language_loss": 0.89231277, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91386878, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 2.639626979827881 + }, + { + "auxiliary_loss_clip": 0.0113193, + "auxiliary_loss_mlp": 0.01054665, + "balance_loss_clip": 1.04168129, + "balance_loss_mlp": 1.03537691, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 2.802197603202962, + "language_loss": 0.70332325, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.72518921, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.5829646587371826 + }, + { + "auxiliary_loss_clip": 0.01109746, + "auxiliary_loss_mlp": 0.00751025, + "balance_loss_clip": 1.04274416, + "balance_loss_mlp": 1.00011671, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.8667581376076334, + "language_loss": 0.81763828, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83624607, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 2.6174697875976562 + }, + { + "auxiliary_loss_clip": 0.01127393, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.0426203, + "balance_loss_mlp": 1.026021, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.7709693189138926, + "language_loss": 0.88873196, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91043842, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 2.563687324523926 + }, + { + "auxiliary_loss_clip": 0.01095806, + "auxiliary_loss_mlp": 0.0105799, + "balance_loss_clip": 1.03909957, + "balance_loss_mlp": 1.03793859, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.0839484811685365, + "language_loss": 0.84768331, + "learning_rate": 3.874483418234632e-06, + "loss": 0.86922121, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 2.640630006790161 + }, + { + "auxiliary_loss_clip": 0.0112769, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.04083717, + "balance_loss_mlp": 1.021559, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.7139758819657311, + "language_loss": 0.74029821, + "learning_rate": 3.874347585064131e-06, + "loss": 0.7619766, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.617086410522461 + }, + { + "auxiliary_loss_clip": 0.01127463, + "auxiliary_loss_mlp": 0.01043188, + "balance_loss_clip": 1.04004908, + "balance_loss_mlp": 1.02462745, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 4.703160978143816, + "language_loss": 0.78270316, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80440974, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 2.503770351409912 + }, + { + "auxiliary_loss_clip": 0.01119555, + "auxiliary_loss_mlp": 0.01041599, + "balance_loss_clip": 1.04111302, + "balance_loss_mlp": 1.02310944, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.183147854927106, + "language_loss": 0.71659577, + "learning_rate": 3.87407570550194e-06, + "loss": 0.7382074, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 2.587191581726074 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01047516, + "balance_loss_clip": 1.04388237, + "balance_loss_mlp": 1.03005147, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.6545034625715407, + "language_loss": 0.72323358, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74506241, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.513683795928955 + }, + { + "auxiliary_loss_clip": 0.0103469, + "auxiliary_loss_mlp": 0.01003303, + "balance_loss_clip": 1.01134825, + "balance_loss_mlp": 1.00096667, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8278982599339251, + "language_loss": 0.56064987, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58102977, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 2.9775986671447754 + }, + { + "auxiliary_loss_clip": 0.01101739, + "auxiliary_loss_mlp": 0.010415, + "balance_loss_clip": 1.03910661, + "balance_loss_mlp": 1.02301073, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7587006984185145, + "language_loss": 0.82920164, + "learning_rate": 3.873667353183016e-06, + "loss": 0.8506341, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 2.659080743789673 + }, + { + "auxiliary_loss_clip": 0.01108465, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.04204881, + "balance_loss_mlp": 1.02257931, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.7512382999928393, + "language_loss": 0.8131395, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83462596, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.6506259441375732 + }, + { + "auxiliary_loss_clip": 0.01086531, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.04022861, + "balance_loss_mlp": 1.02383065, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.6322255049649483, + "language_loss": 0.82473087, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84605056, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.684262752532959 + }, + { + "auxiliary_loss_clip": 0.01126999, + "auxiliary_loss_mlp": 0.01039717, + "balance_loss_clip": 1.04569387, + "balance_loss_mlp": 1.02090597, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.775816817489863, + "language_loss": 0.80336618, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82503331, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.585785150527954 + }, + { + "auxiliary_loss_clip": 0.01125137, + "auxiliary_loss_mlp": 0.01045387, + "balance_loss_clip": 1.04027295, + "balance_loss_mlp": 1.02714825, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.079696385929465, + "language_loss": 0.79102218, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81272739, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.571946144104004 + }, + { + "auxiliary_loss_clip": 0.01130593, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.04397559, + "balance_loss_mlp": 1.0241375, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.1606895961061356, + "language_loss": 0.80294019, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82468498, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.60060715675354 + }, + { + "auxiliary_loss_clip": 0.01069691, + "auxiliary_loss_mlp": 0.0105185, + "balance_loss_clip": 1.03569925, + "balance_loss_mlp": 1.03325343, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.8412232593025548, + "language_loss": 0.65476251, + "learning_rate": 3.872848730344146e-06, + "loss": 0.67597795, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.852159261703491 + }, + { + "auxiliary_loss_clip": 0.01124335, + "auxiliary_loss_mlp": 0.01043087, + "balance_loss_clip": 1.04187191, + "balance_loss_mlp": 1.02474093, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.482649058211374, + "language_loss": 0.78708816, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80876243, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 2.5786139965057373 + }, + { + "auxiliary_loss_clip": 0.01136055, + "auxiliary_loss_mlp": 0.01045281, + "balance_loss_clip": 1.0415473, + "balance_loss_mlp": 1.02725673, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 2.2917177984143153, + "language_loss": 0.8037045, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82551789, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 2.5178675651550293 + }, + { + "auxiliary_loss_clip": 0.01125778, + "auxiliary_loss_mlp": 0.01046034, + "balance_loss_clip": 1.04374349, + "balance_loss_mlp": 1.02816486, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.869467904123168, + "language_loss": 0.77388895, + "learning_rate": 3.87243846010358e-06, + "loss": 0.79560709, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.6264567375183105 + }, + { + "auxiliary_loss_clip": 0.01020922, + "auxiliary_loss_mlp": 0.01016855, + "balance_loss_clip": 1.00814795, + "balance_loss_mlp": 1.01445889, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8412336522658079, + "language_loss": 0.61528969, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63566744, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.0999109745025635 + }, + { + "auxiliary_loss_clip": 0.01120335, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.03779149, + "balance_loss_mlp": 1.02363753, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.5577005894485068, + "language_loss": 0.64470267, + "learning_rate": 3.872164591585956e-06, + "loss": 0.66630781, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.5927023887634277 + }, + { + "auxiliary_loss_clip": 0.01127734, + "auxiliary_loss_mlp": 0.01041986, + "balance_loss_clip": 1.03715491, + "balance_loss_mlp": 1.02260303, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.5093998618671898, + "language_loss": 0.74223036, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.76392758, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.6302475929260254 + }, + { + "auxiliary_loss_clip": 0.01128788, + "auxiliary_loss_mlp": 0.01041607, + "balance_loss_clip": 1.04437947, + "balance_loss_mlp": 1.02338028, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.0022847579643637, + "language_loss": 0.77360868, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79531264, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.578902244567871 + }, + { + "auxiliary_loss_clip": 0.01137976, + "auxiliary_loss_mlp": 0.01046094, + "balance_loss_clip": 1.04082251, + "balance_loss_mlp": 1.02846253, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.8074027299493833, + "language_loss": 0.76976413, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79160482, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 2.558438777923584 + }, + { + "auxiliary_loss_clip": 0.01118941, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.03861237, + "balance_loss_mlp": 1.01904225, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.6101718005572656, + "language_loss": 0.87071598, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89227307, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.593080997467041 + }, + { + "auxiliary_loss_clip": 0.01126657, + "auxiliary_loss_mlp": 0.01042797, + "balance_loss_clip": 1.04312253, + "balance_loss_mlp": 1.0248915, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.6520518279178507, + "language_loss": 0.88823885, + "learning_rate": 3.871478678011177e-06, + "loss": 0.90993339, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 2.647136688232422 + }, + { + "auxiliary_loss_clip": 0.01115945, + "auxiliary_loss_mlp": 0.0104482, + "balance_loss_clip": 1.04058886, + "balance_loss_mlp": 1.02558005, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.9049275231595257, + "language_loss": 0.81066805, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83227575, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.5492641925811768 + }, + { + "auxiliary_loss_clip": 0.01122165, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.03898907, + "balance_loss_mlp": 1.01929128, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 4.676756206382549, + "language_loss": 0.834782, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85637999, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 2.6355369091033936 + }, + { + "auxiliary_loss_clip": 0.0103422, + "auxiliary_loss_mlp": 0.01011778, + "balance_loss_clip": 1.01184177, + "balance_loss_mlp": 1.00872636, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.9186822696570385, + "language_loss": 0.6195091, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63996905, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 4.5988874435424805 + }, + { + "auxiliary_loss_clip": 0.01110611, + "auxiliary_loss_mlp": 0.0104132, + "balance_loss_clip": 1.04207349, + "balance_loss_mlp": 1.02336669, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.5770993137556588, + "language_loss": 0.87098718, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89250648, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.6068575382232666 + }, + { + "auxiliary_loss_clip": 0.01099561, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.03987598, + "balance_loss_mlp": 1.02146244, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.8770436750401167, + "language_loss": 0.74862528, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77002257, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 2.6255810260772705 + }, + { + "auxiliary_loss_clip": 0.01033497, + "auxiliary_loss_mlp": 0.01000493, + "balance_loss_clip": 1.01103115, + "balance_loss_mlp": 0.99775076, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6817208893957598, + "language_loss": 0.51792717, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53826708, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 3.050114631652832 + }, + { + "auxiliary_loss_clip": 0.01138915, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_clip": 1.04298615, + "balance_loss_mlp": 1.03038204, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 2.0753319644279236, + "language_loss": 0.70837843, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.73024571, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 4.121740341186523 + }, + { + "auxiliary_loss_clip": 0.01086308, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.03526425, + "balance_loss_mlp": 1.02830529, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8071083598961852, + "language_loss": 0.82363558, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84495819, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 4.1226866245269775 + }, + { + "auxiliary_loss_clip": 0.01122516, + "auxiliary_loss_mlp": 0.01045598, + "balance_loss_clip": 1.0422051, + "balance_loss_mlp": 1.0264647, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 2.038840128802481, + "language_loss": 0.71420598, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73588705, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.600403308868408 + }, + { + "auxiliary_loss_clip": 0.01088584, + "auxiliary_loss_mlp": 0.00751086, + "balance_loss_clip": 1.04053879, + "balance_loss_mlp": 1.00016141, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 2.0301438592854852, + "language_loss": 0.75775373, + "learning_rate": 3.870101529014526e-06, + "loss": 0.77615041, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 4.1536383628845215 + }, + { + "auxiliary_loss_clip": 0.01080749, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.03739488, + "balance_loss_mlp": 1.0224309, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.449780457248792, + "language_loss": 0.81546658, + "learning_rate": 3.869963423999178e-06, + "loss": 0.83669972, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 2.706463575363159 + }, + { + "auxiliary_loss_clip": 0.0112121, + "auxiliary_loss_mlp": 0.01045482, + "balance_loss_clip": 1.03912544, + "balance_loss_mlp": 1.02775574, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 1.878547571433384, + "language_loss": 0.74319595, + "learning_rate": 3.86982524807463e-06, + "loss": 0.7648629, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.7225570678710938 + }, + { + "auxiliary_loss_clip": 0.0112897, + "auxiliary_loss_mlp": 0.01041347, + "balance_loss_clip": 1.04358101, + "balance_loss_mlp": 1.0235604, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 1.6985459505667224, + "language_loss": 0.74104434, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76274753, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 2.7343945503234863 + }, + { + "auxiliary_loss_clip": 0.01094845, + "auxiliary_loss_mlp": 0.01045917, + "balance_loss_clip": 1.03559232, + "balance_loss_mlp": 1.02798796, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.6784169751204325, + "language_loss": 0.73109508, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75250262, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 2.70412540435791 + }, + { + "auxiliary_loss_clip": 0.01109366, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.03732789, + "balance_loss_mlp": 1.02301669, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 1.9982834330373285, + "language_loss": 0.90767962, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92916524, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 2.641066789627075 + }, + { + "auxiliary_loss_clip": 0.01094656, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.03561735, + "balance_loss_mlp": 1.02100587, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.7329861656005054, + "language_loss": 0.65469217, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67604285, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 2.645575523376465 + }, + { + "auxiliary_loss_clip": 0.01115034, + "auxiliary_loss_mlp": 0.01046906, + "balance_loss_clip": 1.04151118, + "balance_loss_mlp": 1.0282501, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.0135202634301788, + "language_loss": 0.80258548, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82420492, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 2.57625675201416 + }, + { + "auxiliary_loss_clip": 0.01108764, + "auxiliary_loss_mlp": 0.01049666, + "balance_loss_clip": 1.03976274, + "balance_loss_mlp": 1.03033018, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 1.74208809881599, + "language_loss": 0.81996888, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84155321, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.67995023727417 + }, + { + "auxiliary_loss_clip": 0.01093632, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.03975701, + "balance_loss_mlp": 1.02587128, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.5079824019804806, + "language_loss": 0.8741532, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89554369, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.6466867923736572 + }, + { + "auxiliary_loss_clip": 0.01107614, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.04493463, + "balance_loss_mlp": 1.02139533, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.6464143818008807, + "language_loss": 0.75668597, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77816176, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.7761874198913574 + }, + { + "auxiliary_loss_clip": 0.01123595, + "auxiliary_loss_mlp": 0.00750815, + "balance_loss_clip": 1.04017353, + "balance_loss_mlp": 1.00005603, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.9549208502699154, + "language_loss": 0.82943153, + "learning_rate": 3.868578474705109e-06, + "loss": 0.84817559, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 2.7043285369873047 + }, + { + "auxiliary_loss_clip": 0.01139937, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_clip": 1.04356682, + "balance_loss_mlp": 1.02705264, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 2.0867029778333888, + "language_loss": 0.82954884, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85139668, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 2.532104969024658 + }, + { + "auxiliary_loss_clip": 0.01140032, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.04491591, + "balance_loss_mlp": 1.02419245, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.071600103722922, + "language_loss": 0.84539604, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86721915, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.4915213584899902 + }, + { + "auxiliary_loss_clip": 0.01109718, + "auxiliary_loss_mlp": 0.01044813, + "balance_loss_clip": 1.0396657, + "balance_loss_mlp": 1.02844524, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.157520814160091, + "language_loss": 0.86065757, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88220286, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.5771942138671875 + }, + { + "auxiliary_loss_clip": 0.01128267, + "auxiliary_loss_mlp": 0.01048047, + "balance_loss_clip": 1.03986287, + "balance_loss_mlp": 1.02941442, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 2.1955307451772312, + "language_loss": 0.79445106, + "learning_rate": 3.868022510705977e-06, + "loss": 0.8162142, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.6139461994171143 + }, + { + "auxiliary_loss_clip": 0.01126286, + "auxiliary_loss_mlp": 0.01045009, + "balance_loss_clip": 1.04212523, + "balance_loss_mlp": 1.02772403, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.4697624476320827, + "language_loss": 0.76543713, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78715003, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.5827577114105225 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.04104996, + "balance_loss_mlp": 1.02410889, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.2144596279631052, + "language_loss": 0.93369472, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95536482, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.606645107269287 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.04102111, + "balance_loss_mlp": 1.02440751, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.8622889804919058, + "language_loss": 0.91590154, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93749523, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.60630464553833 + }, + { + "auxiliary_loss_clip": 0.01132469, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.04399109, + "balance_loss_mlp": 1.02482152, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.667331650180747, + "language_loss": 0.74046606, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76222134, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.5949056148529053 + }, + { + "auxiliary_loss_clip": 0.0110542, + "auxiliary_loss_mlp": 0.01047703, + "balance_loss_clip": 1.03999877, + "balance_loss_mlp": 1.02852178, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 2.000791651090477, + "language_loss": 0.78886211, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81039333, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 2.656226873397827 + }, + { + "auxiliary_loss_clip": 0.01090521, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.04062128, + "balance_loss_mlp": 1.03075957, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.1646329785004403, + "language_loss": 0.88321769, + "learning_rate": 3.867186439744955e-06, + "loss": 0.9046123, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.598411798477173 + }, + { + "auxiliary_loss_clip": 0.01106612, + "auxiliary_loss_mlp": 0.01044212, + "balance_loss_clip": 1.04022002, + "balance_loss_mlp": 1.02643752, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.096170418571847, + "language_loss": 0.77055347, + "learning_rate": 3.867046846740299e-06, + "loss": 0.79206169, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.6138556003570557 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01046367, + "balance_loss_clip": 1.03831828, + "balance_loss_mlp": 1.02868772, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 1.9340044507505956, + "language_loss": 0.76988065, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79134566, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.7309207916259766 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.04038501, + "balance_loss_mlp": 1.02537179, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.494262270212942, + "language_loss": 0.87840706, + "learning_rate": 3.866767448340471e-06, + "loss": 0.89994717, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.6118087768554688 + }, + { + "auxiliary_loss_clip": 0.01132171, + "auxiliary_loss_mlp": 0.01045821, + "balance_loss_clip": 1.04315281, + "balance_loss_mlp": 1.02615142, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 2.1543776837200066, + "language_loss": 0.80059135, + "learning_rate": 3.866627642955895e-06, + "loss": 0.82237124, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.583620071411133 + }, + { + "auxiliary_loss_clip": 0.01121849, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_clip": 1.03863204, + "balance_loss_mlp": 1.02670097, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.72932678028595, + "language_loss": 0.74843192, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77009308, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.606034755706787 + }, + { + "auxiliary_loss_clip": 0.01141032, + "auxiliary_loss_mlp": 0.01040525, + "balance_loss_clip": 1.04463506, + "balance_loss_mlp": 1.02267933, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.4044502357464634, + "language_loss": 0.78447241, + "learning_rate": 3.866347819843925e-06, + "loss": 0.806288, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 2.5562820434570312 + }, + { + "auxiliary_loss_clip": 0.01112892, + "auxiliary_loss_mlp": 0.0104847, + "balance_loss_clip": 1.04150474, + "balance_loss_mlp": 1.02975404, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 3.3260373143563027, + "language_loss": 0.81812727, + "learning_rate": 3.866207802127143e-06, + "loss": 0.83974087, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 2.597539186477661 + }, + { + "auxiliary_loss_clip": 0.01123485, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.04187274, + "balance_loss_mlp": 1.02229548, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 1.9935244210909768, + "language_loss": 0.82118714, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84282315, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 2.573057174682617 + }, + { + "auxiliary_loss_clip": 0.01118908, + "auxiliary_loss_mlp": 0.01049223, + "balance_loss_clip": 1.04041064, + "balance_loss_mlp": 1.02918363, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.3979846672993244, + "language_loss": 0.83768076, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85936207, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.573383092880249 + }, + { + "auxiliary_loss_clip": 0.01130755, + "auxiliary_loss_mlp": 0.0104614, + "balance_loss_clip": 1.04700971, + "balance_loss_mlp": 1.02810359, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 1.8150980053768242, + "language_loss": 0.75221384, + "learning_rate": 3.865787324397324e-06, + "loss": 0.77398276, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 2.64460825920105 + }, + { + "auxiliary_loss_clip": 0.01013339, + "auxiliary_loss_mlp": 0.01065643, + "balance_loss_clip": 1.01209879, + "balance_loss_mlp": 1.06225705, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8958905418935771, + "language_loss": 0.61827219, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63906199, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 4.490846395492554 + }, + { + "auxiliary_loss_clip": 0.01132272, + "auxiliary_loss_mlp": 0.01049312, + "balance_loss_clip": 1.04218602, + "balance_loss_mlp": 1.02871299, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 3.9170151300300105, + "language_loss": 0.77120227, + "learning_rate": 3.865506652147709e-06, + "loss": 0.7930181, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.5735652446746826 + }, + { + "auxiliary_loss_clip": 0.01141909, + "auxiliary_loss_mlp": 0.010468, + "balance_loss_clip": 1.04422522, + "balance_loss_mlp": 1.02887058, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 2.0595380436890607, + "language_loss": 0.77048147, + "learning_rate": 3.865366209909941e-06, + "loss": 0.79236853, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 2.5598738193511963 + }, + { + "auxiliary_loss_clip": 0.01139686, + "auxiliary_loss_mlp": 0.01050767, + "balance_loss_clip": 1.04180527, + "balance_loss_mlp": 1.03215837, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.547409831786257, + "language_loss": 0.86124963, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88315415, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 2.6859211921691895 + }, + { + "auxiliary_loss_clip": 0.01094532, + "auxiliary_loss_mlp": 0.01048841, + "balance_loss_clip": 1.03760469, + "balance_loss_mlp": 1.02886128, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5324726056989788, + "language_loss": 0.83049834, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85193205, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 2.6106033325195312 + }, + { + "auxiliary_loss_clip": 0.01099424, + "auxiliary_loss_mlp": 0.00750866, + "balance_loss_clip": 1.0377624, + "balance_loss_mlp": 0.99996841, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.1487665633594633, + "language_loss": 0.82600164, + "learning_rate": 3.864944458808712e-06, + "loss": 0.84450454, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 5.638302803039551 + }, + { + "auxiliary_loss_clip": 0.0114524, + "auxiliary_loss_mlp": 0.01049091, + "balance_loss_clip": 1.04565132, + "balance_loss_mlp": 1.03001797, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.702810913374328, + "language_loss": 0.7984674, + "learning_rate": 3.86480373366343e-06, + "loss": 0.82041073, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.5044796466827393 + }, + { + "auxiliary_loss_clip": 0.01128261, + "auxiliary_loss_mlp": 0.01051984, + "balance_loss_clip": 1.0433017, + "balance_loss_mlp": 1.03392375, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.5301408288267946, + "language_loss": 0.64560241, + "learning_rate": 3.864662937804603e-06, + "loss": 0.66740489, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.645247459411621 + }, + { + "auxiliary_loss_clip": 0.01105598, + "auxiliary_loss_mlp": 0.01045265, + "balance_loss_clip": 1.03922439, + "balance_loss_mlp": 1.02547657, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.5334887799867756, + "language_loss": 0.81860983, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84011853, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.626673936843872 + }, + { + "auxiliary_loss_clip": 0.01126295, + "auxiliary_loss_mlp": 0.01050067, + "balance_loss_clip": 1.04696703, + "balance_loss_mlp": 1.02962208, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.5992608596658866, + "language_loss": 0.74534792, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76711154, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 4.137527227401733 + }, + { + "auxiliary_loss_clip": 0.01112553, + "auxiliary_loss_mlp": 0.01047491, + "balance_loss_clip": 1.04193258, + "balance_loss_mlp": 1.02882254, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.9410405792107206, + "language_loss": 0.80867481, + "learning_rate": 3.86424012600026e-06, + "loss": 0.8302753, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 2.6018471717834473 + }, + { + "auxiliary_loss_clip": 0.01092659, + "auxiliary_loss_mlp": 0.01048186, + "balance_loss_clip": 1.03695416, + "balance_loss_mlp": 1.02869534, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.052624802046717, + "language_loss": 0.84166324, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86307168, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 2.5925216674804688 + }, + { + "auxiliary_loss_clip": 0.01100931, + "auxiliary_loss_mlp": 0.00751161, + "balance_loss_clip": 1.03848398, + "balance_loss_mlp": 0.99991071, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 1.7236684756075253, + "language_loss": 0.70445716, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72297806, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.6671957969665527 + }, + { + "auxiliary_loss_clip": 0.01113176, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_clip": 1.03981328, + "balance_loss_mlp": 1.02797198, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.5446852580899892, + "language_loss": 0.73249972, + "learning_rate": 3.863816677966381e-06, + "loss": 0.754094, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.6170966625213623 + }, + { + "auxiliary_loss_clip": 0.01077783, + "auxiliary_loss_mlp": 0.01048907, + "balance_loss_clip": 1.03562164, + "balance_loss_mlp": 1.02941656, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 3.1478018039505327, + "language_loss": 0.72380871, + "learning_rate": 3.863675387262386e-06, + "loss": 0.74507558, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 2.6248903274536133 + }, + { + "auxiliary_loss_clip": 0.01131486, + "auxiliary_loss_mlp": 0.01055861, + "balance_loss_clip": 1.04379344, + "balance_loss_mlp": 1.03532147, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 2.3187984306371687, + "language_loss": 0.75790185, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77977538, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.6044764518737793 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.04083419, + "balance_loss_mlp": 1.02574956, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.5423653785101168, + "language_loss": 0.7955699, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81737459, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.5601532459259033 + }, + { + "auxiliary_loss_clip": 0.01130983, + "auxiliary_loss_mlp": 0.01050233, + "balance_loss_clip": 1.04523039, + "balance_loss_mlp": 1.02994382, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 1.9812243725181742, + "language_loss": 0.82579005, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84760225, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.6706128120422363 + }, + { + "auxiliary_loss_clip": 0.01082846, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_clip": 1.03548861, + "balance_loss_mlp": 1.03616452, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 2.1096014902024587, + "language_loss": 0.74962443, + "learning_rate": 3.863109517792446e-06, + "loss": 0.77101851, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.8026387691497803 + }, + { + "auxiliary_loss_clip": 0.01140084, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.04372406, + "balance_loss_mlp": 1.02440786, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 1.9621159330254152, + "language_loss": 0.81383848, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.8356632, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 2.5251970291137695 + }, + { + "auxiliary_loss_clip": 0.01113808, + "auxiliary_loss_mlp": 0.0105473, + "balance_loss_clip": 1.04280496, + "balance_loss_mlp": 1.03581095, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 2.5959619890022574, + "language_loss": 0.69938684, + "learning_rate": 3.862826159140214e-06, + "loss": 0.7210722, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.8236947059631348 + }, + { + "auxiliary_loss_clip": 0.01128506, + "auxiliary_loss_mlp": 0.01051602, + "balance_loss_clip": 1.0470562, + "balance_loss_mlp": 1.03248048, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 1.8536534624599703, + "language_loss": 0.76710945, + "learning_rate": 3.862684373853579e-06, + "loss": 0.78891051, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.6634249687194824 + }, + { + "auxiliary_loss_clip": 0.01041101, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.01610661, + "balance_loss_mlp": 1.0227946, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9149069591443031, + "language_loss": 0.58868903, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60935503, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 3.04561185836792 + }, + { + "auxiliary_loss_clip": 0.0102785, + "auxiliary_loss_mlp": 0.01017366, + "balance_loss_clip": 1.01529098, + "balance_loss_mlp": 1.0146836, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8611687248449118, + "language_loss": 0.62234855, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64280075, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.1687378883361816 + }, + { + "auxiliary_loss_clip": 0.01122153, + "auxiliary_loss_mlp": 0.01048549, + "balance_loss_clip": 1.04063511, + "balance_loss_mlp": 1.02952337, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.9354727627666226, + "language_loss": 0.71869683, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74040389, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.563159227371216 + }, + { + "auxiliary_loss_clip": 0.01014816, + "auxiliary_loss_mlp": 0.01004051, + "balance_loss_clip": 1.01270461, + "balance_loss_mlp": 1.00139225, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.7033274120486392, + "language_loss": 0.60450923, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62469792, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.2112298011779785 + }, + { + "auxiliary_loss_clip": 0.01144587, + "auxiliary_loss_mlp": 0.01067134, + "balance_loss_clip": 1.04427886, + "balance_loss_mlp": 1.04850125, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 3.143718325893165, + "language_loss": 0.79036629, + "learning_rate": 3.861974388030356e-06, + "loss": 0.81248355, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 2.6236042976379395 + }, + { + "auxiliary_loss_clip": 0.01091583, + "auxiliary_loss_mlp": 0.01080147, + "balance_loss_clip": 1.04062641, + "balance_loss_mlp": 1.06239653, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.837335724158771, + "language_loss": 0.71833038, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74004763, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.674039840698242 + }, + { + "auxiliary_loss_clip": 0.01114376, + "auxiliary_loss_mlp": 0.01076239, + "balance_loss_clip": 1.04210222, + "balance_loss_mlp": 1.05668831, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.912865656722861, + "language_loss": 0.90106058, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92296672, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.609821319580078 + }, + { + "auxiliary_loss_clip": 0.01130491, + "auxiliary_loss_mlp": 0.01086217, + "balance_loss_clip": 1.04318202, + "balance_loss_mlp": 1.06859803, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 1.8221269903942017, + "language_loss": 0.82629859, + "learning_rate": 3.861547549218276e-06, + "loss": 0.84846568, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.567640781402588 + }, + { + "auxiliary_loss_clip": 0.010761, + "auxiliary_loss_mlp": 0.01095708, + "balance_loss_clip": 1.03708541, + "balance_loss_mlp": 1.0762651, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.5017250769824946, + "language_loss": 0.81582004, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83753818, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.694150686264038 + }, + { + "auxiliary_loss_clip": 0.01032985, + "auxiliary_loss_mlp": 0.00748966, + "balance_loss_clip": 1.03309608, + "balance_loss_mlp": 0.99997789, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9159121270327584, + "language_loss": 0.63360065, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65142018, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.235109806060791 + }, + { + "auxiliary_loss_clip": 0.01094387, + "auxiliary_loss_mlp": 0.00750913, + "balance_loss_clip": 1.04823518, + "balance_loss_mlp": 0.99999583, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.5804201878351982, + "language_loss": 0.82720608, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84565914, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.8008129596710205 + }, + { + "auxiliary_loss_clip": 0.01122338, + "auxiliary_loss_mlp": 0.0106164, + "balance_loss_clip": 1.04660988, + "balance_loss_mlp": 1.04329407, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.254630796367846, + "language_loss": 0.78816593, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81000578, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.6344048976898193 + }, + { + "auxiliary_loss_clip": 0.01133931, + "auxiliary_loss_mlp": 0.01057222, + "balance_loss_clip": 1.04592776, + "balance_loss_mlp": 1.03887594, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.1324792218941893, + "language_loss": 0.83111525, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85302681, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 2.647409439086914 + }, + { + "auxiliary_loss_clip": 0.01140974, + "auxiliary_loss_mlp": 0.01053427, + "balance_loss_clip": 1.04575038, + "balance_loss_mlp": 1.0354743, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.8553466369983826, + "language_loss": 0.87541223, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89735615, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 2.5942258834838867 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01053739, + "balance_loss_clip": 1.0390619, + "balance_loss_mlp": 1.03248382, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 2.365083578905265, + "language_loss": 0.67065167, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69218546, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.6734399795532227 + }, + { + "auxiliary_loss_clip": 0.01127505, + "auxiliary_loss_mlp": 0.01044881, + "balance_loss_clip": 1.04186189, + "balance_loss_mlp": 1.02711844, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.8551924250390444, + "language_loss": 0.83248955, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85421342, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 2.5968782901763916 + }, + { + "auxiliary_loss_clip": 0.01092749, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.0359447, + "balance_loss_mlp": 1.02739906, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.6334228187383855, + "language_loss": 0.78863388, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81000781, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 2.6768810749053955 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.04685259, + "balance_loss_mlp": 1.02572715, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.7868902717500887, + "language_loss": 0.83320856, + "learning_rate": 3.860120165643504e-06, + "loss": 0.8550837, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 2.5344769954681396 + }, + { + "auxiliary_loss_clip": 0.01138705, + "auxiliary_loss_mlp": 0.01051352, + "balance_loss_clip": 1.04687774, + "balance_loss_mlp": 1.03150368, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.8397702259665498, + "language_loss": 0.78695309, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80885369, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 4.321853160858154 + }, + { + "auxiliary_loss_clip": 0.01139245, + "auxiliary_loss_mlp": 0.00751158, + "balance_loss_clip": 1.04271793, + "balance_loss_mlp": 0.9999913, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 2.0542794733028673, + "language_loss": 0.79917419, + "learning_rate": 3.859833842323822e-06, + "loss": 0.81807828, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 2.633399724960327 + }, + { + "auxiliary_loss_clip": 0.0110375, + "auxiliary_loss_mlp": 0.01048834, + "balance_loss_clip": 1.04658973, + "balance_loss_mlp": 1.02852023, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.9857350691962738, + "language_loss": 0.78346813, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80499393, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 2.6768295764923096 + }, + { + "auxiliary_loss_clip": 0.01016885, + "auxiliary_loss_mlp": 0.01028726, + "balance_loss_clip": 1.01398349, + "balance_loss_mlp": 1.02629459, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8497926017675772, + "language_loss": 0.58395702, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60441315, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 3.181389808654785 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_clip": 1.0416615, + "balance_loss_mlp": 1.02511477, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.0759790879053024, + "language_loss": 0.88239646, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90417105, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 2.518305778503418 + }, + { + "auxiliary_loss_clip": 0.01130853, + "auxiliary_loss_mlp": 0.00750905, + "balance_loss_clip": 1.04390574, + "balance_loss_mlp": 0.99994993, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 4.632266755767882, + "language_loss": 0.7438516, + "learning_rate": 3.85926034942691e-06, + "loss": 0.76266921, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 5.697275400161743 + }, + { + "auxiliary_loss_clip": 0.01141328, + "auxiliary_loss_mlp": 0.01053325, + "balance_loss_clip": 1.04318047, + "balance_loss_mlp": 1.03271413, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.7300649355512707, + "language_loss": 0.73189855, + "learning_rate": 3.859116799930736e-06, + "loss": 0.7538451, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 2.5855443477630615 + }, + { + "auxiliary_loss_clip": 0.01130759, + "auxiliary_loss_mlp": 0.01045211, + "balance_loss_clip": 1.04656243, + "balance_loss_mlp": 1.02751994, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.9135835446275833, + "language_loss": 0.74842817, + "learning_rate": 3.858973179936668e-06, + "loss": 0.77018785, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 2.6044392585754395 + }, + { + "auxiliary_loss_clip": 0.01128085, + "auxiliary_loss_mlp": 0.01050226, + "balance_loss_clip": 1.04347861, + "balance_loss_mlp": 1.0313549, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.134922867673692, + "language_loss": 0.74173981, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76352298, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 2.7451086044311523 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.04320419, + "balance_loss_mlp": 1.0280025, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6829349727702598, + "language_loss": 0.82837069, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85019857, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 4.134527921676636 + }, + { + "auxiliary_loss_clip": 0.01132666, + "auxiliary_loss_mlp": 0.01055954, + "balance_loss_clip": 1.04559135, + "balance_loss_mlp": 1.03512859, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 2.2547136626012128, + "language_loss": 0.72198045, + "learning_rate": 3.858541897021563e-06, + "loss": 0.74386662, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.5635812282562256 + }, + { + "auxiliary_loss_clip": 0.01108192, + "auxiliary_loss_mlp": 0.01045799, + "balance_loss_clip": 1.04269814, + "balance_loss_mlp": 1.02649915, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 2.7133779230309703, + "language_loss": 0.810992, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83253193, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 2.599454641342163 + }, + { + "auxiliary_loss_clip": 0.01118447, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.0412364, + "balance_loss_mlp": 1.03006268, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 1.6446293837219221, + "language_loss": 0.82983005, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85152733, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.575486183166504 + }, + { + "auxiliary_loss_clip": 0.01114062, + "auxiliary_loss_mlp": 0.01055336, + "balance_loss_clip": 1.04417884, + "balance_loss_mlp": 1.03611922, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.8276560604827856, + "language_loss": 0.70961511, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73130906, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.6169545650482178 + }, + { + "auxiliary_loss_clip": 0.01049781, + "auxiliary_loss_mlp": 0.01003295, + "balance_loss_clip": 1.01743388, + "balance_loss_mlp": 1.00083959, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8252929724040454, + "language_loss": 0.63080275, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65133345, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 2.983109951019287 + }, + { + "auxiliary_loss_clip": 0.0109439, + "auxiliary_loss_mlp": 0.01042144, + "balance_loss_clip": 1.04150653, + "balance_loss_mlp": 1.02233112, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.7231240975155182, + "language_loss": 0.74761951, + "learning_rate": 3.857821682713975e-06, + "loss": 0.76898485, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.7187633514404297 + }, + { + "auxiliary_loss_clip": 0.01140876, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.04404891, + "balance_loss_mlp": 1.02065408, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.2710737862175208, + "language_loss": 0.8489641, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87075818, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.5892670154571533 + }, + { + "auxiliary_loss_clip": 0.01043867, + "auxiliary_loss_mlp": 0.01006258, + "balance_loss_clip": 1.01199126, + "balance_loss_mlp": 1.00364733, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7916035167134594, + "language_loss": 0.56859112, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58909237, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 3.038787603378296 + }, + { + "auxiliary_loss_clip": 0.0110526, + "auxiliary_loss_mlp": 0.01043471, + "balance_loss_clip": 1.03889394, + "balance_loss_mlp": 1.02402759, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 2.0247677486525117, + "language_loss": 0.85116112, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87264836, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 2.6368653774261475 + }, + { + "auxiliary_loss_clip": 0.01128745, + "auxiliary_loss_mlp": 0.0105134, + "balance_loss_clip": 1.041435, + "balance_loss_mlp": 1.03116989, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 1.9829229720150772, + "language_loss": 0.75101078, + "learning_rate": 3.857244243157052e-06, + "loss": 0.77281159, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.5247721672058105 + }, + { + "auxiliary_loss_clip": 0.01106411, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.03863573, + "balance_loss_mlp": 1.01958203, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6589459658464183, + "language_loss": 0.82322204, + "learning_rate": 3.85709970718691e-06, + "loss": 0.84465462, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.6734046936035156 + }, + { + "auxiliary_loss_clip": 0.01074644, + "auxiliary_loss_mlp": 0.01039536, + "balance_loss_clip": 1.04364419, + "balance_loss_mlp": 1.02157092, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.619296795545299, + "language_loss": 0.743132, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76427388, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.748039722442627 + }, + { + "auxiliary_loss_clip": 0.0111881, + "auxiliary_loss_mlp": 0.01053028, + "balance_loss_clip": 1.04124248, + "balance_loss_mlp": 1.03334618, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.0838924345235372, + "language_loss": 0.75648934, + "learning_rate": 3.856810423987889e-06, + "loss": 0.77820772, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 2.578416585922241 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01041255, + "balance_loss_clip": 1.04102397, + "balance_loss_mlp": 1.02276564, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.853046510238175, + "language_loss": 0.83273679, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85434985, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 2.591193199157715 + }, + { + "auxiliary_loss_clip": 0.0110906, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.04453731, + "balance_loss_mlp": 1.03308606, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.098339455976109, + "language_loss": 0.83651263, + "learning_rate": 3.85652085914712e-06, + "loss": 0.85811931, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 2.7964203357696533 + }, + { + "auxiliary_loss_clip": 0.01126902, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.04306054, + "balance_loss_mlp": 1.02226591, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.7813696084557429, + "language_loss": 0.84247255, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86415398, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 2.604637622833252 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01042896, + "balance_loss_clip": 1.04575491, + "balance_loss_mlp": 1.02434695, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.932009837007164, + "language_loss": 0.74977583, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77150798, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.5822699069976807 + }, + { + "auxiliary_loss_clip": 0.01095593, + "auxiliary_loss_mlp": 0.01048728, + "balance_loss_clip": 1.04327726, + "balance_loss_mlp": 1.02743769, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 1.9531887841723696, + "language_loss": 0.83467686, + "learning_rate": 3.856085983903782e-06, + "loss": 0.85612005, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.7272372245788574 + }, + { + "auxiliary_loss_clip": 0.0110266, + "auxiliary_loss_mlp": 0.01040406, + "balance_loss_clip": 1.03713918, + "balance_loss_mlp": 1.022048, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.175813158614774, + "language_loss": 0.75603652, + "learning_rate": 3.855940884716071e-06, + "loss": 0.77746713, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.620339870452881 + }, + { + "auxiliary_loss_clip": 0.01114307, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.04455662, + "balance_loss_mlp": 1.02639318, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6519729340652813, + "language_loss": 0.81233346, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83394581, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.713921070098877 + }, + { + "auxiliary_loss_clip": 0.0113255, + "auxiliary_loss_mlp": 0.01048699, + "balance_loss_clip": 1.04253531, + "balance_loss_mlp": 1.02837396, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.0499450922195126, + "language_loss": 0.66096228, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68277478, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.5510754585266113 + }, + { + "auxiliary_loss_clip": 0.01117028, + "auxiliary_loss_mlp": 0.01045168, + "balance_loss_clip": 1.04427946, + "balance_loss_mlp": 1.02573669, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.714843259411667, + "language_loss": 0.67357802, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69519997, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 2.908027410507202 + }, + { + "auxiliary_loss_clip": 0.01131195, + "auxiliary_loss_mlp": 0.01051948, + "balance_loss_clip": 1.04231453, + "balance_loss_mlp": 1.03184962, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 1.689319847454231, + "language_loss": 0.76533389, + "learning_rate": 3.855359784245646e-06, + "loss": 0.7871654, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.541433334350586 + }, + { + "auxiliary_loss_clip": 0.01107548, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.03700471, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 2.8412410707751676, + "language_loss": 0.79343235, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81507057, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 2.6348609924316406 + }, + { + "auxiliary_loss_clip": 0.01149859, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_clip": 1.04764056, + "balance_loss_mlp": 1.02602649, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 1.6465081510168336, + "language_loss": 0.76117778, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78313714, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 2.544355630874634 + }, + { + "auxiliary_loss_clip": 0.00990693, + "auxiliary_loss_mlp": 0.0100646, + "balance_loss_clip": 1.0122602, + "balance_loss_mlp": 1.00398052, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.7819335899224595, + "language_loss": 0.60061592, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62058747, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.392857074737549 + }, + { + "auxiliary_loss_clip": 0.01118071, + "auxiliary_loss_mlp": 0.01043049, + "balance_loss_clip": 1.04125178, + "balance_loss_mlp": 1.02331972, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.6746169761761958, + "language_loss": 0.87751615, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89912736, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 3.1750729084014893 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.04043698, + "balance_loss_mlp": 1.02673769, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 1.932597542865697, + "language_loss": 0.76028979, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78183079, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 2.6712472438812256 + }, + { + "auxiliary_loss_clip": 0.01109107, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.04159009, + "balance_loss_mlp": 1.02624655, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.862307842125037, + "language_loss": 0.76016188, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78171813, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.6620261669158936 + }, + { + "auxiliary_loss_clip": 0.01138349, + "auxiliary_loss_mlp": 0.01046582, + "balance_loss_clip": 1.04314971, + "balance_loss_mlp": 1.0270201, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.9575369929884834, + "language_loss": 0.72379673, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.745646, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 4.108793497085571 + }, + { + "auxiliary_loss_clip": 0.011077, + "auxiliary_loss_mlp": 0.01047047, + "balance_loss_clip": 1.03888464, + "balance_loss_mlp": 1.02611399, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 1.9381481280898374, + "language_loss": 0.89511907, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91666651, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 2.615057945251465 + }, + { + "auxiliary_loss_clip": 0.0110065, + "auxiliary_loss_mlp": 0.01047903, + "balance_loss_clip": 1.04011869, + "balance_loss_mlp": 1.02689791, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.366511549326905, + "language_loss": 0.80249941, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82398498, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 2.6449615955352783 + }, + { + "auxiliary_loss_clip": 0.01136822, + "auxiliary_loss_mlp": 0.01049863, + "balance_loss_clip": 1.04435408, + "balance_loss_mlp": 1.03022933, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.9223247963189576, + "language_loss": 0.77177656, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79364341, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01109233, + "auxiliary_loss_mlp": 0.01054428, + "balance_loss_clip": 1.04589128, + "balance_loss_mlp": 1.03326869, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8998490479298171, + "language_loss": 0.82518852, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84682512, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 2.6745355129241943 + }, + { + "auxiliary_loss_clip": 0.01082443, + "auxiliary_loss_mlp": 0.01046234, + "balance_loss_clip": 1.04140568, + "balance_loss_mlp": 1.02755344, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9718789080606458, + "language_loss": 0.80383551, + "learning_rate": 3.85360973012719e-06, + "loss": 0.8251223, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 4.18423318862915 + }, + { + "auxiliary_loss_clip": 0.0112433, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.04240215, + "balance_loss_mlp": 1.02639472, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.5290142691237192, + "language_loss": 0.77753806, + "learning_rate": 3.853463435273058e-06, + "loss": 0.79922754, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 2.645042657852173 + }, + { + "auxiliary_loss_clip": 0.01023994, + "auxiliary_loss_mlp": 0.01016239, + "balance_loss_clip": 1.01364422, + "balance_loss_mlp": 1.01338983, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8012966107920765, + "language_loss": 0.60161859, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62202096, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 4.767845869064331 + }, + { + "auxiliary_loss_clip": 0.0108127, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.03854012, + "balance_loss_mlp": 1.02382755, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.1171159221010605, + "language_loss": 0.70862055, + "learning_rate": 3.853170634719787e-06, + "loss": 0.72986132, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 2.7042057514190674 + }, + { + "auxiliary_loss_clip": 0.01118646, + "auxiliary_loss_mlp": 0.01046526, + "balance_loss_clip": 1.04207504, + "balance_loss_mlp": 1.02690387, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.599011996264243, + "language_loss": 0.80770922, + "learning_rate": 3.853024129031751e-06, + "loss": 0.8293609, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 2.62339448928833 + }, + { + "auxiliary_loss_clip": 0.01116234, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.04423404, + "balance_loss_mlp": 1.02479386, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 11.031568330065182, + "language_loss": 0.84708828, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86869025, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 4.177184820175171 + }, + { + "auxiliary_loss_clip": 0.01128395, + "auxiliary_loss_mlp": 0.01056547, + "balance_loss_clip": 1.04440165, + "balance_loss_mlp": 1.03511286, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 2.163324951312862, + "language_loss": 0.77497756, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79682696, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.558465003967285 + }, + { + "auxiliary_loss_clip": 0.01112358, + "auxiliary_loss_mlp": 0.01038888, + "balance_loss_clip": 1.04157972, + "balance_loss_mlp": 1.01882517, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.7164704574085166, + "language_loss": 0.78836977, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80988216, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.6910288333892822 + }, + { + "auxiliary_loss_clip": 0.01126183, + "auxiliary_loss_mlp": 0.00750723, + "balance_loss_clip": 1.04230618, + "balance_loss_mlp": 0.99987638, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.530836371346313, + "language_loss": 0.7068342, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72560328, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.6233906745910645 + }, + { + "auxiliary_loss_clip": 0.01117855, + "auxiliary_loss_mlp": 0.0075114, + "balance_loss_clip": 1.04128051, + "balance_loss_mlp": 0.99994349, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 4.781127128991627, + "language_loss": 0.8516562, + "learning_rate": 3.852290546699863e-06, + "loss": 0.87034619, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.668621063232422 + }, + { + "auxiliary_loss_clip": 0.01119389, + "auxiliary_loss_mlp": 0.01047865, + "balance_loss_clip": 1.04283988, + "balance_loss_mlp": 1.02746844, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 1.94836143954265, + "language_loss": 0.84829921, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.86997175, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.57773756980896 + }, + { + "auxiliary_loss_clip": 0.0112477, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.04167593, + "balance_loss_mlp": 1.02512109, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.0761670565369914, + "language_loss": 0.74119747, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76286149, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 2.5472943782806396 + }, + { + "auxiliary_loss_clip": 0.01129672, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_clip": 1.04342937, + "balance_loss_mlp": 1.02481139, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.243624613845159, + "language_loss": 0.71753573, + "learning_rate": 3.8518495543877e-06, + "loss": 0.73926747, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.704087257385254 + }, + { + "auxiliary_loss_clip": 0.01117627, + "auxiliary_loss_mlp": 0.01045343, + "balance_loss_clip": 1.04451525, + "balance_loss_mlp": 1.02642465, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.677386837113887, + "language_loss": 0.70388639, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72551608, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 2.6675429344177246 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01050166, + "balance_loss_clip": 1.03987741, + "balance_loss_mlp": 1.03116429, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 3.1789871738596944, + "language_loss": 0.81581533, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.8374458, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.540471315383911 + }, + { + "auxiliary_loss_clip": 0.01099488, + "auxiliary_loss_mlp": 0.01055986, + "balance_loss_clip": 1.04296041, + "balance_loss_mlp": 1.03719854, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 5.093531443174308, + "language_loss": 0.80098319, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82253796, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 2.739896774291992 + }, + { + "auxiliary_loss_clip": 0.0111731, + "auxiliary_loss_mlp": 0.01054007, + "balance_loss_clip": 1.0392983, + "balance_loss_mlp": 1.03381336, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 1.840975542070631, + "language_loss": 0.90634799, + "learning_rate": 3.851260581551727e-06, + "loss": 0.92806113, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.597886323928833 + }, + { + "auxiliary_loss_clip": 0.01128725, + "auxiliary_loss_mlp": 0.01055888, + "balance_loss_clip": 1.04316568, + "balance_loss_mlp": 1.03679061, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 4.495382198315695, + "language_loss": 0.78589338, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8077395, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.5404114723205566 + }, + { + "auxiliary_loss_clip": 0.01128382, + "auxiliary_loss_mlp": 0.0104922, + "balance_loss_clip": 1.04156375, + "balance_loss_mlp": 1.02978897, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 1.628306021383216, + "language_loss": 0.79997313, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82174915, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 2.577725410461426 + }, + { + "auxiliary_loss_clip": 0.01112895, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.04023242, + "balance_loss_mlp": 1.02415717, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.8871628427230984, + "language_loss": 0.66077363, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68233931, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 2.988060474395752 + }, + { + "auxiliary_loss_clip": 0.01035639, + "auxiliary_loss_mlp": 0.01008168, + "balance_loss_clip": 1.01302695, + "balance_loss_mlp": 1.00562894, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.8854168447425751, + "language_loss": 0.59458482, + "learning_rate": 3.850670485516019e-06, + "loss": 0.6150229, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.1324245929718018 + }, + { + "auxiliary_loss_clip": 0.01140623, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.04169309, + "balance_loss_mlp": 1.02912831, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 1.9575653407537825, + "language_loss": 0.6550287, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67692864, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 2.5249860286712646 + }, + { + "auxiliary_loss_clip": 0.01095369, + "auxiliary_loss_mlp": 0.01047727, + "balance_loss_clip": 1.0387671, + "balance_loss_mlp": 1.02759254, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4037964830835425, + "language_loss": 0.75203133, + "learning_rate": 3.850375016410121e-06, + "loss": 0.7734623, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 2.6712729930877686 + }, + { + "auxiliary_loss_clip": 0.01111669, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.04453301, + "balance_loss_mlp": 1.0224787, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.2912403580907053, + "language_loss": 0.72313535, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74468094, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.5892586708068848 + }, + { + "auxiliary_loss_clip": 0.0110829, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.03949428, + "balance_loss_mlp": 1.02776098, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 2.427518127840639, + "language_loss": 0.71786761, + "learning_rate": 3.850079266638601e-06, + "loss": 0.73942477, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.697291374206543 + }, + { + "auxiliary_loss_clip": 0.01103746, + "auxiliary_loss_mlp": 0.01052699, + "balance_loss_clip": 1.03904462, + "balance_loss_mlp": 1.03304183, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 1.8510497536877373, + "language_loss": 0.65467978, + "learning_rate": 3.849931286517249e-06, + "loss": 0.6762442, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.770895004272461 + }, + { + "auxiliary_loss_clip": 0.01115376, + "auxiliary_loss_mlp": 0.01053605, + "balance_loss_clip": 1.03994608, + "balance_loss_mlp": 1.03302956, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.0120744346144, + "language_loss": 0.83203101, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85372078, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.586024522781372 + }, + { + "auxiliary_loss_clip": 0.01099452, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_clip": 1.03792679, + "balance_loss_mlp": 1.02688158, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.1994133914193905, + "language_loss": 0.7743271, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79577249, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.652240037918091 + }, + { + "auxiliary_loss_clip": 0.01136181, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.04164696, + "balance_loss_mlp": 1.02520347, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.0395391052207166, + "language_loss": 0.8526988, + "learning_rate": 3.849486925278176e-06, + "loss": 0.8744905, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 2.5707132816314697 + }, + { + "auxiliary_loss_clip": 0.01122937, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.04022431, + "balance_loss_mlp": 1.02565539, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6483687727465453, + "language_loss": 0.8309586, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85262698, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 2.582059383392334 + }, + { + "auxiliary_loss_clip": 0.01095703, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.03764677, + "balance_loss_mlp": 1.02197719, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 1.9210915470613992, + "language_loss": 0.7626276, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78398848, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.643407106399536 + }, + { + "auxiliary_loss_clip": 0.01141461, + "auxiliary_loss_mlp": 0.01039631, + "balance_loss_clip": 1.04287004, + "balance_loss_mlp": 1.02111816, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 2.7775956624238014, + "language_loss": 0.76045531, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78226626, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 2.544254779815674 + }, + { + "auxiliary_loss_clip": 0.01121104, + "auxiliary_loss_mlp": 0.01044976, + "balance_loss_clip": 1.03793073, + "balance_loss_mlp": 1.02738035, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 1.983149423530325, + "language_loss": 0.68670058, + "learning_rate": 3.848893461794131e-06, + "loss": 0.70836139, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 2.5682060718536377 + }, + { + "auxiliary_loss_clip": 0.0110241, + "auxiliary_loss_mlp": 0.01046778, + "balance_loss_clip": 1.04050374, + "balance_loss_mlp": 1.02884912, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.957054187352898, + "language_loss": 0.77739251, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79888439, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 2.787946939468384 + }, + { + "auxiliary_loss_clip": 0.01117137, + "auxiliary_loss_mlp": 0.00751011, + "balance_loss_clip": 1.03951716, + "balance_loss_mlp": 0.99993211, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.4477234992104426, + "language_loss": 0.8024255, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82110697, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 2.560016632080078 + }, + { + "auxiliary_loss_clip": 0.01128185, + "auxiliary_loss_mlp": 0.01050678, + "balance_loss_clip": 1.04183912, + "balance_loss_mlp": 1.03106761, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.0739226520982594, + "language_loss": 0.73508579, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.75687438, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 4.055174350738525 + }, + { + "auxiliary_loss_clip": 0.0106766, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.03415406, + "balance_loss_mlp": 1.0189817, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.078265366682507, + "language_loss": 0.69290465, + "learning_rate": 3.848298876546534e-06, + "loss": 0.7139554, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.7302963733673096 + }, + { + "auxiliary_loss_clip": 0.01124191, + "auxiliary_loss_mlp": 0.01045926, + "balance_loss_clip": 1.04059243, + "balance_loss_mlp": 1.02787805, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 2.4223485566264342, + "language_loss": 0.73631096, + "learning_rate": 3.84815005500134e-06, + "loss": 0.75801206, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 2.640376091003418 + }, + { + "auxiliary_loss_clip": 0.01001963, + "auxiliary_loss_mlp": 0.01012449, + "balance_loss_clip": 1.02172422, + "balance_loss_mlp": 1.00942063, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.9265367858785556, + "language_loss": 0.64792895, + "learning_rate": 3.84800116337411e-06, + "loss": 0.66807306, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 3.2310216426849365 + }, + { + "auxiliary_loss_clip": 0.01121892, + "auxiliary_loss_mlp": 0.0104276, + "balance_loss_clip": 1.041785, + "balance_loss_mlp": 1.02583206, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 9.113601746915817, + "language_loss": 0.73036313, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75200963, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 2.586472988128662 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.04079342, + "balance_loss_mlp": 1.01803577, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 2.9869358209024206, + "language_loss": 0.77678579, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.7982024, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 4.164937496185303 + }, + { + "auxiliary_loss_clip": 0.01039942, + "auxiliary_loss_mlp": 0.01001049, + "balance_loss_clip": 1.01773882, + "balance_loss_mlp": 0.9987362, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7240639368902041, + "language_loss": 0.5466826, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56709254, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.1387929916381836 + }, + { + "auxiliary_loss_clip": 0.01086232, + "auxiliary_loss_mlp": 0.01047611, + "balance_loss_clip": 1.03632784, + "balance_loss_mlp": 1.02843022, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 3.416101175932062, + "language_loss": 0.78499341, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.80633187, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 4.161744832992554 + }, + { + "auxiliary_loss_clip": 0.01127241, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_clip": 1.04213357, + "balance_loss_mlp": 1.03027761, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 1.9296621604642739, + "language_loss": 0.70017052, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72194111, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 2.6018595695495605 + }, + { + "auxiliary_loss_clip": 0.01128766, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_clip": 1.04207957, + "balance_loss_mlp": 1.02718341, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9436183019018176, + "language_loss": 0.78652447, + "learning_rate": 3.847106342204354e-06, + "loss": 0.80827057, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 2.617187023162842 + }, + { + "auxiliary_loss_clip": 0.01122265, + "auxiliary_loss_mlp": 0.01047816, + "balance_loss_clip": 1.04334128, + "balance_loss_mlp": 1.0285871, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 1.7823757225451697, + "language_loss": 0.74669194, + "learning_rate": 3.846956960161114e-06, + "loss": 0.7683928, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 4.203149318695068 + }, + { + "auxiliary_loss_clip": 0.011125, + "auxiliary_loss_mlp": 0.01045756, + "balance_loss_clip": 1.04138947, + "balance_loss_mlp": 1.02596748, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.4644756437601223, + "language_loss": 0.81856251, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84014505, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.739283800125122 + }, + { + "auxiliary_loss_clip": 0.0100694, + "auxiliary_loss_mlp": 0.0101033, + "balance_loss_clip": 1.02221429, + "balance_loss_mlp": 1.00780284, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8177730138539023, + "language_loss": 0.57872099, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59889364, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.177133798599243 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01057415, + "balance_loss_clip": 1.03857517, + "balance_loss_mlp": 1.03632712, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.9005177716539712, + "language_loss": 0.74887478, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.7706055, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.632263422012329 + }, + { + "auxiliary_loss_clip": 0.01117239, + "auxiliary_loss_mlp": 0.01047918, + "balance_loss_clip": 1.04013348, + "balance_loss_mlp": 1.02929771, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8264048094420369, + "language_loss": 0.74647123, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76812279, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 2.6055755615234375 + }, + { + "auxiliary_loss_clip": 0.01121449, + "auxiliary_loss_mlp": 0.01048085, + "balance_loss_clip": 1.04191279, + "balance_loss_mlp": 1.02807009, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 2.422593678859601, + "language_loss": 0.79893517, + "learning_rate": 3.846208999506402e-06, + "loss": 0.82063055, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.5861592292785645 + }, + { + "auxiliary_loss_clip": 0.01112813, + "auxiliary_loss_mlp": 0.01043899, + "balance_loss_clip": 1.04132223, + "balance_loss_mlp": 1.02655411, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 2.013774301146475, + "language_loss": 0.84856617, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87013334, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.634096145629883 + }, + { + "auxiliary_loss_clip": 0.0110465, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.04031205, + "balance_loss_mlp": 1.02840447, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.155206706061296, + "language_loss": 0.69166875, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71319199, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 2.747229814529419 + }, + { + "auxiliary_loss_clip": 0.01114668, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.04352117, + "balance_loss_mlp": 1.024611, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.1778852355142844, + "language_loss": 0.86830032, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88988209, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.628998041152954 + }, + { + "auxiliary_loss_clip": 0.01107612, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.04002869, + "balance_loss_mlp": 1.01932538, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.9953061007060064, + "language_loss": 0.83396691, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85542554, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.626615047454834 + }, + { + "auxiliary_loss_clip": 0.01103257, + "auxiliary_loss_mlp": 0.01040206, + "balance_loss_clip": 1.03914499, + "balance_loss_mlp": 1.02169263, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.1816810097034103, + "language_loss": 0.80303013, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82446474, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.63307785987854 + }, + { + "auxiliary_loss_clip": 0.01123057, + "auxiliary_loss_mlp": 0.010421, + "balance_loss_clip": 1.04178238, + "balance_loss_mlp": 1.02451682, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.038472641046662, + "language_loss": 0.78252894, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.8041805, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 2.6030237674713135 + }, + { + "auxiliary_loss_clip": 0.01127151, + "auxiliary_loss_mlp": 0.01043956, + "balance_loss_clip": 1.04251862, + "balance_loss_mlp": 1.025038, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.9799889589272288, + "language_loss": 0.87701201, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89872313, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 2.6454410552978516 + }, + { + "auxiliary_loss_clip": 0.01090961, + "auxiliary_loss_mlp": 0.01048371, + "balance_loss_clip": 1.0362792, + "balance_loss_mlp": 1.02957165, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.3547346396992053, + "language_loss": 0.78620505, + "learning_rate": 3.84500862231636e-06, + "loss": 0.80759835, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 2.6736843585968018 + }, + { + "auxiliary_loss_clip": 0.01143116, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_clip": 1.04277349, + "balance_loss_mlp": 1.02585149, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 3.2482056926146394, + "language_loss": 0.76917446, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79106772, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.4941060543060303 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01046095, + "balance_loss_clip": 1.04093611, + "balance_loss_mlp": 1.02773643, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.217926734157268, + "language_loss": 0.78209054, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80376387, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.551727056503296 + }, + { + "auxiliary_loss_clip": 0.01108942, + "auxiliary_loss_mlp": 0.01057284, + "balance_loss_clip": 1.043787, + "balance_loss_mlp": 1.03871143, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.2625837994305047, + "language_loss": 0.75735676, + "learning_rate": 3.844557326325461e-06, + "loss": 0.779019, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.595219612121582 + }, + { + "auxiliary_loss_clip": 0.01130006, + "auxiliary_loss_mlp": 0.01043864, + "balance_loss_clip": 1.04368567, + "balance_loss_mlp": 1.02575636, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.1336456420030596, + "language_loss": 0.77615136, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79789007, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 2.537180185317993 + }, + { + "auxiliary_loss_clip": 0.01084765, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.03946233, + "balance_loss_mlp": 1.02178144, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.5263567377131497, + "language_loss": 0.89620131, + "learning_rate": 3.844256112593029e-06, + "loss": 0.91743737, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.6762478351593018 + }, + { + "auxiliary_loss_clip": 0.01126004, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.04337144, + "balance_loss_mlp": 1.03077173, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 1.9227708767923286, + "language_loss": 0.93362999, + "learning_rate": 3.844105400822391e-06, + "loss": 0.9553892, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.614140272140503 + }, + { + "auxiliary_loss_clip": 0.01109053, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_clip": 1.03736854, + "balance_loss_mlp": 1.02796221, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.908777382265012, + "language_loss": 0.75227678, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77382392, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.6862077713012695 + }, + { + "auxiliary_loss_clip": 0.01099423, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.04085398, + "balance_loss_mlp": 1.02872765, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.5390703917896527, + "language_loss": 0.81076491, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83223355, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.620253086090088 + }, + { + "auxiliary_loss_clip": 0.01145066, + "auxiliary_loss_mlp": 0.0105001, + "balance_loss_clip": 1.04670262, + "balance_loss_mlp": 1.03128266, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.1617451342177683, + "language_loss": 0.77495193, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79690278, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 2.5888681411743164 + }, + { + "auxiliary_loss_clip": 0.01127927, + "auxiliary_loss_mlp": 0.0104657, + "balance_loss_clip": 1.04267025, + "balance_loss_mlp": 1.02784228, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 1.9756591997760626, + "language_loss": 0.85963184, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88137686, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 2.620595932006836 + }, + { + "auxiliary_loss_clip": 0.0113247, + "auxiliary_loss_mlp": 0.01054964, + "balance_loss_clip": 1.04280734, + "balance_loss_mlp": 1.03454328, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.3988781712275262, + "language_loss": 0.82430077, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84617507, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 2.537855863571167 + }, + { + "auxiliary_loss_clip": 0.01141476, + "auxiliary_loss_mlp": 0.01043028, + "balance_loss_clip": 1.04519868, + "balance_loss_mlp": 1.02474141, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 7.358538586457604, + "language_loss": 0.71276069, + "learning_rate": 3.843199661896884e-06, + "loss": 0.73460579, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 2.534360408782959 + }, + { + "auxiliary_loss_clip": 0.0111301, + "auxiliary_loss_mlp": 0.01047911, + "balance_loss_clip": 1.04285777, + "balance_loss_mlp": 1.0270617, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 2.068626228069647, + "language_loss": 0.77764213, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79925132, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 2.843782424926758 + }, + { + "auxiliary_loss_clip": 0.01091746, + "auxiliary_loss_mlp": 0.01052265, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.03272676, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.1621380440933655, + "language_loss": 0.74364805, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76508814, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 2.7973732948303223 + }, + { + "auxiliary_loss_clip": 0.01119199, + "auxiliary_loss_mlp": 0.01050483, + "balance_loss_clip": 1.04180574, + "balance_loss_mlp": 1.03107584, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.4595597931843747, + "language_loss": 0.80781257, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82950944, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 2.7129311561584473 + }, + { + "auxiliary_loss_clip": 0.01128328, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_clip": 1.04179597, + "balance_loss_mlp": 1.02778518, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.7437150408247677, + "language_loss": 0.75048947, + "learning_rate": 3.842594437983917e-06, + "loss": 0.7722379, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 2.5797994136810303 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.04390752, + "balance_loss_mlp": 1.02477419, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.9052927022155473, + "language_loss": 0.76917005, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79094934, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 4.060861349105835 + }, + { + "auxiliary_loss_clip": 0.01048248, + "auxiliary_loss_mlp": 0.01001951, + "balance_loss_clip": 1.02457643, + "balance_loss_mlp": 0.99963802, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9356402909403039, + "language_loss": 0.5666436, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58714557, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.0475234985351562 + }, + { + "auxiliary_loss_clip": 0.01095272, + "auxiliary_loss_mlp": 0.01049922, + "balance_loss_clip": 1.04059339, + "balance_loss_mlp": 1.03038383, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 2.701195261336819, + "language_loss": 0.88680679, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90825874, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 2.627572774887085 + }, + { + "auxiliary_loss_clip": 0.0113153, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.04445207, + "balance_loss_mlp": 1.0288384, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.7778086374879178, + "language_loss": 0.7797296, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80153096, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 2.572071075439453 + }, + { + "auxiliary_loss_clip": 0.01076257, + "auxiliary_loss_mlp": 0.01052882, + "balance_loss_clip": 1.04145145, + "balance_loss_mlp": 1.03205657, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.3920915779574328, + "language_loss": 0.77925545, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80054688, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 4.289572238922119 + }, + { + "auxiliary_loss_clip": 0.0110888, + "auxiliary_loss_mlp": 0.01050751, + "balance_loss_clip": 1.04135156, + "balance_loss_mlp": 1.03270257, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.6369848124520545, + "language_loss": 0.77315074, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79474699, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 2.6925220489501953 + }, + { + "auxiliary_loss_clip": 0.01125566, + "auxiliary_loss_mlp": 0.00750876, + "balance_loss_clip": 1.04567146, + "balance_loss_mlp": 0.99992335, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.9302161070130535, + "language_loss": 0.90098995, + "learning_rate": 3.84153260631005e-06, + "loss": 0.91975439, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.6197829246520996 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.0409596, + "balance_loss_mlp": 1.02851129, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 1.8045179864064527, + "language_loss": 0.70446491, + "learning_rate": 3.841380636700468e-06, + "loss": 0.72613829, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.6206414699554443 + }, + { + "auxiliary_loss_clip": 0.01117261, + "auxiliary_loss_mlp": 0.01044584, + "balance_loss_clip": 1.04117537, + "balance_loss_mlp": 1.02481937, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.1107648074732013, + "language_loss": 0.91872621, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94034469, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 4.253758192062378 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.04771304, + "balance_loss_mlp": 1.03296852, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 2.2917527363651287, + "language_loss": 0.6335814, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65535527, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 2.68399977684021 + }, + { + "auxiliary_loss_clip": 0.01116654, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.04011476, + "balance_loss_mlp": 1.02891457, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.8778598017957735, + "language_loss": 0.87863421, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90029037, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 4.109145164489746 + }, + { + "auxiliary_loss_clip": 0.01127289, + "auxiliary_loss_mlp": 0.01044697, + "balance_loss_clip": 1.04353571, + "balance_loss_mlp": 1.0263145, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 2.532135186516692, + "language_loss": 0.83045805, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85217786, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.552219867706299 + }, + { + "auxiliary_loss_clip": 0.01111777, + "auxiliary_loss_mlp": 0.0075119, + "balance_loss_clip": 1.04253566, + "balance_loss_mlp": 1.0000627, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.9776162797042312, + "language_loss": 0.74721539, + "learning_rate": 3.840619741387832e-06, + "loss": 0.76584506, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 2.588764190673828 + }, + { + "auxiliary_loss_clip": 0.01106014, + "auxiliary_loss_mlp": 0.01045685, + "balance_loss_clip": 1.04495502, + "balance_loss_mlp": 1.02582443, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 2.0384600632519074, + "language_loss": 0.75802058, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.77953756, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.7617411613464355 + }, + { + "auxiliary_loss_clip": 0.01117583, + "auxiliary_loss_mlp": 0.01050055, + "balance_loss_clip": 1.04270196, + "balance_loss_mlp": 1.03106463, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 2.0473102488386843, + "language_loss": 0.7055037, + "learning_rate": 3.840314894646969e-06, + "loss": 0.72718012, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.6478450298309326 + }, + { + "auxiliary_loss_clip": 0.01120711, + "auxiliary_loss_mlp": 0.0105133, + "balance_loss_clip": 1.03999341, + "balance_loss_mlp": 1.03200579, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.0909997500414144, + "language_loss": 0.71792114, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73964155, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.611644744873047 + }, + { + "auxiliary_loss_clip": 0.01135388, + "auxiliary_loss_mlp": 0.01040732, + "balance_loss_clip": 1.04030359, + "balance_loss_mlp": 1.02283859, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7342731598831782, + "language_loss": 0.85118991, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87295115, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 2.5783169269561768 + }, + { + "auxiliary_loss_clip": 0.01085784, + "auxiliary_loss_mlp": 0.0104195, + "balance_loss_clip": 1.0369854, + "balance_loss_mlp": 1.02383041, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.1863023192483557, + "language_loss": 0.78097332, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80225068, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.663149833679199 + }, + { + "auxiliary_loss_clip": 0.0110296, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.03867316, + "balance_loss_mlp": 1.01800323, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.977570886658088, + "language_loss": 0.70445275, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72586238, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.7161054611206055 + }, + { + "auxiliary_loss_clip": 0.01111178, + "auxiliary_loss_mlp": 0.01043717, + "balance_loss_clip": 1.04045832, + "balance_loss_mlp": 1.02605045, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.971132549412968, + "language_loss": 0.76587212, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78742111, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 2.6124820709228516 + }, + { + "auxiliary_loss_clip": 0.01123899, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.04225922, + "balance_loss_mlp": 1.02053332, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.423125166558152, + "language_loss": 0.77528352, + "learning_rate": 3.839398679771359e-06, + "loss": 0.79691684, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 2.5635464191436768 + }, + { + "auxiliary_loss_clip": 0.01114728, + "auxiliary_loss_mlp": 0.01042459, + "balance_loss_clip": 1.04034519, + "balance_loss_mlp": 1.02425587, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 2.386523121462181, + "language_loss": 0.82409757, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84566945, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 2.6265132427215576 + }, + { + "auxiliary_loss_clip": 0.01143141, + "auxiliary_loss_mlp": 0.01047758, + "balance_loss_clip": 1.04428673, + "balance_loss_mlp": 1.02933979, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.6062764843523343, + "language_loss": 0.90570748, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92761648, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.573443651199341 + }, + { + "auxiliary_loss_clip": 0.01069076, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.03644872, + "balance_loss_mlp": 1.03173459, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.6972003005513407, + "language_loss": 0.7025615, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72376978, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.7206954956054688 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.03859115, + "balance_loss_mlp": 1.02576816, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.7263058719770992, + "language_loss": 0.82549137, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84704375, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.614413261413574 + }, + { + "auxiliary_loss_clip": 0.01115076, + "auxiliary_loss_mlp": 0.01044035, + "balance_loss_clip": 1.03984904, + "balance_loss_mlp": 1.02598679, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 4.5829194318295, + "language_loss": 0.8488028, + "learning_rate": 3.838633249192036e-06, + "loss": 0.87039387, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.658653736114502 + }, + { + "auxiliary_loss_clip": 0.01137512, + "auxiliary_loss_mlp": 0.01041082, + "balance_loss_clip": 1.04015827, + "balance_loss_mlp": 1.02285457, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.7079355490994887, + "language_loss": 0.82184082, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84362674, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.6388611793518066 + }, + { + "auxiliary_loss_clip": 0.01109491, + "auxiliary_loss_mlp": 0.0104588, + "balance_loss_clip": 1.04325533, + "balance_loss_mlp": 1.02675855, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.1745441239412817, + "language_loss": 0.76445091, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78600454, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.7133355140686035 + }, + { + "auxiliary_loss_clip": 0.01103545, + "auxiliary_loss_mlp": 0.01048715, + "balance_loss_clip": 1.04042149, + "balance_loss_mlp": 1.02887893, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 1.9909196574337045, + "language_loss": 0.82546121, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84698379, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.6645326614379883 + }, + { + "auxiliary_loss_clip": 0.01124458, + "auxiliary_loss_mlp": 0.01048353, + "balance_loss_clip": 1.04625535, + "balance_loss_mlp": 1.03003013, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 2.152052754731745, + "language_loss": 0.80746984, + "learning_rate": 3.838019649712958e-06, + "loss": 0.829198, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.599752426147461 + }, + { + "auxiliary_loss_clip": 0.01036222, + "auxiliary_loss_mlp": 0.01010061, + "balance_loss_clip": 1.01355314, + "balance_loss_mlp": 1.00781965, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.8423488105966532, + "language_loss": 0.58891708, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60937989, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 3.217707395553589 + }, + { + "auxiliary_loss_clip": 0.01086626, + "auxiliary_loss_mlp": 0.01048701, + "balance_loss_clip": 1.03612566, + "balance_loss_mlp": 1.02797103, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.9529601640337617, + "language_loss": 0.85486168, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87621498, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 2.6636087894439697 + }, + { + "auxiliary_loss_clip": 0.01126129, + "auxiliary_loss_mlp": 0.01052159, + "balance_loss_clip": 1.04184151, + "balance_loss_mlp": 1.03344321, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 4.887883247825437, + "language_loss": 0.79299974, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81478262, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.5862300395965576 + }, + { + "auxiliary_loss_clip": 0.01119309, + "auxiliary_loss_mlp": 0.01055818, + "balance_loss_clip": 1.03972769, + "balance_loss_mlp": 1.03448009, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.5718525684585678, + "language_loss": 0.75914055, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78089184, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 2.623791217803955 + }, + { + "auxiliary_loss_clip": 0.01115922, + "auxiliary_loss_mlp": 0.01041262, + "balance_loss_clip": 1.0378449, + "balance_loss_mlp": 1.02242672, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 1.8439698777711613, + "language_loss": 0.75711566, + "learning_rate": 3.837251082205368e-06, + "loss": 0.77868754, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 2.6001312732696533 + }, + { + "auxiliary_loss_clip": 0.01099952, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.04044509, + "balance_loss_mlp": 1.02378416, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 1.8687110450188333, + "language_loss": 0.61529756, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63671762, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 2.6234257221221924 + }, + { + "auxiliary_loss_clip": 0.01107165, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.03711462, + "balance_loss_mlp": 1.0233587, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8084835213458044, + "language_loss": 0.81150079, + "learning_rate": 3.836943167480296e-06, + "loss": 0.833, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 2.615602970123291 + }, + { + "auxiliary_loss_clip": 0.01141546, + "auxiliary_loss_mlp": 0.01057136, + "balance_loss_clip": 1.04154253, + "balance_loss_mlp": 1.03513002, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 2.631038167977658, + "language_loss": 0.88810492, + "learning_rate": 3.836789105629236e-06, + "loss": 0.9100917, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 2.5737929344177246 + }, + { + "auxiliary_loss_clip": 0.01075916, + "auxiliary_loss_mlp": 0.01054739, + "balance_loss_clip": 1.03627455, + "balance_loss_mlp": 1.03393698, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.507429169345664, + "language_loss": 0.64838094, + "learning_rate": 3.83663497412695e-06, + "loss": 0.66968745, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 2.7090165615081787 + }, + { + "auxiliary_loss_clip": 0.01078748, + "auxiliary_loss_mlp": 0.01045033, + "balance_loss_clip": 1.03402877, + "balance_loss_mlp": 1.02442145, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.7857230437714133, + "language_loss": 0.82616186, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84739971, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 2.6825833320617676 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.01047532, + "balance_loss_clip": 1.03895378, + "balance_loss_mlp": 1.02903044, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.5410599604144735, + "language_loss": 0.78768384, + "learning_rate": 3.836326502192077e-06, + "loss": 0.80920428, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 4.1362693309783936 + }, + { + "auxiliary_loss_clip": 0.01125527, + "auxiliary_loss_mlp": 0.01051352, + "balance_loss_clip": 1.04055512, + "balance_loss_mlp": 1.03276682, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 5.763858143859418, + "language_loss": 0.64815807, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66992688, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 2.683218479156494 + }, + { + "auxiliary_loss_clip": 0.01119235, + "auxiliary_loss_mlp": 0.01051342, + "balance_loss_clip": 1.0436902, + "balance_loss_mlp": 1.03114843, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.11888819933119, + "language_loss": 0.81783497, + "learning_rate": 3.836017751722467e-06, + "loss": 0.83954078, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 2.6517372131347656 + }, + { + "auxiliary_loss_clip": 0.01127505, + "auxiliary_loss_mlp": 0.01046777, + "balance_loss_clip": 1.04259717, + "balance_loss_mlp": 1.02758467, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 1.8882975578735093, + "language_loss": 0.72475588, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.7464987, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 2.5765886306762695 + }, + { + "auxiliary_loss_clip": 0.01091875, + "auxiliary_loss_mlp": 0.01039181, + "balance_loss_clip": 1.03334594, + "balance_loss_mlp": 1.01965499, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 2.331390021980543, + "language_loss": 0.81535113, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83666176, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 4.118611097335815 + }, + { + "auxiliary_loss_clip": 0.01137832, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.03993845, + "balance_loss_mlp": 1.0249325, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 4.685812697949738, + "language_loss": 0.86855125, + "learning_rate": 3.835554103867876e-06, + "loss": 0.89036977, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.5411837100982666 + }, + { + "auxiliary_loss_clip": 0.01128195, + "auxiliary_loss_mlp": 0.01045031, + "balance_loss_clip": 1.04338765, + "balance_loss_mlp": 1.0264343, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6688597579177922, + "language_loss": 0.68739444, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70912671, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.5820581912994385 + }, + { + "auxiliary_loss_clip": 0.01114304, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.04525805, + "balance_loss_mlp": 1.02650642, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.8844989869706432, + "language_loss": 0.79815787, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.81975037, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.5732040405273438 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.00750747, + "balance_loss_clip": 1.04095626, + "balance_loss_mlp": 0.99993461, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 1.8686192618255875, + "language_loss": 0.82788771, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84653759, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 4.208268880844116 + }, + { + "auxiliary_loss_clip": 0.01143446, + "auxiliary_loss_mlp": 0.01052252, + "balance_loss_clip": 1.04276311, + "balance_loss_mlp": 1.03191459, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 3.6912819398774808, + "language_loss": 0.81521225, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83716923, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.517630100250244 + }, + { + "auxiliary_loss_clip": 0.01141606, + "auxiliary_loss_mlp": 0.00750853, + "balance_loss_clip": 1.04357672, + "balance_loss_mlp": 0.99992347, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8587021976932656, + "language_loss": 0.88179195, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90071654, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 3.9764158725738525 + }, + { + "auxiliary_loss_clip": 0.01141866, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.04411554, + "balance_loss_mlp": 1.03566146, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.8922587867289433, + "language_loss": 0.78029138, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80227876, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.5012366771698 + }, + { + "auxiliary_loss_clip": 0.0110443, + "auxiliary_loss_mlp": 0.01041392, + "balance_loss_clip": 1.03922963, + "balance_loss_mlp": 1.02267647, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 3.4813084598718484, + "language_loss": 0.73823822, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75969648, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.6689207553863525 + }, + { + "auxiliary_loss_clip": 0.01125827, + "auxiliary_loss_mlp": 0.01049352, + "balance_loss_clip": 1.04034817, + "balance_loss_mlp": 1.0299325, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 2.6784170389812365, + "language_loss": 0.87298572, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89473748, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.615457534790039 + }, + { + "auxiliary_loss_clip": 0.01128182, + "auxiliary_loss_mlp": 0.01041041, + "balance_loss_clip": 1.04087019, + "balance_loss_mlp": 1.02276587, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 2.2521359761477435, + "language_loss": 0.85940981, + "learning_rate": 3.834159402300841e-06, + "loss": 0.88110197, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.6260879039764404 + }, + { + "auxiliary_loss_clip": 0.01133029, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.04192924, + "balance_loss_mlp": 1.02480078, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.328853221547874, + "language_loss": 0.73235732, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75414008, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.614299774169922 + }, + { + "auxiliary_loss_clip": 0.01141552, + "auxiliary_loss_mlp": 0.01046466, + "balance_loss_clip": 1.04584181, + "balance_loss_mlp": 1.02821541, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 2.4507057821210827, + "language_loss": 0.76643437, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78831458, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 2.5251126289367676 + }, + { + "auxiliary_loss_clip": 0.01104409, + "auxiliary_loss_mlp": 0.01045788, + "balance_loss_clip": 1.04298317, + "balance_loss_mlp": 1.02815747, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7536451115759344, + "language_loss": 0.81954491, + "learning_rate": 3.833693249639615e-06, + "loss": 0.84104693, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.663074493408203 + }, + { + "auxiliary_loss_clip": 0.01113657, + "auxiliary_loss_mlp": 0.01045003, + "balance_loss_clip": 1.04193211, + "balance_loss_mlp": 1.02371216, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 2.289384609317682, + "language_loss": 0.72415346, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74574006, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 2.6724631786346436 + }, + { + "auxiliary_loss_clip": 0.01125467, + "auxiliary_loss_mlp": 0.01040011, + "balance_loss_clip": 1.04006195, + "balance_loss_mlp": 1.020329, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 2.0680319503387685, + "language_loss": 0.72069204, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74234676, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 2.584383487701416 + }, + { + "auxiliary_loss_clip": 0.011427, + "auxiliary_loss_mlp": 0.01051535, + "balance_loss_clip": 1.04339528, + "balance_loss_mlp": 1.0310905, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 1.7824786575833125, + "language_loss": 0.72904032, + "learning_rate": 3.833226471173919e-06, + "loss": 0.75098264, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.5505077838897705 + }, + { + "auxiliary_loss_clip": 0.01122864, + "auxiliary_loss_mlp": 0.01043942, + "balance_loss_clip": 1.04058671, + "balance_loss_mlp": 1.02522635, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.1850701649321542, + "language_loss": 0.70569611, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72736418, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.5930514335632324 + }, + { + "auxiliary_loss_clip": 0.01093585, + "auxiliary_loss_mlp": 0.01052872, + "balance_loss_clip": 1.03861034, + "balance_loss_mlp": 1.03342938, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 2.5890662613722375, + "language_loss": 0.76275188, + "learning_rate": 3.83291493793963e-06, + "loss": 0.78421652, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.66805362701416 + }, + { + "auxiliary_loss_clip": 0.01101945, + "auxiliary_loss_mlp": 0.01056527, + "balance_loss_clip": 1.04140866, + "balance_loss_mlp": 1.03691721, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.5971235935868338, + "language_loss": 0.66001552, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68160021, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.6653566360473633 + }, + { + "auxiliary_loss_clip": 0.01133383, + "auxiliary_loss_mlp": 0.01044321, + "balance_loss_clip": 1.04278171, + "balance_loss_mlp": 1.02467549, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.070841538367523, + "language_loss": 0.74986142, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77163851, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.5987582206726074 + }, + { + "auxiliary_loss_clip": 0.01118737, + "auxiliary_loss_mlp": 0.01051076, + "balance_loss_clip": 1.04241645, + "balance_loss_mlp": 1.03150225, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.4786908752984638, + "language_loss": 0.72842526, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75012338, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.631007432937622 + }, + { + "auxiliary_loss_clip": 0.01117382, + "auxiliary_loss_mlp": 0.01058207, + "balance_loss_clip": 1.04308999, + "balance_loss_mlp": 1.03815591, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 2.1623349898539197, + "language_loss": 0.72465366, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74640954, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.7029809951782227 + }, + { + "auxiliary_loss_clip": 0.01119391, + "auxiliary_loss_mlp": 0.01047825, + "balance_loss_clip": 1.04062843, + "balance_loss_mlp": 1.02713037, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.1392439234625624, + "language_loss": 0.74249238, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76416457, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.5816500186920166 + }, + { + "auxiliary_loss_clip": 0.01146867, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.04498637, + "balance_loss_mlp": 1.02871239, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.250300636326007, + "language_loss": 0.79047471, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81243747, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.5927915573120117 + }, + { + "auxiliary_loss_clip": 0.01106063, + "auxiliary_loss_mlp": 0.01053432, + "balance_loss_clip": 1.04037106, + "balance_loss_mlp": 1.03469205, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.8289744129842371, + "language_loss": 0.76833296, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78992796, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 2.6583242416381836 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01050273, + "balance_loss_clip": 1.04210305, + "balance_loss_mlp": 1.02932835, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.6344651853356487, + "language_loss": 0.71200424, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73369306, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 2.7132768630981445 + }, + { + "auxiliary_loss_clip": 0.0108548, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_clip": 1.04073822, + "balance_loss_mlp": 1.03193653, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 1.8904974948434183, + "language_loss": 0.7170912, + "learning_rate": 3.831509598604828e-06, + "loss": 0.73848331, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 2.9773573875427246 + }, + { + "auxiliary_loss_clip": 0.01068734, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_clip": 1.03436685, + "balance_loss_mlp": 1.02624702, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.637651919527095, + "language_loss": 0.87535703, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89649385, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 2.707530975341797 + }, + { + "auxiliary_loss_clip": 0.01140338, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.044276, + "balance_loss_mlp": 1.02542162, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.8048201181142178, + "language_loss": 0.81497133, + "learning_rate": 3.831196536861448e-06, + "loss": 0.83680654, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 2.5839791297912598 + }, + { + "auxiliary_loss_clip": 0.0110756, + "auxiliary_loss_mlp": 0.01052344, + "balance_loss_clip": 1.0402751, + "balance_loss_mlp": 1.03168476, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.1697744591473325, + "language_loss": 0.79768229, + "learning_rate": 3.831039901828054e-06, + "loss": 0.81928134, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.6904404163360596 + }, + { + "auxiliary_loss_clip": 0.01140644, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.0451349, + "balance_loss_mlp": 1.03476763, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.453304209472752, + "language_loss": 0.80371332, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82564342, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 2.628288507461548 + }, + { + "auxiliary_loss_clip": 0.01084043, + "auxiliary_loss_mlp": 0.0105105, + "balance_loss_clip": 1.04592454, + "balance_loss_mlp": 1.02973568, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.625358677948003, + "language_loss": 0.73975325, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76110411, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 2.7608134746551514 + }, + { + "auxiliary_loss_clip": 0.01094541, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.03990936, + "balance_loss_mlp": 1.03135502, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 1.9393323435655447, + "language_loss": 0.84865177, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87010264, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 2.6262309551239014 + }, + { + "auxiliary_loss_clip": 0.01111843, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.0410167, + "balance_loss_mlp": 1.02081728, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.8327025603704896, + "language_loss": 0.76838744, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78988218, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 2.6961607933044434 + }, + { + "auxiliary_loss_clip": 0.01130186, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.04517317, + "balance_loss_mlp": 1.0309875, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.2744413973663966, + "language_loss": 0.74208784, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.76389974, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 2.60391902923584 + }, + { + "auxiliary_loss_clip": 0.0113465, + "auxiliary_loss_mlp": 0.01043435, + "balance_loss_clip": 1.04331183, + "balance_loss_mlp": 1.02424216, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 2.3630292538577615, + "language_loss": 0.83659953, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85838038, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 4.197832107543945 + }, + { + "auxiliary_loss_clip": 0.01139216, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.04189944, + "balance_loss_mlp": 1.027349, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.5884674937898187, + "language_loss": 0.7841683, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80600917, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.562976360321045 + }, + { + "auxiliary_loss_clip": 0.01132637, + "auxiliary_loss_mlp": 0.01055234, + "balance_loss_clip": 1.04520631, + "balance_loss_mlp": 1.0357554, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 1.823089995388301, + "language_loss": 0.83155257, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85343128, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 2.5429341793060303 + }, + { + "auxiliary_loss_clip": 0.01144354, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_clip": 1.0448966, + "balance_loss_mlp": 1.02641535, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.8810029617421578, + "language_loss": 0.77015781, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79204929, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.5209028720855713 + }, + { + "auxiliary_loss_clip": 0.0110882, + "auxiliary_loss_mlp": 0.00751028, + "balance_loss_clip": 1.04325795, + "balance_loss_mlp": 0.9999553, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.002444375784796, + "language_loss": 0.89040244, + "learning_rate": 3.829469733648552e-06, + "loss": 0.90900087, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 4.2215492725372314 + }, + { + "auxiliary_loss_clip": 0.01067273, + "auxiliary_loss_mlp": 0.0105011, + "balance_loss_clip": 1.03443897, + "balance_loss_mlp": 1.03005886, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 1.9796748736266083, + "language_loss": 0.7533704, + "learning_rate": 3.829312335177034e-06, + "loss": 0.77454424, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.716304302215576 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.01043292, + "balance_loss_clip": 1.04437435, + "balance_loss_mlp": 1.02304995, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 1.9948707321996626, + "language_loss": 0.71976107, + "learning_rate": 3.82915486733781e-06, + "loss": 0.7412675, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.835759162902832 + }, + { + "auxiliary_loss_clip": 0.01128912, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.04296827, + "balance_loss_mlp": 1.0247016, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 2.3450340233464244, + "language_loss": 0.77662635, + "learning_rate": 3.82899733013685e-06, + "loss": 0.79833806, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.6608521938323975 + }, + { + "auxiliary_loss_clip": 0.01106607, + "auxiliary_loss_mlp": 0.0105319, + "balance_loss_clip": 1.04025674, + "balance_loss_mlp": 1.03325832, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 3.4427830506331403, + "language_loss": 0.75823563, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77983361, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 2.7165000438690186 + }, + { + "auxiliary_loss_clip": 0.01077565, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.03735209, + "balance_loss_mlp": 1.031515, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 1.756546275292976, + "language_loss": 0.81420058, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83548427, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 5.774705410003662 + }, + { + "auxiliary_loss_clip": 0.01106217, + "auxiliary_loss_mlp": 0.01053096, + "balance_loss_clip": 1.03848219, + "balance_loss_mlp": 1.03269958, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.6134851506804002, + "language_loss": 0.67161024, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69320333, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.7364604473114014 + }, + { + "auxiliary_loss_clip": 0.01123067, + "auxiliary_loss_mlp": 0.01052508, + "balance_loss_clip": 1.04258013, + "balance_loss_mlp": 1.0326004, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.52526118776033, + "language_loss": 0.75022304, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77197886, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.6592178344726562 + }, + { + "auxiliary_loss_clip": 0.01133723, + "auxiliary_loss_mlp": 0.01047219, + "balance_loss_clip": 1.04715586, + "balance_loss_mlp": 1.02837205, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 1.9464278383180813, + "language_loss": 0.70769119, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72950065, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.6369497776031494 + }, + { + "auxiliary_loss_clip": 0.01140326, + "auxiliary_loss_mlp": 0.01040849, + "balance_loss_clip": 1.0446744, + "balance_loss_mlp": 1.02337265, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 1.919848007467363, + "language_loss": 0.78418022, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80599201, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.5501677989959717 + }, + { + "auxiliary_loss_clip": 0.01131065, + "auxiliary_loss_mlp": 0.01046718, + "balance_loss_clip": 1.04362416, + "balance_loss_mlp": 1.02788246, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.9483318898600002, + "language_loss": 0.81906945, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84084725, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.612675428390503 + }, + { + "auxiliary_loss_clip": 0.01141688, + "auxiliary_loss_mlp": 0.01049447, + "balance_loss_clip": 1.04216778, + "balance_loss_mlp": 1.029217, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.979327581301608, + "language_loss": 0.69916278, + "learning_rate": 3.827734536224087e-06, + "loss": 0.7210741, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 2.661410093307495 + }, + { + "auxiliary_loss_clip": 0.01116864, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.04381657, + "balance_loss_mlp": 1.02367806, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.1719989127801513, + "language_loss": 0.62285405, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64443815, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 2.643840789794922 + }, + { + "auxiliary_loss_clip": 0.01142425, + "auxiliary_loss_mlp": 0.01041612, + "balance_loss_clip": 1.04601693, + "balance_loss_mlp": 1.02302706, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.820069244913238, + "language_loss": 0.89677954, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91861999, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 2.506985664367676 + }, + { + "auxiliary_loss_clip": 0.01139792, + "auxiliary_loss_mlp": 0.01048299, + "balance_loss_clip": 1.04553092, + "balance_loss_mlp": 1.03106141, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.7797136796303883, + "language_loss": 0.91578126, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93766201, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 2.544473171234131 + }, + { + "auxiliary_loss_clip": 0.01084997, + "auxiliary_loss_mlp": 0.01041408, + "balance_loss_clip": 1.04721618, + "balance_loss_mlp": 1.02110648, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 2.603043313225093, + "language_loss": 0.71637881, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73764282, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.7751219272613525 + }, + { + "auxiliary_loss_clip": 0.01123436, + "auxiliary_loss_mlp": 0.0103835, + "balance_loss_clip": 1.04110551, + "balance_loss_mlp": 1.02182794, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 1.8848278557523535, + "language_loss": 0.71038175, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73199964, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.5941317081451416 + }, + { + "auxiliary_loss_clip": 0.01095611, + "auxiliary_loss_mlp": 0.00750889, + "balance_loss_clip": 1.03907967, + "balance_loss_mlp": 0.99998641, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.9851415147193072, + "language_loss": 0.80096155, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.81942654, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.6818666458129883 + }, + { + "auxiliary_loss_clip": 0.01115723, + "auxiliary_loss_mlp": 0.0075067, + "balance_loss_clip": 1.04263532, + "balance_loss_mlp": 0.99998891, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 3.6368721565064854, + "language_loss": 0.69759107, + "learning_rate": 3.826625952782601e-06, + "loss": 0.71625501, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.624155044555664 + }, + { + "auxiliary_loss_clip": 0.01128329, + "auxiliary_loss_mlp": 0.01038039, + "balance_loss_clip": 1.04386735, + "balance_loss_mlp": 1.01933527, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.1200533208550962, + "language_loss": 0.76982403, + "learning_rate": 3.826467306608095e-06, + "loss": 0.79148769, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.675316333770752 + }, + { + "auxiliary_loss_clip": 0.0109459, + "auxiliary_loss_mlp": 0.01038974, + "balance_loss_clip": 1.03762603, + "balance_loss_mlp": 1.02054429, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 2.357354250825156, + "language_loss": 0.82253671, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84387231, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.7208449840545654 + }, + { + "auxiliary_loss_clip": 0.01091213, + "auxiliary_loss_mlp": 0.01041276, + "balance_loss_clip": 1.03691554, + "balance_loss_mlp": 1.02349007, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.1108289487134075, + "language_loss": 0.73834872, + "learning_rate": 3.826149806485631e-06, + "loss": 0.7596736, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.6130030155181885 + }, + { + "auxiliary_loss_clip": 0.01097248, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_clip": 1.04093671, + "balance_loss_mlp": 1.02448487, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.7945577166114566, + "language_loss": 0.77492702, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79631615, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 2.8966469764709473 + }, + { + "auxiliary_loss_clip": 0.01121285, + "auxiliary_loss_mlp": 0.01046883, + "balance_loss_clip": 1.04273343, + "balance_loss_mlp": 1.02897811, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6116924480618933, + "language_loss": 0.74353206, + "learning_rate": 3.825832029372035e-06, + "loss": 0.76521379, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 2.622039318084717 + }, + { + "auxiliary_loss_clip": 0.01116292, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.04606962, + "balance_loss_mlp": 1.02572656, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.9061732763463135, + "language_loss": 0.75398564, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77561682, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 2.748415470123291 + }, + { + "auxiliary_loss_clip": 0.01111686, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.04680061, + "balance_loss_mlp": 1.03049827, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 5.338167298076926, + "language_loss": 0.90743065, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92904639, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.6768267154693604 + }, + { + "auxiliary_loss_clip": 0.0109139, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.04284656, + "balance_loss_mlp": 1.03266454, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 1.780618322705641, + "language_loss": 0.78304732, + "learning_rate": 3.82535484444872e-06, + "loss": 0.80450022, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 2.7806968688964844 + }, + { + "auxiliary_loss_clip": 0.01112379, + "auxiliary_loss_mlp": 0.00750862, + "balance_loss_clip": 1.03961277, + "balance_loss_mlp": 0.99994516, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 2.8091516855098213, + "language_loss": 0.74746829, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76610076, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.6691887378692627 + }, + { + "auxiliary_loss_clip": 0.01111201, + "auxiliary_loss_mlp": 0.00751178, + "balance_loss_clip": 1.04189932, + "balance_loss_mlp": 1.00006616, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 1.9083314521114518, + "language_loss": 0.81928444, + "learning_rate": 3.825036375068263e-06, + "loss": 0.83790827, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.714923143386841 + }, + { + "auxiliary_loss_clip": 0.01093386, + "auxiliary_loss_mlp": 0.01048698, + "balance_loss_clip": 1.0429914, + "balance_loss_mlp": 1.03008962, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.3158434253347724, + "language_loss": 0.7967155, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81813633, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 2.721571445465088 + }, + { + "auxiliary_loss_clip": 0.01131555, + "auxiliary_loss_mlp": 0.01047318, + "balance_loss_clip": 1.04296708, + "balance_loss_mlp": 1.02892399, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.9662922434329502, + "language_loss": 0.94309312, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96488178, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 2.5760505199432373 + }, + { + "auxiliary_loss_clip": 0.01106654, + "auxiliary_loss_mlp": 0.0104512, + "balance_loss_clip": 1.04107738, + "balance_loss_mlp": 1.02574801, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.8697537299370708, + "language_loss": 0.85260415, + "learning_rate": 3.824558151970974e-06, + "loss": 0.8741219, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 2.6231272220611572 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.00750939, + "balance_loss_clip": 1.04115951, + "balance_loss_mlp": 0.99995279, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.791107188100838, + "language_loss": 0.8164019, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83498085, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 2.655287265777588 + }, + { + "auxiliary_loss_clip": 0.01143835, + "auxiliary_loss_mlp": 0.01047833, + "balance_loss_clip": 1.04821754, + "balance_loss_mlp": 1.02812791, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 1.7012121138018725, + "language_loss": 0.73679423, + "learning_rate": 3.824238990625567e-06, + "loss": 0.75871098, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 2.5566277503967285 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01051067, + "balance_loss_clip": 1.04560208, + "balance_loss_mlp": 1.03199315, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.4546882317664975, + "language_loss": 0.77404606, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79585201, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 2.5950751304626465 + }, + { + "auxiliary_loss_clip": 0.01042532, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.01963401, + "balance_loss_mlp": 1.02646625, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8179222299563641, + "language_loss": 0.55554593, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57625782, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 3.031132221221924 + }, + { + "auxiliary_loss_clip": 0.01130028, + "auxiliary_loss_mlp": 0.01041853, + "balance_loss_clip": 1.04284954, + "balance_loss_mlp": 1.02364957, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 2.450166366140642, + "language_loss": 0.77469581, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79641461, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 4.074232339859009 + }, + { + "auxiliary_loss_clip": 0.0112848, + "auxiliary_loss_mlp": 0.01041527, + "balance_loss_clip": 1.04462409, + "balance_loss_mlp": 1.02310896, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 2.0654172993547903, + "language_loss": 0.64820445, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66990447, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.6432814598083496 + }, + { + "auxiliary_loss_clip": 0.01136145, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.05083036, + "balance_loss_mlp": 1.02588439, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 2.339619154583385, + "language_loss": 0.85143489, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87325859, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.6031510829925537 + }, + { + "auxiliary_loss_clip": 0.01110095, + "auxiliary_loss_mlp": 0.01049423, + "balance_loss_clip": 1.04690099, + "balance_loss_mlp": 1.03133881, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.380542644017677, + "language_loss": 0.72542059, + "learning_rate": 3.823279846575403e-06, + "loss": 0.74701583, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 4.203780889511108 + }, + { + "auxiliary_loss_clip": 0.01130356, + "auxiliary_loss_mlp": 0.01044093, + "balance_loss_clip": 1.04398286, + "balance_loss_mlp": 1.02425718, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 2.1622964772182245, + "language_loss": 0.84489024, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86663473, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.5914196968078613 + }, + { + "auxiliary_loss_clip": 0.01087835, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.03968239, + "balance_loss_mlp": 1.02279878, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.7682930452089054, + "language_loss": 0.82404304, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84534895, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.7621266841888428 + }, + { + "auxiliary_loss_clip": 0.01132316, + "auxiliary_loss_mlp": 0.01050005, + "balance_loss_clip": 1.04835939, + "balance_loss_mlp": 1.03306568, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.88783871703462, + "language_loss": 0.72766221, + "learning_rate": 3.822799341092573e-06, + "loss": 0.74948549, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.535301446914673 + }, + { + "auxiliary_loss_clip": 0.0111613, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.04353726, + "balance_loss_mlp": 1.0211916, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 1.6807953666955813, + "language_loss": 0.7657041, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78725457, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 2.8324708938598633 + }, + { + "auxiliary_loss_clip": 0.01131222, + "auxiliary_loss_mlp": 0.01050028, + "balance_loss_clip": 1.04658389, + "balance_loss_mlp": 1.03088307, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.7530010044646014, + "language_loss": 0.7083106, + "learning_rate": 3.822478658490228e-06, + "loss": 0.7301231, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.7091152667999268 + }, + { + "auxiliary_loss_clip": 0.01016149, + "auxiliary_loss_mlp": 0.00748379, + "balance_loss_clip": 1.01442134, + "balance_loss_mlp": 0.99972898, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.7751209596073592, + "language_loss": 0.51778251, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53542781, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 6.5172107219696045 + }, + { + "auxiliary_loss_clip": 0.01122783, + "auxiliary_loss_mlp": 0.01044065, + "balance_loss_clip": 1.04249763, + "balance_loss_mlp": 1.02387071, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.7100044347138035, + "language_loss": 0.80496848, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82663691, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 2.665342330932617 + }, + { + "auxiliary_loss_clip": 0.01116835, + "auxiliary_loss_mlp": 0.01045101, + "balance_loss_clip": 1.04424477, + "balance_loss_mlp": 1.02739811, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.9890937263503772, + "language_loss": 0.69317418, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71479356, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.7085394859313965 + }, + { + "auxiliary_loss_clip": 0.01124529, + "auxiliary_loss_mlp": 0.01047602, + "balance_loss_clip": 1.05258286, + "balance_loss_mlp": 1.02838516, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.794642177640119, + "language_loss": 0.87234157, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89406288, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.01144295, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_clip": 1.04728651, + "balance_loss_mlp": 1.02603626, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 1.7000762651545667, + "language_loss": 0.74560058, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76749408, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.6717681884765625 + }, + { + "auxiliary_loss_clip": 0.01111429, + "auxiliary_loss_mlp": 0.00751062, + "balance_loss_clip": 1.04285192, + "balance_loss_mlp": 1.00004148, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.9513530544861795, + "language_loss": 0.70216739, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72079223, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 2.7444911003112793 + }, + { + "auxiliary_loss_clip": 0.0109734, + "auxiliary_loss_mlp": 0.01053799, + "balance_loss_clip": 1.04627347, + "balance_loss_mlp": 1.03439188, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 2.4979930381743882, + "language_loss": 0.71954083, + "learning_rate": 3.821354092781567e-06, + "loss": 0.74105227, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 2.6938302516937256 + }, + { + "auxiliary_loss_clip": 0.01130222, + "auxiliary_loss_mlp": 0.01046118, + "balance_loss_clip": 1.0452739, + "balance_loss_mlp": 1.02818859, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8246178384557818, + "language_loss": 0.82061863, + "learning_rate": 3.821193164224981e-06, + "loss": 0.84238207, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 2.6162619590759277 + }, + { + "auxiliary_loss_clip": 0.01133027, + "auxiliary_loss_mlp": 0.01046138, + "balance_loss_clip": 1.04263914, + "balance_loss_mlp": 1.02607465, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.67568067047942, + "language_loss": 0.71566212, + "learning_rate": 3.821032166608568e-06, + "loss": 0.73745376, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 2.6097404956817627 + }, + { + "auxiliary_loss_clip": 0.01099425, + "auxiliary_loss_mlp": 0.01052299, + "balance_loss_clip": 1.04229307, + "balance_loss_mlp": 1.03428674, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 1.9004896159739517, + "language_loss": 0.75838065, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.77989793, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.690244674682617 + }, + { + "auxiliary_loss_clip": 0.01141547, + "auxiliary_loss_mlp": 0.01047292, + "balance_loss_clip": 1.04642725, + "balance_loss_mlp": 1.0287075, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.778445861505838, + "language_loss": 0.87505627, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89694464, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.5707786083221436 + }, + { + "auxiliary_loss_clip": 0.0112547, + "auxiliary_loss_mlp": 0.01048207, + "balance_loss_clip": 1.04486179, + "balance_loss_mlp": 1.03191161, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.9502364677217519, + "language_loss": 0.88046217, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90219897, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.5786120891571045 + }, + { + "auxiliary_loss_clip": 0.01130977, + "auxiliary_loss_mlp": 0.01050525, + "balance_loss_clip": 1.04306412, + "balance_loss_mlp": 1.02971101, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.011445466254161, + "language_loss": 0.82197809, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84379309, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.561833620071411 + }, + { + "auxiliary_loss_clip": 0.01145491, + "auxiliary_loss_mlp": 0.01053078, + "balance_loss_clip": 1.04459345, + "balance_loss_mlp": 1.03325331, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 2.1861304777673194, + "language_loss": 0.8141492, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83613491, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.646705389022827 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.01051831, + "balance_loss_clip": 1.04532325, + "balance_loss_mlp": 1.03497446, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.3946214548396758, + "language_loss": 0.8362059, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85810542, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.5695314407348633 + }, + { + "auxiliary_loss_clip": 0.01094128, + "auxiliary_loss_mlp": 0.0105889, + "balance_loss_clip": 1.04087472, + "balance_loss_mlp": 1.03924382, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9205850817842618, + "language_loss": 0.69353026, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71506041, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.649064064025879 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_clip": 1.04677498, + "balance_loss_mlp": 1.0269413, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 11.99911985123552, + "language_loss": 0.82522559, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84701645, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.5708577632904053 + }, + { + "auxiliary_loss_clip": 0.01149625, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.04628658, + "balance_loss_mlp": 1.03062391, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 2.0394971096236483, + "language_loss": 0.88735241, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90935516, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 2.537384510040283 + }, + { + "auxiliary_loss_clip": 0.01134514, + "auxiliary_loss_mlp": 0.01041808, + "balance_loss_clip": 1.04329348, + "balance_loss_mlp": 1.02489209, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.5421225520426018, + "language_loss": 0.80926883, + "learning_rate": 3.819418393498343e-06, + "loss": 0.83103204, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 2.653566598892212 + }, + { + "auxiliary_loss_clip": 0.01123794, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.04315019, + "balance_loss_mlp": 1.02389884, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.625611588647069, + "language_loss": 0.77830851, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79996479, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 2.596391201019287 + }, + { + "auxiliary_loss_clip": 0.01110811, + "auxiliary_loss_mlp": 0.01041512, + "balance_loss_clip": 1.04054737, + "balance_loss_mlp": 1.02425039, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.079093452783756, + "language_loss": 0.86167979, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88320303, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.5972182750701904 + }, + { + "auxiliary_loss_clip": 0.01123828, + "auxiliary_loss_mlp": 0.00750671, + "balance_loss_clip": 1.04149723, + "balance_loss_mlp": 1.0000639, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.4303398080957768, + "language_loss": 0.80338502, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82212996, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.617387294769287 + }, + { + "auxiliary_loss_clip": 0.01116738, + "auxiliary_loss_mlp": 0.01041357, + "balance_loss_clip": 1.04389441, + "balance_loss_mlp": 1.0239408, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.6202822738466442, + "language_loss": 0.73192602, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75350696, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.581662893295288 + }, + { + "auxiliary_loss_clip": 0.01128188, + "auxiliary_loss_mlp": 0.01043292, + "balance_loss_clip": 1.04317605, + "balance_loss_mlp": 1.02446914, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 1.8390110633665142, + "language_loss": 0.73065156, + "learning_rate": 3.81860891934076e-06, + "loss": 0.75236636, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 2.5488944053649902 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01041258, + "balance_loss_clip": 1.03972661, + "balance_loss_mlp": 1.02238703, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.6986378705137786, + "language_loss": 0.70544827, + "learning_rate": 3.818446817599176e-06, + "loss": 0.7272259, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 2.605414628982544 + }, + { + "auxiliary_loss_clip": 0.01005613, + "auxiliary_loss_mlp": 0.01015759, + "balance_loss_clip": 1.01265001, + "balance_loss_mlp": 1.01290953, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7821496833653524, + "language_loss": 0.53398705, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55420077, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 3.171246290206909 + }, + { + "auxiliary_loss_clip": 0.01112263, + "auxiliary_loss_mlp": 0.0075109, + "balance_loss_clip": 1.04218471, + "balance_loss_mlp": 1.00001597, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 3.1325578485775867, + "language_loss": 0.75711519, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77574873, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 2.5915489196777344 + }, + { + "auxiliary_loss_clip": 0.01103396, + "auxiliary_loss_mlp": 0.01041232, + "balance_loss_clip": 1.04125571, + "balance_loss_mlp": 1.0241971, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.8746972346106456, + "language_loss": 0.72722858, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74867487, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 2.681283712387085 + }, + { + "auxiliary_loss_clip": 0.01116749, + "auxiliary_loss_mlp": 0.01045933, + "balance_loss_clip": 1.0426178, + "balance_loss_mlp": 1.02813518, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 6.755072899119803, + "language_loss": 0.83450937, + "learning_rate": 3.817797721137495e-06, + "loss": 0.8561362, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.591766119003296 + }, + { + "auxiliary_loss_clip": 0.01075664, + "auxiliary_loss_mlp": 0.00751071, + "balance_loss_clip": 1.03742695, + "balance_loss_mlp": 1.00001693, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 1.9490853600664604, + "language_loss": 0.86286175, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88112915, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 2.7049100399017334 + }, + { + "auxiliary_loss_clip": 0.01114307, + "auxiliary_loss_mlp": 0.00750655, + "balance_loss_clip": 1.03968072, + "balance_loss_mlp": 0.99997067, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.604917603851203, + "language_loss": 0.91550332, + "learning_rate": 3.817472759295605e-06, + "loss": 0.9341529, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.709961414337158 + }, + { + "auxiliary_loss_clip": 0.01099617, + "auxiliary_loss_mlp": 0.01056246, + "balance_loss_clip": 1.04546237, + "balance_loss_mlp": 1.03772116, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.093990283150592, + "language_loss": 0.81761813, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83917671, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 4.16123366355896 + }, + { + "auxiliary_loss_clip": 0.01122303, + "auxiliary_loss_mlp": 0.01046721, + "balance_loss_clip": 1.04012799, + "balance_loss_mlp": 1.02901864, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.7440576649681456, + "language_loss": 0.81019557, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83188581, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.5779757499694824 + }, + { + "auxiliary_loss_clip": 0.01142758, + "auxiliary_loss_mlp": 0.01063317, + "balance_loss_clip": 1.04396081, + "balance_loss_mlp": 1.04529214, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 1.9465916597584345, + "language_loss": 0.76762903, + "learning_rate": 3.816984799657568e-06, + "loss": 0.78968972, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.5583057403564453 + }, + { + "auxiliary_loss_clip": 0.01129239, + "auxiliary_loss_mlp": 0.01049867, + "balance_loss_clip": 1.04641855, + "balance_loss_mlp": 1.0318898, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.1203443531874426, + "language_loss": 0.78763139, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.80942243, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 4.034500598907471 + }, + { + "auxiliary_loss_clip": 0.01122633, + "auxiliary_loss_mlp": 0.01056584, + "balance_loss_clip": 1.04263949, + "balance_loss_mlp": 1.03914392, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 1.6497844222649964, + "language_loss": 0.78082573, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80261791, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.5951132774353027 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_clip": 1.03880954, + "balance_loss_mlp": 1.02895999, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.051274753294977, + "language_loss": 0.81693488, + "learning_rate": 3.816496219917336e-06, + "loss": 0.83844912, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 2.728255033493042 + }, + { + "auxiliary_loss_clip": 0.01118973, + "auxiliary_loss_mlp": 0.01059897, + "balance_loss_clip": 1.0451417, + "balance_loss_mlp": 1.04252815, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 2.530141544748788, + "language_loss": 0.86417073, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88595939, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 2.7523059844970703 + }, + { + "auxiliary_loss_clip": 0.01112157, + "auxiliary_loss_mlp": 0.01048436, + "balance_loss_clip": 1.04199481, + "balance_loss_mlp": 1.03151989, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.8691857834411187, + "language_loss": 0.76240987, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78401577, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.7712931632995605 + }, + { + "auxiliary_loss_clip": 0.01122801, + "auxiliary_loss_mlp": 0.01042034, + "balance_loss_clip": 1.04530263, + "balance_loss_mlp": 1.02582192, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 1.9091457506322214, + "language_loss": 0.73626721, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75791556, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.712770938873291 + }, + { + "auxiliary_loss_clip": 0.01098242, + "auxiliary_loss_mlp": 0.01039919, + "balance_loss_clip": 1.03668547, + "balance_loss_mlp": 1.02275264, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.5994476526881816, + "language_loss": 0.72423327, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74561489, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 4.313385009765625 + }, + { + "auxiliary_loss_clip": 0.01084151, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_clip": 1.04092336, + "balance_loss_mlp": 1.02820754, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 1.950146075245899, + "language_loss": 0.75021541, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77153337, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 4.24636435508728 + }, + { + "auxiliary_loss_clip": 0.01083638, + "auxiliary_loss_mlp": 0.01052798, + "balance_loss_clip": 1.03591275, + "balance_loss_mlp": 1.03236544, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.8672912412753233, + "language_loss": 0.7896536, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81101793, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.7381958961486816 + }, + { + "auxiliary_loss_clip": 0.01122256, + "auxiliary_loss_mlp": 0.00750929, + "balance_loss_clip": 1.04070389, + "balance_loss_mlp": 1.00000882, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 1.8848082701283404, + "language_loss": 0.84730911, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86604095, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 2.6584739685058594 + }, + { + "auxiliary_loss_clip": 0.01078811, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.03522205, + "balance_loss_mlp": 1.01694226, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 3.7000708819926285, + "language_loss": 0.7146017, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73573995, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 2.7393152713775635 + }, + { + "auxiliary_loss_clip": 0.01089428, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.0376668, + "balance_loss_mlp": 1.0231961, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 1.857279520730098, + "language_loss": 0.70203924, + "learning_rate": 3.815026761751955e-06, + "loss": 0.72333509, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 2.652338981628418 + }, + { + "auxiliary_loss_clip": 0.01086235, + "auxiliary_loss_mlp": 0.0103988, + "balance_loss_clip": 1.03885722, + "balance_loss_mlp": 1.02327418, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.7594434355658963, + "language_loss": 0.88440812, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90566927, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 2.7045955657958984 + }, + { + "auxiliary_loss_clip": 0.01129941, + "auxiliary_loss_mlp": 0.01043561, + "balance_loss_clip": 1.04557061, + "balance_loss_mlp": 1.02583468, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 2.703561150090451, + "language_loss": 0.7399652, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76170021, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.6283483505249023 + }, + { + "auxiliary_loss_clip": 0.01121782, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.04139173, + "balance_loss_mlp": 1.02564859, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.5602246970524112, + "language_loss": 0.82689017, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84852302, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 2.629922389984131 + }, + { + "auxiliary_loss_clip": 0.01129603, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.04294264, + "balance_loss_mlp": 1.02336454, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.3307066804661414, + "language_loss": 0.85092545, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87263292, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.550657033920288 + }, + { + "auxiliary_loss_clip": 0.01138886, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.04328799, + "balance_loss_mlp": 1.02438021, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.8050640855541518, + "language_loss": 0.72818077, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74998009, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.5656845569610596 + }, + { + "auxiliary_loss_clip": 0.01112798, + "auxiliary_loss_mlp": 0.01043667, + "balance_loss_clip": 1.03608418, + "balance_loss_mlp": 1.0238781, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 1.5366490107266393, + "language_loss": 0.74406028, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76562488, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.783278226852417 + }, + { + "auxiliary_loss_clip": 0.01091193, + "auxiliary_loss_mlp": 0.01044986, + "balance_loss_clip": 1.03741658, + "balance_loss_mlp": 1.02638912, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 6.532167403994886, + "language_loss": 0.79294944, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.81431127, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.6450183391571045 + }, + { + "auxiliary_loss_clip": 0.011129, + "auxiliary_loss_mlp": 0.01044864, + "balance_loss_clip": 1.03981745, + "balance_loss_mlp": 1.02627885, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 2.1026746808302272, + "language_loss": 0.6930809, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.7146585, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 2.631761312484741 + }, + { + "auxiliary_loss_clip": 0.01113109, + "auxiliary_loss_mlp": 0.01046761, + "balance_loss_clip": 1.03974295, + "balance_loss_mlp": 1.02790201, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 1.8615333120808917, + "language_loss": 0.81050587, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.83210456, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 2.730985641479492 + }, + { + "auxiliary_loss_clip": 0.01104371, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_clip": 1.03859258, + "balance_loss_mlp": 1.02943778, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.0933187010513117, + "language_loss": 0.82168126, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84321666, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 2.6885502338409424 + }, + { + "auxiliary_loss_clip": 0.0104696, + "auxiliary_loss_mlp": 0.01042217, + "balance_loss_clip": 1.03574705, + "balance_loss_mlp": 1.02483618, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.4312243734866463, + "language_loss": 0.78428847, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80518019, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 2.7835335731506348 + }, + { + "auxiliary_loss_clip": 0.01116802, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_clip": 1.04314232, + "balance_loss_mlp": 1.02698016, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 2.670865869405056, + "language_loss": 0.81476068, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83636504, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 2.6477549076080322 + }, + { + "auxiliary_loss_clip": 0.01118267, + "auxiliary_loss_mlp": 0.01045774, + "balance_loss_clip": 1.03987479, + "balance_loss_mlp": 1.02768946, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 2.25805070843992, + "language_loss": 0.87248206, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89412236, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 2.6383588314056396 + }, + { + "auxiliary_loss_clip": 0.01107418, + "auxiliary_loss_mlp": 0.01052852, + "balance_loss_clip": 1.03839421, + "balance_loss_mlp": 1.03479207, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 1.8419187909300005, + "language_loss": 0.71967536, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74127805, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.6420722007751465 + }, + { + "auxiliary_loss_clip": 0.01124765, + "auxiliary_loss_mlp": 0.01039082, + "balance_loss_clip": 1.03987789, + "balance_loss_mlp": 1.02128363, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.7047573679727044, + "language_loss": 0.81803077, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.83966923, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 2.6522064208984375 + }, + { + "auxiliary_loss_clip": 0.01076772, + "auxiliary_loss_mlp": 0.01058278, + "balance_loss_clip": 1.03483653, + "balance_loss_mlp": 1.03591394, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 1.9317436284277498, + "language_loss": 0.69428378, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71563429, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 2.867009401321411 + }, + { + "auxiliary_loss_clip": 0.01136656, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.04210865, + "balance_loss_mlp": 1.02294862, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.9071573507823052, + "language_loss": 0.79638058, + "learning_rate": 3.812235911671472e-06, + "loss": 0.81814504, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 2.6839163303375244 + }, + { + "auxiliary_loss_clip": 0.01110965, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_clip": 1.03982139, + "balance_loss_mlp": 1.02620244, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 1.7490924250449655, + "language_loss": 0.84332436, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86487758, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 2.6949400901794434 + }, + { + "auxiliary_loss_clip": 0.01132576, + "auxiliary_loss_mlp": 0.0104358, + "balance_loss_clip": 1.04080176, + "balance_loss_mlp": 1.02591276, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 3.184393885612006, + "language_loss": 0.85623258, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87799412, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 2.5421698093414307 + }, + { + "auxiliary_loss_clip": 0.0110506, + "auxiliary_loss_mlp": 0.01040802, + "balance_loss_clip": 1.03934038, + "balance_loss_mlp": 1.02398133, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.8426691408336693, + "language_loss": 0.83205807, + "learning_rate": 3.811741346238036e-06, + "loss": 0.8535167, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 2.657498359680176 + }, + { + "auxiliary_loss_clip": 0.0110537, + "auxiliary_loss_mlp": 0.01049243, + "balance_loss_clip": 1.04320538, + "balance_loss_mlp": 1.03226721, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 1.8205087930111523, + "language_loss": 0.76874161, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.79028761, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 2.6099462509155273 + }, + { + "auxiliary_loss_clip": 0.01134135, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_clip": 1.04115868, + "balance_loss_mlp": 1.0270431, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.7221033571900897, + "language_loss": 0.80779731, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82958734, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.571763038635254 + }, + { + "auxiliary_loss_clip": 0.01127481, + "auxiliary_loss_mlp": 0.0103926, + "balance_loss_clip": 1.04325151, + "balance_loss_mlp": 1.02209425, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 2.3562152067968634, + "language_loss": 0.69587934, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71754676, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 2.5362980365753174 + }, + { + "auxiliary_loss_clip": 0.01139164, + "auxiliary_loss_mlp": 0.00750639, + "balance_loss_clip": 1.04521275, + "balance_loss_mlp": 1.00004601, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.6327657471076646, + "language_loss": 0.87712067, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89601874, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 2.532589912414551 + }, + { + "auxiliary_loss_clip": 0.01123885, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.03927708, + "balance_loss_mlp": 1.01952636, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.434696532604237, + "language_loss": 0.78818196, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.80979294, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 4.0157554149627686 + }, + { + "auxiliary_loss_clip": 0.01127513, + "auxiliary_loss_mlp": 0.01041401, + "balance_loss_clip": 1.04381227, + "balance_loss_mlp": 1.0245688, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.6358808205245556, + "language_loss": 0.95039034, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97207952, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.5826597213745117 + }, + { + "auxiliary_loss_clip": 0.01049897, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.04298329, + "balance_loss_mlp": 1.02739072, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 2.0537807184774564, + "language_loss": 0.71168131, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73264146, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 2.9251770973205566 + }, + { + "auxiliary_loss_clip": 0.01039067, + "auxiliary_loss_mlp": 0.01068699, + "balance_loss_clip": 1.01625586, + "balance_loss_mlp": 1.06663692, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7873823484038817, + "language_loss": 0.54016322, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56124085, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.415212392807007 + }, + { + "auxiliary_loss_clip": 0.01133812, + "auxiliary_loss_mlp": 0.00750699, + "balance_loss_clip": 1.03983665, + "balance_loss_mlp": 1.00006008, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 2.0208907255922663, + "language_loss": 0.75460249, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77344763, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 2.5708742141723633 + }, + { + "auxiliary_loss_clip": 0.01110345, + "auxiliary_loss_mlp": 0.01048252, + "balance_loss_clip": 1.04102826, + "balance_loss_mlp": 1.02791536, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.031239087804796, + "language_loss": 0.86689711, + "learning_rate": 3.810088330151188e-06, + "loss": 0.88848305, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 4.182969570159912 + }, + { + "auxiliary_loss_clip": 0.01091531, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.03408313, + "balance_loss_mlp": 1.02694619, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.7043651804002096, + "language_loss": 0.73477125, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75613314, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.7499120235443115 + }, + { + "auxiliary_loss_clip": 0.01097439, + "auxiliary_loss_mlp": 0.01041161, + "balance_loss_clip": 1.03915942, + "balance_loss_mlp": 1.02379203, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6077358502030892, + "language_loss": 0.74881077, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77019674, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 2.7230234146118164 + }, + { + "auxiliary_loss_clip": 0.01114374, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.04200482, + "balance_loss_mlp": 1.02576351, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.7836531796905202, + "language_loss": 0.84481138, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86638463, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.6825408935546875 + }, + { + "auxiliary_loss_clip": 0.01138457, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.04434609, + "balance_loss_mlp": 1.02713513, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 4.125301550974842, + "language_loss": 0.79263729, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81445831, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.558706045150757 + }, + { + "auxiliary_loss_clip": 0.01073552, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_clip": 1.03816175, + "balance_loss_mlp": 1.024423, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 2.5074471587966345, + "language_loss": 0.75031865, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77148044, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 4.359051465988159 + }, + { + "auxiliary_loss_clip": 0.01092137, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_clip": 1.03985167, + "balance_loss_mlp": 1.02519751, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.623213865173689, + "language_loss": 0.72918785, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75054038, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 4.351848363876343 + }, + { + "auxiliary_loss_clip": 0.01106306, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.03939629, + "balance_loss_mlp": 1.02664995, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 1.8397967192361806, + "language_loss": 0.89237905, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91388184, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.666560649871826 + }, + { + "auxiliary_loss_clip": 0.01090143, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.03915894, + "balance_loss_mlp": 1.02976239, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 3.048736532240507, + "language_loss": 0.8828615, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90424049, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 2.7524380683898926 + }, + { + "auxiliary_loss_clip": 0.01044737, + "auxiliary_loss_mlp": 0.01008921, + "balance_loss_clip": 1.01208818, + "balance_loss_mlp": 1.00672805, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7900657798975959, + "language_loss": 0.59825206, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61878872, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 3.0672755241394043 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.01042559, + "balance_loss_clip": 1.03959572, + "balance_loss_mlp": 1.0239861, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8348173873574025, + "language_loss": 0.8236146, + "learning_rate": 3.808428450193401e-06, + "loss": 0.84526557, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.646507740020752 + }, + { + "auxiliary_loss_clip": 0.0114464, + "auxiliary_loss_mlp": 0.01045636, + "balance_loss_clip": 1.04509747, + "balance_loss_mlp": 1.02658629, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.939312564237191, + "language_loss": 0.6930455, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.7149483, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.503641366958618 + }, + { + "auxiliary_loss_clip": 0.01124873, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.04427004, + "balance_loss_mlp": 1.02308416, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3083798820509376, + "language_loss": 0.8848561, + "learning_rate": 3.808095651090769e-06, + "loss": 0.9065029, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.5891284942626953 + }, + { + "auxiliary_loss_clip": 0.01030494, + "auxiliary_loss_mlp": 0.01010933, + "balance_loss_clip": 1.00825012, + "balance_loss_mlp": 1.008775, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.6379247266294888, + "language_loss": 0.52844828, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54886252, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.2385618686676025 + }, + { + "auxiliary_loss_clip": 0.01114364, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.04249191, + "balance_loss_mlp": 1.02918601, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.6861079371483125, + "language_loss": 0.8524304, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87405628, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.6145596504211426 + }, + { + "auxiliary_loss_clip": 0.01026011, + "auxiliary_loss_mlp": 0.01007606, + "balance_loss_clip": 1.01385379, + "balance_loss_mlp": 1.00538909, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.8082300290805441, + "language_loss": 0.57486415, + "learning_rate": 3.80759593822885e-06, + "loss": 0.5952003, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 3.048182964324951 + }, + { + "auxiliary_loss_clip": 0.0101195, + "auxiliary_loss_mlp": 0.01014733, + "balance_loss_clip": 1.01261139, + "balance_loss_mlp": 1.01233673, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8860600414766505, + "language_loss": 0.56236023, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58262706, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 3.074516773223877 + }, + { + "auxiliary_loss_clip": 0.01088546, + "auxiliary_loss_mlp": 0.01057705, + "balance_loss_clip": 1.03831172, + "balance_loss_mlp": 1.03985953, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 3.4452698764873952, + "language_loss": 0.70590949, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72737199, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.743098735809326 + }, + { + "auxiliary_loss_clip": 0.01118139, + "auxiliary_loss_mlp": 0.01044551, + "balance_loss_clip": 1.03706884, + "balance_loss_mlp": 1.02634716, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.835155800854185, + "language_loss": 0.86150974, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88313663, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.690781831741333 + }, + { + "auxiliary_loss_clip": 0.01074164, + "auxiliary_loss_mlp": 0.01047674, + "balance_loss_clip": 1.03778374, + "balance_loss_mlp": 1.03074598, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 1.9434157812347823, + "language_loss": 0.8189044, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.8401227, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 2.7708821296691895 + }, + { + "auxiliary_loss_clip": 0.01096503, + "auxiliary_loss_mlp": 0.01047863, + "balance_loss_clip": 1.03929472, + "balance_loss_mlp": 1.02968359, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.4264094152886377, + "language_loss": 0.82812846, + "learning_rate": 3.806761712658952e-06, + "loss": 0.84957212, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 2.7296924591064453 + }, + { + "auxiliary_loss_clip": 0.01123999, + "auxiliary_loss_mlp": 0.01046332, + "balance_loss_clip": 1.0435015, + "balance_loss_mlp": 1.02965438, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.908514275054457, + "language_loss": 0.80659437, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82829767, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.6148552894592285 + }, + { + "auxiliary_loss_clip": 0.0111934, + "auxiliary_loss_mlp": 0.01042844, + "balance_loss_clip": 1.04595923, + "balance_loss_mlp": 1.02534413, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 2.0325670628204433, + "language_loss": 0.80279148, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82441336, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 2.6307270526885986 + }, + { + "auxiliary_loss_clip": 0.01123047, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.03934479, + "balance_loss_mlp": 1.02380681, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.9473345649542113, + "language_loss": 0.8534708, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87511337, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 2.6018319129943848 + }, + { + "auxiliary_loss_clip": 0.01114394, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.04301453, + "balance_loss_mlp": 1.01982033, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 5.439970817542904, + "language_loss": 0.74539185, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76690805, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 2.6379964351654053 + }, + { + "auxiliary_loss_clip": 0.01088752, + "auxiliary_loss_mlp": 0.00750688, + "balance_loss_clip": 1.03796828, + "balance_loss_mlp": 0.99999678, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.1837742587560993, + "language_loss": 0.65245026, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67084467, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 2.842184543609619 + }, + { + "auxiliary_loss_clip": 0.01095363, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.03723717, + "balance_loss_mlp": 1.01973403, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.4896672894970893, + "language_loss": 0.78602433, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80735934, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 2.674980640411377 + }, + { + "auxiliary_loss_clip": 0.01069274, + "auxiliary_loss_mlp": 0.01043516, + "balance_loss_clip": 1.03462267, + "balance_loss_mlp": 1.02515781, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.9910188786241163, + "language_loss": 0.75075287, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77188075, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 2.69858980178833 + }, + { + "auxiliary_loss_clip": 0.01106547, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04015839, + "balance_loss_mlp": 1.02703953, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 2.114179025485308, + "language_loss": 0.67987704, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70140296, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.7419698238372803 + }, + { + "auxiliary_loss_clip": 0.01134965, + "auxiliary_loss_mlp": 0.01042687, + "balance_loss_clip": 1.04236639, + "balance_loss_mlp": 1.02519894, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 1.5338354792322715, + "language_loss": 0.69912016, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72089666, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 2.5863068103790283 + }, + { + "auxiliary_loss_clip": 0.01113388, + "auxiliary_loss_mlp": 0.01048643, + "balance_loss_clip": 1.03877711, + "balance_loss_mlp": 1.02856827, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.7924455300747058, + "language_loss": 0.60714662, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62876701, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 2.7092537879943848 + }, + { + "auxiliary_loss_clip": 0.01023347, + "auxiliary_loss_mlp": 0.01026248, + "balance_loss_clip": 1.01095796, + "balance_loss_mlp": 1.02384043, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.7978873427616715, + "language_loss": 0.58817708, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60867304, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.209890127182007 + }, + { + "auxiliary_loss_clip": 0.01116116, + "auxiliary_loss_mlp": 0.01043937, + "balance_loss_clip": 1.03944588, + "balance_loss_mlp": 1.02575767, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.7423804477568052, + "language_loss": 0.75720882, + "learning_rate": 3.80475258451721e-06, + "loss": 0.77880937, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 2.6497302055358887 + }, + { + "auxiliary_loss_clip": 0.01125288, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_clip": 1.04173541, + "balance_loss_mlp": 1.02549791, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 1.8519032065638008, + "language_loss": 0.77397382, + "learning_rate": 3.804584712183972e-06, + "loss": 0.7956534, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 2.612584114074707 + }, + { + "auxiliary_loss_clip": 0.01020192, + "auxiliary_loss_mlp": 0.01002009, + "balance_loss_clip": 1.00884068, + "balance_loss_mlp": 0.99982756, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8559376897744465, + "language_loss": 0.593328, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61354995, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 3.031022071838379 + }, + { + "auxiliary_loss_clip": 0.01125037, + "auxiliary_loss_mlp": 0.01058208, + "balance_loss_clip": 1.04149759, + "balance_loss_mlp": 1.03937268, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.605899219817326, + "language_loss": 0.70105278, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72288525, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 4.2788567543029785 + }, + { + "auxiliary_loss_clip": 0.01103341, + "auxiliary_loss_mlp": 0.01063859, + "balance_loss_clip": 1.03998864, + "balance_loss_mlp": 1.04639482, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.9308771875991302, + "language_loss": 0.79662985, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.8183018, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.697279453277588 + }, + { + "auxiliary_loss_clip": 0.01103093, + "auxiliary_loss_mlp": 0.01059663, + "balance_loss_clip": 1.04257572, + "balance_loss_mlp": 1.04070854, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 2.0692540947988753, + "language_loss": 0.71001673, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73164433, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.679210662841797 + }, + { + "auxiliary_loss_clip": 0.0111091, + "auxiliary_loss_mlp": 0.01058166, + "balance_loss_clip": 1.04445255, + "balance_loss_mlp": 1.04035616, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 1.8800972600142236, + "language_loss": 0.71594566, + "learning_rate": 3.803744324194691e-06, + "loss": 0.73763639, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 2.6991190910339355 + }, + { + "auxiliary_loss_clip": 0.01124992, + "auxiliary_loss_mlp": 0.01060452, + "balance_loss_clip": 1.04333544, + "balance_loss_mlp": 1.04241562, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.8715216745530936, + "language_loss": 0.76854026, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79039472, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 4.0903544425964355 + }, + { + "auxiliary_loss_clip": 0.01116198, + "auxiliary_loss_mlp": 0.01054468, + "balance_loss_clip": 1.04221964, + "balance_loss_mlp": 1.03670609, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 5.17814466709411, + "language_loss": 0.71745396, + "learning_rate": 3.803407690167187e-06, + "loss": 0.73916072, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.6651248931884766 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01052023, + "balance_loss_clip": 1.03878212, + "balance_loss_mlp": 1.03509486, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.9664061750764519, + "language_loss": 0.84293395, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86456549, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 2.604025363922119 + }, + { + "auxiliary_loss_clip": 0.01081482, + "auxiliary_loss_mlp": 0.01057095, + "balance_loss_clip": 1.03907871, + "balance_loss_mlp": 1.03929758, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6698215392654483, + "language_loss": 0.8117134, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83309925, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.735380172729492 + }, + { + "auxiliary_loss_clip": 0.01119639, + "auxiliary_loss_mlp": 0.01044583, + "balance_loss_clip": 1.04052377, + "balance_loss_mlp": 1.0291816, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.4714346447129152, + "language_loss": 0.74879909, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77044123, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 2.586158275604248 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.0104743, + "balance_loss_clip": 1.04437399, + "balance_loss_mlp": 1.03170681, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.7342205061555058, + "language_loss": 0.79661167, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81845605, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.5437886714935303 + }, + { + "auxiliary_loss_clip": 0.01056436, + "auxiliary_loss_mlp": 0.01050337, + "balance_loss_clip": 1.03800905, + "balance_loss_mlp": 1.02916527, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.4547452385161823, + "language_loss": 0.7067821, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72784984, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 4.331611156463623 + }, + { + "auxiliary_loss_clip": 0.01092142, + "auxiliary_loss_mlp": 0.00750775, + "balance_loss_clip": 1.04154539, + "balance_loss_mlp": 1.00000238, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 2.0366946719598933, + "language_loss": 0.83715856, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85558772, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 4.321947813034058 + }, + { + "auxiliary_loss_clip": 0.01104076, + "auxiliary_loss_mlp": 0.01053867, + "balance_loss_clip": 1.03915501, + "balance_loss_mlp": 1.03661728, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.0785392187636154, + "language_loss": 0.829391, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.85097039, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 2.5720033645629883 + }, + { + "auxiliary_loss_clip": 0.01129358, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_clip": 1.04433513, + "balance_loss_mlp": 1.03088868, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.5619316486968, + "language_loss": 0.80579722, + "learning_rate": 3.802058419152413e-06, + "loss": 0.82758617, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.6399781703948975 + }, + { + "auxiliary_loss_clip": 0.01125602, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.04291844, + "balance_loss_mlp": 1.02665997, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 4.360880434783988, + "language_loss": 0.76642644, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78812802, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 2.6854665279388428 + }, + { + "auxiliary_loss_clip": 0.01008401, + "auxiliary_loss_mlp": 0.01039466, + "balance_loss_clip": 1.00898647, + "balance_loss_mlp": 1.03701043, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.850455158379966, + "language_loss": 0.55448115, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57495981, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.1504697799682617 + }, + { + "auxiliary_loss_clip": 0.01113114, + "auxiliary_loss_mlp": 0.01037854, + "balance_loss_clip": 1.03751934, + "balance_loss_mlp": 1.02123606, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.9669934774855393, + "language_loss": 0.72786105, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.74937069, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.595942258834839 + }, + { + "auxiliary_loss_clip": 0.01096535, + "auxiliary_loss_mlp": 0.01044349, + "balance_loss_clip": 1.0420177, + "balance_loss_mlp": 1.02723062, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.6992267657861175, + "language_loss": 0.69833314, + "learning_rate": 3.80138214341862e-06, + "loss": 0.71974194, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.650189161300659 + }, + { + "auxiliary_loss_clip": 0.01108301, + "auxiliary_loss_mlp": 0.01047522, + "balance_loss_clip": 1.03788447, + "balance_loss_mlp": 1.02890122, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.593879161505299, + "language_loss": 0.70043689, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72199512, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.6460278034210205 + }, + { + "auxiliary_loss_clip": 0.01100538, + "auxiliary_loss_mlp": 0.01045913, + "balance_loss_clip": 1.04013574, + "balance_loss_mlp": 1.02654111, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.281333565983231, + "language_loss": 0.80025339, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82171786, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.6101534366607666 + }, + { + "auxiliary_loss_clip": 0.01134074, + "auxiliary_loss_mlp": 0.01045901, + "balance_loss_clip": 1.04370284, + "balance_loss_mlp": 1.02778137, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 1.9984678817764991, + "language_loss": 0.88051546, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90231514, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.5382723808288574 + }, + { + "auxiliary_loss_clip": 0.01130107, + "auxiliary_loss_mlp": 0.01049314, + "balance_loss_clip": 1.04390216, + "balance_loss_mlp": 1.03109908, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.1495441466310314, + "language_loss": 0.92406404, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94585824, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.5683066844940186 + }, + { + "auxiliary_loss_clip": 0.01113728, + "auxiliary_loss_mlp": 0.01044792, + "balance_loss_clip": 1.04077017, + "balance_loss_mlp": 1.02791142, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 2.044227114362316, + "language_loss": 0.78771877, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80930388, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 2.6012790203094482 + }, + { + "auxiliary_loss_clip": 0.01125034, + "auxiliary_loss_mlp": 0.01045578, + "balance_loss_clip": 1.04342222, + "balance_loss_mlp": 1.02897227, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.75452679280384, + "language_loss": 0.75051701, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.77222323, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 2.5211760997772217 + }, + { + "auxiliary_loss_clip": 0.01112868, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.03981113, + "balance_loss_mlp": 1.03234017, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.5344795742343122, + "language_loss": 0.68763787, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.70926917, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.621434450149536 + }, + { + "auxiliary_loss_clip": 0.01137956, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.04422808, + "balance_loss_mlp": 1.03055549, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 2.5368338789011404, + "language_loss": 0.61511374, + "learning_rate": 3.800026313549776e-06, + "loss": 0.636967, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.5422353744506836 + }, + { + "auxiliary_loss_clip": 0.011033, + "auxiliary_loss_mlp": 0.01043736, + "balance_loss_clip": 1.03789473, + "balance_loss_mlp": 1.02690339, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.617786705673478, + "language_loss": 0.81832325, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.83979362, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 2.6349449157714844 + }, + { + "auxiliary_loss_clip": 0.01116046, + "auxiliary_loss_mlp": 0.01051637, + "balance_loss_clip": 1.04404044, + "balance_loss_mlp": 1.03354096, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.449470155169788, + "language_loss": 0.87208986, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89376664, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 2.590890407562256 + }, + { + "auxiliary_loss_clip": 0.01117728, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.04433584, + "balance_loss_mlp": 1.02549982, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.6099165570706362, + "language_loss": 0.81076598, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83237243, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 2.62545108795166 + }, + { + "auxiliary_loss_clip": 0.01135232, + "auxiliary_loss_mlp": 0.0104562, + "balance_loss_clip": 1.04261005, + "balance_loss_mlp": 1.02817965, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 2.363927267401001, + "language_loss": 0.80646408, + "learning_rate": 3.799346760237336e-06, + "loss": 0.82827252, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 2.5011401176452637 + }, + { + "auxiliary_loss_clip": 0.01023134, + "auxiliary_loss_mlp": 0.01001394, + "balance_loss_clip": 1.01229429, + "balance_loss_mlp": 0.99921209, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9415734436157396, + "language_loss": 0.61072886, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63097417, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 3.0924692153930664 + }, + { + "auxiliary_loss_clip": 0.01102189, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.0397445, + "balance_loss_mlp": 1.02885246, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 6.754054416670072, + "language_loss": 0.78831172, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.80978525, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.712251663208008 + }, + { + "auxiliary_loss_clip": 0.01112851, + "auxiliary_loss_mlp": 0.01049434, + "balance_loss_clip": 1.0404613, + "balance_loss_mlp": 1.03083777, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 1.9782258317762869, + "language_loss": 0.78727221, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80889505, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 2.647688865661621 + }, + { + "auxiliary_loss_clip": 0.01120323, + "auxiliary_loss_mlp": 0.00750671, + "balance_loss_clip": 1.0413518, + "balance_loss_mlp": 0.9999488, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.845137624181328, + "language_loss": 0.74574828, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.7644583, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 2.6182680130004883 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.0105469, + "balance_loss_clip": 1.04171276, + "balance_loss_mlp": 1.03730893, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.8883779608340256, + "language_loss": 0.59869379, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62034059, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 2.729522228240967 + }, + { + "auxiliary_loss_clip": 0.01113992, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_clip": 1.04295826, + "balance_loss_mlp": 1.0307951, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 2.145501679436511, + "language_loss": 0.72878659, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75040632, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.672942876815796 + }, + { + "auxiliary_loss_clip": 0.01138703, + "auxiliary_loss_mlp": 0.0105562, + "balance_loss_clip": 1.04103661, + "balance_loss_mlp": 1.03605747, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 2.018240033493533, + "language_loss": 0.85449755, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87644076, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 2.5564870834350586 + }, + { + "auxiliary_loss_clip": 0.01113509, + "auxiliary_loss_mlp": 0.01059655, + "balance_loss_clip": 1.03853917, + "balance_loss_mlp": 1.04204798, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 1.5808522396006706, + "language_loss": 0.81959689, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84132856, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.615704298019409 + }, + { + "auxiliary_loss_clip": 0.01109529, + "auxiliary_loss_mlp": 0.0104768, + "balance_loss_clip": 1.04027224, + "balance_loss_mlp": 1.02864194, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.6689996669097609, + "language_loss": 0.74238193, + "learning_rate": 3.797813774376267e-06, + "loss": 0.76395404, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 2.6666979789733887 + }, + { + "auxiliary_loss_clip": 0.01024494, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.01986623, + "balance_loss_mlp": 1.0344094, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7654420023339554, + "language_loss": 0.56495547, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58556557, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 3.2763428688049316 + }, + { + "auxiliary_loss_clip": 0.01083733, + "auxiliary_loss_mlp": 0.01049, + "balance_loss_clip": 1.03334069, + "balance_loss_mlp": 1.03057027, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.6683456035828585, + "language_loss": 0.83156121, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85288852, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 4.1615355014801025 + }, + { + "auxiliary_loss_clip": 0.01100336, + "auxiliary_loss_mlp": 0.01048871, + "balance_loss_clip": 1.03831553, + "balance_loss_mlp": 1.03054857, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.125963663008396, + "language_loss": 0.78164136, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80313349, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.694535970687866 + }, + { + "auxiliary_loss_clip": 0.01102697, + "auxiliary_loss_mlp": 0.01046622, + "balance_loss_clip": 1.03971744, + "balance_loss_mlp": 1.02818024, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1981362075961237, + "language_loss": 0.79296279, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81445599, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.600372314453125 + }, + { + "auxiliary_loss_clip": 0.01105099, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.03994751, + "balance_loss_mlp": 1.02705383, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.7424114060017797, + "language_loss": 0.88920665, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91070414, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.628239870071411 + }, + { + "auxiliary_loss_clip": 0.01135095, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_clip": 1.0413909, + "balance_loss_mlp": 1.02547503, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.0404541889168315, + "language_loss": 0.72072971, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74250615, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.744645833969116 + }, + { + "auxiliary_loss_clip": 0.01097311, + "auxiliary_loss_mlp": 0.01046788, + "balance_loss_clip": 1.04152322, + "balance_loss_mlp": 1.03015828, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 2.120622861022798, + "language_loss": 0.86391795, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88535893, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 4.128571510314941 + }, + { + "auxiliary_loss_clip": 0.01130249, + "auxiliary_loss_mlp": 0.01043477, + "balance_loss_clip": 1.0423913, + "balance_loss_mlp": 1.02391505, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 2.262864352843457, + "language_loss": 0.73971367, + "learning_rate": 3.796446484348989e-06, + "loss": 0.76145095, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 2.6120941638946533 + }, + { + "auxiliary_loss_clip": 0.01079506, + "auxiliary_loss_mlp": 0.01046641, + "balance_loss_clip": 1.0372957, + "balance_loss_mlp": 1.02630401, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.077263177794748, + "language_loss": 0.79746974, + "learning_rate": 3.796275266481036e-06, + "loss": 0.81873119, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 2.7005996704101562 + }, + { + "auxiliary_loss_clip": 0.01120392, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.04180479, + "balance_loss_mlp": 1.02625489, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 1.7302057391876469, + "language_loss": 0.83031124, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85194975, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.5846056938171387 + }, + { + "auxiliary_loss_clip": 0.01090012, + "auxiliary_loss_mlp": 0.01043404, + "balance_loss_clip": 1.04334879, + "balance_loss_mlp": 1.02534425, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 2.264216720182475, + "language_loss": 0.93577278, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95710695, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 2.7471282482147217 + }, + { + "auxiliary_loss_clip": 0.01102451, + "auxiliary_loss_mlp": 0.01047245, + "balance_loss_clip": 1.0405829, + "balance_loss_mlp": 1.02829051, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.0593706116415316, + "language_loss": 0.83706331, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.85856026, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 4.232856273651123 + }, + { + "auxiliary_loss_clip": 0.01127159, + "auxiliary_loss_mlp": 0.01044098, + "balance_loss_clip": 1.04140139, + "balance_loss_mlp": 1.02519107, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 2.001798024441473, + "language_loss": 0.76471448, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78642708, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 4.1028265953063965 + }, + { + "auxiliary_loss_clip": 0.01114092, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.04061306, + "balance_loss_mlp": 1.02453351, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 2.0902592754341334, + "language_loss": 0.76837307, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.78994101, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.638761043548584 + }, + { + "auxiliary_loss_clip": 0.01132325, + "auxiliary_loss_mlp": 0.0104295, + "balance_loss_clip": 1.04217577, + "balance_loss_mlp": 1.02536678, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 1.9028458380555975, + "language_loss": 0.8544333, + "learning_rate": 3.795246529087043e-06, + "loss": 0.87618607, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.562934398651123 + }, + { + "auxiliary_loss_clip": 0.01134508, + "auxiliary_loss_mlp": 0.0104208, + "balance_loss_clip": 1.04462945, + "balance_loss_mlp": 1.02494955, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 2.0340538773004195, + "language_loss": 0.68168736, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7034533, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 2.5816569328308105 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.00750771, + "balance_loss_clip": 1.03931022, + "balance_loss_mlp": 0.99993646, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.7630600338890927, + "language_loss": 0.78381944, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.8024255, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.5780789852142334 + }, + { + "auxiliary_loss_clip": 0.01120482, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.03859663, + "balance_loss_mlp": 1.02496827, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.3790385798076565, + "language_loss": 0.781129, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80274856, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.5288493633270264 + }, + { + "auxiliary_loss_clip": 0.01120516, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.04047251, + "balance_loss_mlp": 1.02194786, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7664802072385093, + "language_loss": 0.80263019, + "learning_rate": 3.794559342552472e-06, + "loss": 0.82422161, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.7310619354248047 + }, + { + "auxiliary_loss_clip": 0.01121424, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.03836679, + "balance_loss_mlp": 1.02679563, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.075266567207002, + "language_loss": 0.86576712, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.88742143, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.566671371459961 + }, + { + "auxiliary_loss_clip": 0.010917, + "auxiliary_loss_mlp": 0.01040323, + "balance_loss_clip": 1.03865254, + "balance_loss_mlp": 1.0224061, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.745818082741228, + "language_loss": 0.74793911, + "learning_rate": 3.794215340959902e-06, + "loss": 0.76925927, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.6894428730010986 + }, + { + "auxiliary_loss_clip": 0.01016545, + "auxiliary_loss_mlp": 0.01017853, + "balance_loss_clip": 1.01973498, + "balance_loss_mlp": 1.01549292, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.8142532438172106, + "language_loss": 0.57510984, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59545386, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 3.198098659515381 + }, + { + "auxiliary_loss_clip": 0.01094909, + "auxiliary_loss_mlp": 0.01042543, + "balance_loss_clip": 1.03873372, + "balance_loss_mlp": 1.02571011, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.4470895045997563, + "language_loss": 0.81585181, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83722627, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 2.6493282318115234 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.01046398, + "balance_loss_clip": 1.04420662, + "balance_loss_mlp": 1.02975667, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.036888267370153, + "language_loss": 0.93233192, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95378423, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.628918170928955 + }, + { + "auxiliary_loss_clip": 0.01099969, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_clip": 1.03674984, + "balance_loss_mlp": 1.03388202, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.7242857612806122, + "language_loss": 0.69491744, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71642947, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.6562976837158203 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01057989, + "balance_loss_clip": 1.04500127, + "balance_loss_mlp": 1.04112077, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.123706503316309, + "language_loss": 0.66687, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68843973, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 2.6420421600341797 + }, + { + "auxiliary_loss_clip": 0.01095642, + "auxiliary_loss_mlp": 0.01056666, + "balance_loss_clip": 1.0357275, + "balance_loss_mlp": 1.04031062, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.8089428713905564, + "language_loss": 0.89069253, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91221559, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 2.6276025772094727 + }, + { + "auxiliary_loss_clip": 0.01134811, + "auxiliary_loss_mlp": 0.01048037, + "balance_loss_clip": 1.04071546, + "balance_loss_mlp": 1.03187227, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.0864942204891026, + "language_loss": 0.83120465, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.85303313, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.5464928150177 + }, + { + "auxiliary_loss_clip": 0.01124815, + "auxiliary_loss_mlp": 0.01049988, + "balance_loss_clip": 1.04209399, + "balance_loss_mlp": 1.03346539, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.9285520121843918, + "language_loss": 0.86641991, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88816798, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 2.5643794536590576 + }, + { + "auxiliary_loss_clip": 0.01122706, + "auxiliary_loss_mlp": 0.01052997, + "balance_loss_clip": 1.04146636, + "balance_loss_mlp": 1.03578925, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.181083355901776, + "language_loss": 0.78059685, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80235392, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 2.527717351913452 + }, + { + "auxiliary_loss_clip": 0.01124254, + "auxiliary_loss_mlp": 0.01061051, + "balance_loss_clip": 1.04211557, + "balance_loss_mlp": 1.04185796, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 1.8492729705719317, + "language_loss": 0.77322745, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79508048, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.5180349349975586 + }, + { + "auxiliary_loss_clip": 0.01089418, + "auxiliary_loss_mlp": 0.01053063, + "balance_loss_clip": 1.0427556, + "balance_loss_mlp": 1.03532505, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 1.9694422727586578, + "language_loss": 0.767156, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.78858078, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.575124740600586 + }, + { + "auxiliary_loss_clip": 0.01124273, + "auxiliary_loss_mlp": 0.01043321, + "balance_loss_clip": 1.04082668, + "balance_loss_mlp": 1.02658415, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 1.8826581823714237, + "language_loss": 0.81574607, + "learning_rate": 3.792145618140317e-06, + "loss": 0.83742201, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.524169683456421 + }, + { + "auxiliary_loss_clip": 0.01105967, + "auxiliary_loss_mlp": 0.01050237, + "balance_loss_clip": 1.03905749, + "balance_loss_mlp": 1.03317833, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 6.548441387556503, + "language_loss": 0.8552714, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87683344, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 2.6444218158721924 + }, + { + "auxiliary_loss_clip": 0.0109514, + "auxiliary_loss_mlp": 0.0104502, + "balance_loss_clip": 1.03841412, + "balance_loss_mlp": 1.02896214, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 1.7415543773471651, + "language_loss": 0.78032565, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80172718, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 2.7211966514587402 + }, + { + "auxiliary_loss_clip": 0.01098929, + "auxiliary_loss_mlp": 0.00750537, + "balance_loss_clip": 1.03773379, + "balance_loss_mlp": 0.99991935, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.6891940716825804, + "language_loss": 0.72366691, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74216163, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 2.690831184387207 + }, + { + "auxiliary_loss_clip": 0.01097964, + "auxiliary_loss_mlp": 0.01049325, + "balance_loss_clip": 1.03802168, + "balance_loss_mlp": 1.0327549, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.7188475202890623, + "language_loss": 0.72663367, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.7481066, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.6610629558563232 + }, + { + "auxiliary_loss_clip": 0.01122262, + "auxiliary_loss_mlp": 0.00750537, + "balance_loss_clip": 1.04164088, + "balance_loss_mlp": 1.0000248, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.964730221985146, + "language_loss": 0.78957826, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80830628, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 2.5204062461853027 + }, + { + "auxiliary_loss_clip": 0.01134467, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.04127932, + "balance_loss_mlp": 1.02620363, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.7833604168910009, + "language_loss": 0.79707319, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.81885493, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 2.4646530151367188 + }, + { + "auxiliary_loss_clip": 0.01109269, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.03808165, + "balance_loss_mlp": 1.01854122, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.882851435817599, + "language_loss": 0.79371178, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81516856, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 2.524442434310913 + }, + { + "auxiliary_loss_clip": 0.01097549, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.0470345, + "balance_loss_mlp": 1.02465153, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.8220584504061135, + "language_loss": 0.84200466, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86339718, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 2.5981838703155518 + }, + { + "auxiliary_loss_clip": 0.01110048, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_clip": 1.03860843, + "balance_loss_mlp": 1.02715957, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.1100908269827094, + "language_loss": 0.77223361, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79378998, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.5609259605407715 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.04247522, + "balance_loss_mlp": 1.01860714, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.7850354580014802, + "language_loss": 0.77231097, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79396987, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 3.953691244125366 + }, + { + "auxiliary_loss_clip": 0.01110874, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.04056048, + "balance_loss_mlp": 1.01792204, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 3.6122603477609294, + "language_loss": 0.74542403, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76688486, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.612584352493286 + }, + { + "auxiliary_loss_clip": 0.01128988, + "auxiliary_loss_mlp": 0.01039421, + "balance_loss_clip": 1.03930593, + "balance_loss_mlp": 1.02246916, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.7300983427827716, + "language_loss": 0.82792568, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84960973, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 2.5148727893829346 + }, + { + "auxiliary_loss_clip": 0.0108546, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.03543699, + "balance_loss_mlp": 1.02010357, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.088946850520076, + "language_loss": 0.74977791, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.7710163, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 4.07060980796814 + }, + { + "auxiliary_loss_clip": 0.0113521, + "auxiliary_loss_mlp": 0.01039855, + "balance_loss_clip": 1.04104376, + "balance_loss_mlp": 1.02160454, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 2.0576962616869605, + "language_loss": 0.80665672, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.82840735, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 2.480478525161743 + }, + { + "auxiliary_loss_clip": 0.011141, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.04061031, + "balance_loss_mlp": 1.02583206, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 3.0823715163455927, + "language_loss": 0.87388599, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89546907, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 2.4966821670532227 + }, + { + "auxiliary_loss_clip": 0.01113396, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.04292655, + "balance_loss_mlp": 1.02158117, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.7307343330403364, + "language_loss": 0.84188509, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86340618, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 2.6339950561523438 + }, + { + "auxiliary_loss_clip": 0.01101102, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.04147184, + "balance_loss_mlp": 1.02003169, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 2.1371187538325187, + "language_loss": 0.79073727, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81212115, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 2.570408582687378 + }, + { + "auxiliary_loss_clip": 0.01111144, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.04042113, + "balance_loss_mlp": 1.02273798, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.8245586192237389, + "language_loss": 0.70064819, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72215068, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.603569507598877 + }, + { + "auxiliary_loss_clip": 0.01107004, + "auxiliary_loss_mlp": 0.0104278, + "balance_loss_clip": 1.03775668, + "balance_loss_mlp": 1.02647185, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.6100505960225866, + "language_loss": 0.83040375, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85190159, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.544381618499756 + }, + { + "auxiliary_loss_clip": 0.01084846, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.03858709, + "balance_loss_mlp": 1.02170682, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.192924027394085, + "language_loss": 0.81364608, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.834903, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 4.119114398956299 + }, + { + "auxiliary_loss_clip": 0.01112236, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.040766, + "balance_loss_mlp": 1.02010858, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 1.7187481868532994, + "language_loss": 0.77385306, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79533565, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.624371290206909 + }, + { + "auxiliary_loss_clip": 0.01100466, + "auxiliary_loss_mlp": 0.01044604, + "balance_loss_clip": 1.04891026, + "balance_loss_mlp": 1.02848077, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 2.3906432817574386, + "language_loss": 0.76360917, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78505987, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 2.735077142715454 + }, + { + "auxiliary_loss_clip": 0.0108674, + "auxiliary_loss_mlp": 0.01040188, + "balance_loss_clip": 1.03645134, + "balance_loss_mlp": 1.0237968, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.9829022449658988, + "language_loss": 0.85451102, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87578034, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.6642465591430664 + }, + { + "auxiliary_loss_clip": 0.01113123, + "auxiliary_loss_mlp": 0.0075049, + "balance_loss_clip": 1.04207349, + "balance_loss_mlp": 0.99994123, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.7511097975210344, + "language_loss": 0.74232471, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76096082, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.7298197746276855 + }, + { + "auxiliary_loss_clip": 0.01106126, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.04040766, + "balance_loss_mlp": 1.02416325, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.579125429897169, + "language_loss": 0.70907331, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.73053098, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.630565643310547 + }, + { + "auxiliary_loss_clip": 0.01119361, + "auxiliary_loss_mlp": 0.01035228, + "balance_loss_clip": 1.03814197, + "balance_loss_mlp": 1.01914692, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 2.6629631198446746, + "language_loss": 0.69946456, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.72101045, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 2.5372185707092285 + }, + { + "auxiliary_loss_clip": 0.0108972, + "auxiliary_loss_mlp": 0.01049711, + "balance_loss_clip": 1.03842723, + "balance_loss_mlp": 1.03354645, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.8749631292041296, + "language_loss": 0.85121715, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87261152, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.619919538497925 + }, + { + "auxiliary_loss_clip": 0.0107033, + "auxiliary_loss_mlp": 0.01048441, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.02996325, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 1.9073308384816081, + "language_loss": 0.78319979, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80438751, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 2.7073302268981934 + }, + { + "auxiliary_loss_clip": 0.01092138, + "auxiliary_loss_mlp": 0.00750473, + "balance_loss_clip": 1.04132152, + "balance_loss_mlp": 0.99997199, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.0682587845957032, + "language_loss": 0.84294391, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86136997, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 2.6106455326080322 + }, + { + "auxiliary_loss_clip": 0.01123235, + "auxiliary_loss_mlp": 0.01043979, + "balance_loss_clip": 1.04367959, + "balance_loss_mlp": 1.02736139, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.8413235836984123, + "language_loss": 0.82354701, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84521914, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.492638111114502 + }, + { + "auxiliary_loss_clip": 0.01068308, + "auxiliary_loss_mlp": 0.01045213, + "balance_loss_clip": 1.0307343, + "balance_loss_mlp": 1.02649689, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 3.3689966243287572, + "language_loss": 0.81169724, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.8328324, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 2.5632073879241943 + }, + { + "auxiliary_loss_clip": 0.01125864, + "auxiliary_loss_mlp": 0.01049766, + "balance_loss_clip": 1.04370141, + "balance_loss_mlp": 1.03146744, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 1.883464412618242, + "language_loss": 0.74373025, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76548654, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 2.5772976875305176 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01040553, + "balance_loss_clip": 1.03918827, + "balance_loss_mlp": 1.02282643, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.114307270427088, + "language_loss": 0.82642066, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.84791058, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 2.568035364151001 + }, + { + "auxiliary_loss_clip": 0.0109828, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_clip": 1.04271805, + "balance_loss_mlp": 1.02206862, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.4158486246797217, + "language_loss": 0.73932517, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76072943, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 2.675382614135742 + }, + { + "auxiliary_loss_clip": 0.01004865, + "auxiliary_loss_mlp": 0.01052393, + "balance_loss_clip": 1.01662135, + "balance_loss_mlp": 1.05027139, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8758289378167547, + "language_loss": 0.62796247, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64853501, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.27009654045105 + }, + { + "auxiliary_loss_clip": 0.01110369, + "auxiliary_loss_mlp": 0.00750681, + "balance_loss_clip": 1.0407021, + "balance_loss_mlp": 0.99998558, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 1.9619496919338495, + "language_loss": 0.75763446, + "learning_rate": 3.785877779175034e-06, + "loss": 0.776245, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 2.6419875621795654 + }, + { + "auxiliary_loss_clip": 0.01120227, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.04142642, + "balance_loss_mlp": 1.01558912, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9456644428021563, + "language_loss": 0.69163764, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71316332, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.641036033630371 + }, + { + "auxiliary_loss_clip": 0.01105765, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.04051065, + "balance_loss_mlp": 1.02014768, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.1974803477963154, + "language_loss": 0.76137614, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78281659, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.630451202392578 + }, + { + "auxiliary_loss_clip": 0.01072914, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.03596783, + "balance_loss_mlp": 1.02545714, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.840840697686864, + "language_loss": 0.72815967, + "learning_rate": 3.785351493339121e-06, + "loss": 0.74932492, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.637626886367798 + }, + { + "auxiliary_loss_clip": 0.01086137, + "auxiliary_loss_mlp": 0.00750778, + "balance_loss_clip": 1.03827214, + "balance_loss_mlp": 0.99994433, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.459128348194245, + "language_loss": 0.69569325, + "learning_rate": 3.785175929316863e-06, + "loss": 0.71406239, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 2.760308027267456 + }, + { + "auxiliary_loss_clip": 0.01104634, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_clip": 1.03950739, + "balance_loss_mlp": 1.0307169, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 4.384163047441604, + "language_loss": 0.76521385, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78673488, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 2.589938163757324 + }, + { + "auxiliary_loss_clip": 0.01125256, + "auxiliary_loss_mlp": 0.01043925, + "balance_loss_clip": 1.04066944, + "balance_loss_mlp": 1.02689004, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.0508328347654365, + "language_loss": 0.81475973, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.83645153, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 2.524261713027954 + }, + { + "auxiliary_loss_clip": 0.01106415, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.04170191, + "balance_loss_mlp": 1.02285624, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 1.7650464973512625, + "language_loss": 0.73495793, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75641847, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.537367820739746 + }, + { + "auxiliary_loss_clip": 0.0107993, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_clip": 1.03799295, + "balance_loss_mlp": 1.02499735, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.7205084177665373, + "language_loss": 0.64749902, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66871893, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 2.7120633125305176 + }, + { + "auxiliary_loss_clip": 0.01112063, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_clip": 1.04396582, + "balance_loss_mlp": 1.02577078, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.6398720486883038, + "language_loss": 0.79445189, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81600964, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 2.6125354766845703 + }, + { + "auxiliary_loss_clip": 0.01124882, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.04348278, + "balance_loss_mlp": 1.02588844, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 6.5593504928245, + "language_loss": 0.81094164, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83262384, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.499600887298584 + }, + { + "auxiliary_loss_clip": 0.01122736, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.0416888, + "balance_loss_mlp": 1.02327776, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 1.917278661215861, + "language_loss": 0.81251621, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83414567, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 2.484020471572876 + }, + { + "auxiliary_loss_clip": 0.01106942, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_clip": 1.04176545, + "balance_loss_mlp": 1.02796006, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.2640897280433543, + "language_loss": 0.80340445, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82493681, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.558379888534546 + }, + { + "auxiliary_loss_clip": 0.01067725, + "auxiliary_loss_mlp": 0.0104905, + "balance_loss_clip": 1.04111814, + "balance_loss_mlp": 1.02826011, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.7186376983861869, + "language_loss": 0.76753795, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78870571, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.6372921466827393 + }, + { + "auxiliary_loss_clip": 0.01136497, + "auxiliary_loss_mlp": 0.01045704, + "balance_loss_clip": 1.04296112, + "balance_loss_mlp": 1.02732205, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.7176336395052076, + "language_loss": 0.87177885, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89360082, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 4.092036247253418 + }, + { + "auxiliary_loss_clip": 0.011349, + "auxiliary_loss_mlp": 0.0075073, + "balance_loss_clip": 1.04116356, + "balance_loss_mlp": 0.99998415, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.1251590487575664, + "language_loss": 0.89510536, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91396165, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.4941420555114746 + }, + { + "auxiliary_loss_clip": 0.01125205, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.0407685, + "balance_loss_mlp": 1.02390432, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 1.703857045684927, + "language_loss": 0.72408772, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74575949, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 2.523054838180542 + }, + { + "auxiliary_loss_clip": 0.01117413, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.04387236, + "balance_loss_mlp": 1.02146006, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 1.7325596614504717, + "language_loss": 0.69737321, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71893609, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 4.104460000991821 + }, + { + "auxiliary_loss_clip": 0.01121199, + "auxiliary_loss_mlp": 0.01048201, + "balance_loss_clip": 1.0427866, + "balance_loss_mlp": 1.03134501, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.7117001416796223, + "language_loss": 0.93289101, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95458496, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 2.5761635303497314 + }, + { + "auxiliary_loss_clip": 0.01093557, + "auxiliary_loss_mlp": 0.01046597, + "balance_loss_clip": 1.03814328, + "balance_loss_mlp": 1.02834582, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.6555656014164015, + "language_loss": 0.80999148, + "learning_rate": 3.782534349431226e-06, + "loss": 0.831393, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 2.6003572940826416 + }, + { + "auxiliary_loss_clip": 0.01123807, + "auxiliary_loss_mlp": 0.01054541, + "balance_loss_clip": 1.04050183, + "balance_loss_mlp": 1.03675532, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 3.8615002489562147, + "language_loss": 0.74012858, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76191205, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.6603713035583496 + }, + { + "auxiliary_loss_clip": 0.01119435, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.04511642, + "balance_loss_mlp": 1.03255486, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 1.819190206257592, + "language_loss": 0.77017891, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79188573, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.5502469539642334 + }, + { + "auxiliary_loss_clip": 0.01072984, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.03688657, + "balance_loss_mlp": 1.02801323, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 2.5265726203920633, + "language_loss": 0.74405086, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76526791, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.7312135696411133 + }, + { + "auxiliary_loss_clip": 0.0110225, + "auxiliary_loss_mlp": 0.01050925, + "balance_loss_clip": 1.03847957, + "balance_loss_mlp": 1.03274536, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.821760078929093, + "language_loss": 0.74791175, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76944351, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 2.6669020652770996 + }, + { + "auxiliary_loss_clip": 0.01090222, + "auxiliary_loss_mlp": 0.01041555, + "balance_loss_clip": 1.03353214, + "balance_loss_mlp": 1.02375698, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 1.981395070265544, + "language_loss": 0.79320866, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81452644, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 4.055568218231201 + }, + { + "auxiliary_loss_clip": 0.01101476, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_clip": 1.04045188, + "balance_loss_mlp": 1.03165579, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 2.510426748423699, + "language_loss": 0.87995869, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.90147406, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 2.662909507751465 + }, + { + "auxiliary_loss_clip": 0.01120687, + "auxiliary_loss_mlp": 0.0104735, + "balance_loss_clip": 1.03859115, + "balance_loss_mlp": 1.0297544, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.2256401801883428, + "language_loss": 0.62255812, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64423853, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 2.5802090167999268 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.0104543, + "balance_loss_clip": 1.03877628, + "balance_loss_mlp": 1.02646339, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.1186547698596883, + "language_loss": 0.80160737, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82310218, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 2.599116802215576 + }, + { + "auxiliary_loss_clip": 0.01111618, + "auxiliary_loss_mlp": 0.01054703, + "balance_loss_clip": 1.04081881, + "balance_loss_mlp": 1.03542638, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 2.136737062354823, + "language_loss": 0.7096234, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73128664, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 2.5657479763031006 + }, + { + "auxiliary_loss_clip": 0.01098194, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.03957534, + "balance_loss_mlp": 1.02398539, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.746434941004697, + "language_loss": 0.7168231, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73821783, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.6879541873931885 + }, + { + "auxiliary_loss_clip": 0.01077936, + "auxiliary_loss_mlp": 0.01043771, + "balance_loss_clip": 1.03645778, + "balance_loss_mlp": 1.02178884, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 2.0109888772230033, + "language_loss": 0.84702969, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86824679, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.6168112754821777 + }, + { + "auxiliary_loss_clip": 0.01088446, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.04605103, + "balance_loss_mlp": 1.03244543, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.7578042009775938, + "language_loss": 0.71782005, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.73919618, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 2.7717151641845703 + }, + { + "auxiliary_loss_clip": 0.01093011, + "auxiliary_loss_mlp": 0.01040302, + "balance_loss_clip": 1.03851628, + "balance_loss_mlp": 1.02245688, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 1.9468222815768623, + "language_loss": 0.8330487, + "learning_rate": 3.780232677305744e-06, + "loss": 0.8543818, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 2.676764488220215 + }, + { + "auxiliary_loss_clip": 0.01101858, + "auxiliary_loss_mlp": 0.01038053, + "balance_loss_clip": 1.03901112, + "balance_loss_mlp": 1.02086329, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.663662560957572, + "language_loss": 0.79173791, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81313705, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 2.636881113052368 + }, + { + "auxiliary_loss_clip": 0.01136786, + "auxiliary_loss_mlp": 0.01040696, + "balance_loss_clip": 1.0446434, + "balance_loss_mlp": 1.02201617, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.2071013375661526, + "language_loss": 0.7708832, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.79265803, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 2.5494256019592285 + }, + { + "auxiliary_loss_clip": 0.0104822, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.03312016, + "balance_loss_mlp": 1.02375209, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.508256481323629, + "language_loss": 0.74846864, + "learning_rate": 3.779699901503696e-06, + "loss": 0.76936734, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 2.717338800430298 + }, + { + "auxiliary_loss_clip": 0.01123891, + "auxiliary_loss_mlp": 0.01042923, + "balance_loss_clip": 1.03959799, + "balance_loss_mlp": 1.02407622, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.1978174531363726, + "language_loss": 0.90114659, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92281479, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.01131771, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.04289484, + "balance_loss_mlp": 1.02970421, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.8788088769149218, + "language_loss": 0.88390124, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90567696, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.5554308891296387 + }, + { + "auxiliary_loss_clip": 0.01104948, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.04048538, + "balance_loss_mlp": 1.02415037, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.6541995209002232, + "language_loss": 0.70197678, + "learning_rate": 3.779166518324077e-06, + "loss": 0.72343314, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 2.894620418548584 + }, + { + "auxiliary_loss_clip": 0.0110476, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.04102337, + "balance_loss_mlp": 1.0178709, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2291874237877396, + "language_loss": 0.69336754, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.7147705, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.6628518104553223 + }, + { + "auxiliary_loss_clip": 0.01078621, + "auxiliary_loss_mlp": 0.01037155, + "balance_loss_clip": 1.03827286, + "balance_loss_mlp": 1.02057278, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.8958786816210111, + "language_loss": 0.71059608, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73175383, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.689465284347534 + }, + { + "auxiliary_loss_clip": 0.0111717, + "auxiliary_loss_mlp": 0.01043524, + "balance_loss_clip": 1.0451231, + "balance_loss_mlp": 1.02526069, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.4316560547146127, + "language_loss": 0.7518416, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.77344853, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.582956075668335 + }, + { + "auxiliary_loss_clip": 0.01123409, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.04196417, + "balance_loss_mlp": 1.02449226, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.1788927054270033, + "language_loss": 0.70312947, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.72477478, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.545841693878174 + }, + { + "auxiliary_loss_clip": 0.01135551, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.04400897, + "balance_loss_mlp": 1.01815856, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 2.395268418729245, + "language_loss": 0.7415477, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.76325327, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 2.5097744464874268 + }, + { + "auxiliary_loss_clip": 0.01099758, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.04181552, + "balance_loss_mlp": 1.0238142, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.138357009971849, + "language_loss": 0.858464, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87988698, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 2.553666830062866 + }, + { + "auxiliary_loss_clip": 0.01135234, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.04288149, + "balance_loss_mlp": 1.02296948, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.0807808116322772, + "language_loss": 0.76685745, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.7886132, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 2.553001880645752 + }, + { + "auxiliary_loss_clip": 0.01080335, + "auxiliary_loss_mlp": 0.00751045, + "balance_loss_clip": 1.04025865, + "balance_loss_mlp": 0.99999452, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 2.0668504257618823, + "language_loss": 0.80244553, + "learning_rate": 3.77774119516197e-06, + "loss": 0.8207593, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.6934900283813477 + }, + { + "auxiliary_loss_clip": 0.01103189, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.03818774, + "balance_loss_mlp": 1.0278908, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 1.9075464701073062, + "language_loss": 0.80604804, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82755291, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 2.7223222255706787 + }, + { + "auxiliary_loss_clip": 0.01134862, + "auxiliary_loss_mlp": 0.01052246, + "balance_loss_clip": 1.04101908, + "balance_loss_mlp": 1.03473425, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 2.1401944111442357, + "language_loss": 0.73610848, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75797957, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.7079837322235107 + }, + { + "auxiliary_loss_clip": 0.01122076, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.04264641, + "balance_loss_mlp": 1.02396965, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.4263215067514525, + "language_loss": 0.77492285, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79654998, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 2.482516288757324 + }, + { + "auxiliary_loss_clip": 0.0107618, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_clip": 1.03477597, + "balance_loss_mlp": 1.03287542, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.7485216880010992, + "language_loss": 0.76303196, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78430206, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 2.7244503498077393 + }, + { + "auxiliary_loss_clip": 0.01118553, + "auxiliary_loss_mlp": 0.01041607, + "balance_loss_clip": 1.03856826, + "balance_loss_mlp": 1.02409482, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.040794252187446, + "language_loss": 0.72562444, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74722606, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.695800542831421 + }, + { + "auxiliary_loss_clip": 0.01121666, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_clip": 1.04144454, + "balance_loss_mlp": 1.02758741, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.7679151548944139, + "language_loss": 0.82092798, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84259081, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.591503620147705 + }, + { + "auxiliary_loss_clip": 0.01034163, + "auxiliary_loss_mlp": 0.01009218, + "balance_loss_clip": 1.01309025, + "balance_loss_mlp": 1.00673854, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7574764892371475, + "language_loss": 0.64998794, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67042172, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.163931131362915 + }, + { + "auxiliary_loss_clip": 0.0109695, + "auxiliary_loss_mlp": 0.01043486, + "balance_loss_clip": 1.03874159, + "balance_loss_mlp": 1.02611768, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.890715635040177, + "language_loss": 0.83794403, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85934842, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 4.072749137878418 + }, + { + "auxiliary_loss_clip": 0.01100074, + "auxiliary_loss_mlp": 0.01051731, + "balance_loss_clip": 1.0387938, + "balance_loss_mlp": 1.03364658, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 2.054462746059787, + "language_loss": 0.79947871, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82099676, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.5877747535705566 + }, + { + "auxiliary_loss_clip": 0.01135354, + "auxiliary_loss_mlp": 0.01045155, + "balance_loss_clip": 1.04184532, + "balance_loss_mlp": 1.0262965, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.4498217931601425, + "language_loss": 0.79074669, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81255174, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 4.1158363819122314 + }, + { + "auxiliary_loss_clip": 0.010932, + "auxiliary_loss_mlp": 0.01047734, + "balance_loss_clip": 1.0375011, + "balance_loss_mlp": 1.03041339, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.9884015967077395, + "language_loss": 0.87901425, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90042359, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 2.7017271518707275 + }, + { + "auxiliary_loss_clip": 0.01114183, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_clip": 1.04181552, + "balance_loss_mlp": 1.03631699, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 3.820563049282923, + "language_loss": 0.85006893, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.87176192, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.5757429599761963 + }, + { + "auxiliary_loss_clip": 0.01102491, + "auxiliary_loss_mlp": 0.01051958, + "balance_loss_clip": 1.03816628, + "balance_loss_mlp": 1.03288436, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 1.629754598564563, + "language_loss": 0.70950019, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73104477, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.553873062133789 + }, + { + "auxiliary_loss_clip": 0.01121782, + "auxiliary_loss_mlp": 0.01053089, + "balance_loss_clip": 1.04128766, + "balance_loss_mlp": 1.03548157, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 25.103439786805787, + "language_loss": 0.83127737, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85302603, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.5797200202941895 + }, + { + "auxiliary_loss_clip": 0.0106834, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.03398788, + "balance_loss_mlp": 1.02685928, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.5693213511632438, + "language_loss": 0.74782825, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.76895642, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.671821117401123 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.010476, + "balance_loss_clip": 1.0461731, + "balance_loss_mlp": 1.03073215, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.312527488010395, + "language_loss": 0.79512262, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.81681019, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.6223018169403076 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.01053713, + "balance_loss_clip": 1.04469228, + "balance_loss_mlp": 1.03508008, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 1.927336635544675, + "language_loss": 0.51623333, + "learning_rate": 3.774698062689362e-06, + "loss": 0.53817594, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 4.025586128234863 + }, + { + "auxiliary_loss_clip": 0.01083237, + "auxiliary_loss_mlp": 0.01057708, + "balance_loss_clip": 1.03873873, + "balance_loss_mlp": 1.03895688, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.8764758718609402, + "language_loss": 0.89571881, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.91712832, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 4.096251964569092 + }, + { + "auxiliary_loss_clip": 0.01096348, + "auxiliary_loss_mlp": 0.01053379, + "balance_loss_clip": 1.04335058, + "balance_loss_mlp": 1.03373325, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.6403127082350741, + "language_loss": 0.78879613, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81029344, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 2.626314640045166 + }, + { + "auxiliary_loss_clip": 0.01118341, + "auxiliary_loss_mlp": 0.01052751, + "balance_loss_clip": 1.04249692, + "balance_loss_mlp": 1.03222299, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.765166747457636, + "language_loss": 0.74641454, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76812547, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 2.5201666355133057 + }, + { + "auxiliary_loss_clip": 0.01116631, + "auxiliary_loss_mlp": 0.01045546, + "balance_loss_clip": 1.04252219, + "balance_loss_mlp": 1.02697325, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.7105222019955058, + "language_loss": 0.78571296, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80733472, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 2.599421977996826 + }, + { + "auxiliary_loss_clip": 0.01127765, + "auxiliary_loss_mlp": 0.00750816, + "balance_loss_clip": 1.04458106, + "balance_loss_mlp": 1.00004339, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.698963753438948, + "language_loss": 0.81356001, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83234584, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.673849105834961 + }, + { + "auxiliary_loss_clip": 0.01123055, + "auxiliary_loss_mlp": 0.01046998, + "balance_loss_clip": 1.04140294, + "balance_loss_mlp": 1.03018987, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 4.128473633527911, + "language_loss": 0.94286472, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96456528, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.4989821910858154 + }, + { + "auxiliary_loss_clip": 0.01089444, + "auxiliary_loss_mlp": 0.0075072, + "balance_loss_clip": 1.04144967, + "balance_loss_mlp": 1.00003791, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.253804584437314, + "language_loss": 0.72910523, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74750692, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 2.745478391647339 + }, + { + "auxiliary_loss_clip": 0.01101484, + "auxiliary_loss_mlp": 0.01044635, + "balance_loss_clip": 1.04092336, + "balance_loss_mlp": 1.02661061, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 1.8318742953277192, + "language_loss": 0.77197433, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79343545, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 2.608691453933716 + }, + { + "auxiliary_loss_clip": 0.01052737, + "auxiliary_loss_mlp": 0.01043614, + "balance_loss_clip": 1.03248167, + "balance_loss_mlp": 1.0258162, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 1.7279278681405514, + "language_loss": 0.75458789, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.77555138, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 2.7214667797088623 + }, + { + "auxiliary_loss_clip": 0.01026217, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.02294159, + "balance_loss_mlp": 1.03488159, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8466442826390554, + "language_loss": 0.69041026, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71104205, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 3.2018895149230957 + }, + { + "auxiliary_loss_clip": 0.01101507, + "auxiliary_loss_mlp": 0.01041737, + "balance_loss_clip": 1.03976631, + "balance_loss_mlp": 1.02289021, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 1.751064186535777, + "language_loss": 0.67280984, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69424224, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 2.7070324420928955 + }, + { + "auxiliary_loss_clip": 0.01087636, + "auxiliary_loss_mlp": 0.01048034, + "balance_loss_clip": 1.04020143, + "balance_loss_mlp": 1.02823329, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.613460805437104, + "language_loss": 0.89533287, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91668957, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 2.6578400135040283 + }, + { + "auxiliary_loss_clip": 0.01095306, + "auxiliary_loss_mlp": 0.01056906, + "balance_loss_clip": 1.03985929, + "balance_loss_mlp": 1.03714132, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 3.5462573635921832, + "language_loss": 0.87963212, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90115422, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 2.5941050052642822 + }, + { + "auxiliary_loss_clip": 0.01136343, + "auxiliary_loss_mlp": 0.01046575, + "balance_loss_clip": 1.0434804, + "balance_loss_mlp": 1.02795422, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.1489538558407792, + "language_loss": 0.76134074, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78316987, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.4985342025756836 + }, + { + "auxiliary_loss_clip": 0.01114326, + "auxiliary_loss_mlp": 0.01048373, + "balance_loss_clip": 1.04144382, + "balance_loss_mlp": 1.03026485, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 3.3401553297967204, + "language_loss": 0.74608678, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76771379, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.5854170322418213 + }, + { + "auxiliary_loss_clip": 0.01121444, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.04063368, + "balance_loss_mlp": 1.02516651, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.6183801785842236, + "language_loss": 0.72883737, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75048065, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.589780569076538 + }, + { + "auxiliary_loss_clip": 0.011207, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.04275632, + "balance_loss_mlp": 1.02148092, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.485690034319125, + "language_loss": 0.77108514, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79265392, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.6062591075897217 + }, + { + "auxiliary_loss_clip": 0.01101266, + "auxiliary_loss_mlp": 0.01048259, + "balance_loss_clip": 1.04489231, + "balance_loss_mlp": 1.03139091, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 1.8473590064000969, + "language_loss": 0.79601324, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81750852, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.6310219764709473 + }, + { + "auxiliary_loss_clip": 0.01116119, + "auxiliary_loss_mlp": 0.01044819, + "balance_loss_clip": 1.04171431, + "balance_loss_mlp": 1.02660358, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.4480054893605812, + "language_loss": 0.76546597, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78707534, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 2.673356294631958 + }, + { + "auxiliary_loss_clip": 0.0109475, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.03991675, + "balance_loss_mlp": 1.02369428, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.7207468311316623, + "language_loss": 0.68803918, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.70940399, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.629688024520874 + }, + { + "auxiliary_loss_clip": 0.01123594, + "auxiliary_loss_mlp": 0.01040433, + "balance_loss_clip": 1.04131866, + "balance_loss_mlp": 1.02125227, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 2.349390760342645, + "language_loss": 0.71079624, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.73243654, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 2.5163817405700684 + }, + { + "auxiliary_loss_clip": 0.01115186, + "auxiliary_loss_mlp": 0.01053068, + "balance_loss_clip": 1.04698396, + "balance_loss_mlp": 1.03472161, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.3519199694166617, + "language_loss": 0.82264, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.8443225, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 2.5371575355529785 + }, + { + "auxiliary_loss_clip": 0.01134937, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.04250455, + "balance_loss_mlp": 1.02255321, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.8655514476244646, + "language_loss": 0.826765, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.84851074, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.5808563232421875 + }, + { + "auxiliary_loss_clip": 0.0112747, + "auxiliary_loss_mlp": 0.01045616, + "balance_loss_clip": 1.04110599, + "balance_loss_mlp": 1.02732968, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 1.8453691418666442, + "language_loss": 0.85518026, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87691116, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 2.5106067657470703 + }, + { + "auxiliary_loss_clip": 0.01101616, + "auxiliary_loss_mlp": 0.01039739, + "balance_loss_clip": 1.03890789, + "balance_loss_mlp": 1.02080858, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 1.6748666361868476, + "language_loss": 0.89452535, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91593891, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 2.6386337280273438 + }, + { + "auxiliary_loss_clip": 0.01130959, + "auxiliary_loss_mlp": 0.01040657, + "balance_loss_clip": 1.04156303, + "balance_loss_mlp": 1.02560067, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.873705031670764, + "language_loss": 0.69914097, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72085714, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.504741907119751 + }, + { + "auxiliary_loss_clip": 0.01132555, + "auxiliary_loss_mlp": 0.00750616, + "balance_loss_clip": 1.04130232, + "balance_loss_mlp": 0.99993145, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 1.95213987039605, + "language_loss": 0.77567697, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79450864, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 2.5418038368225098 + }, + { + "auxiliary_loss_clip": 0.01137351, + "auxiliary_loss_mlp": 0.01042239, + "balance_loss_clip": 1.04141259, + "balance_loss_mlp": 1.02382159, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.6388793393110315, + "language_loss": 0.78153074, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80332661, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 2.500776767730713 + }, + { + "auxiliary_loss_clip": 0.01000752, + "auxiliary_loss_mlp": 0.00748543, + "balance_loss_clip": 1.017905, + "balance_loss_mlp": 1.00006068, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7900728359244401, + "language_loss": 0.62684226, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64433527, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.1202948093414307 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.0428896, + "balance_loss_mlp": 1.0262785, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 2.8612208713586824, + "language_loss": 0.70786297, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72945249, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.6189053058624268 + }, + { + "auxiliary_loss_clip": 0.0111446, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.04150081, + "balance_loss_mlp": 1.02778769, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.739620720419351, + "language_loss": 0.68766427, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70925546, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 4.244371652603149 + }, + { + "auxiliary_loss_clip": 0.01088141, + "auxiliary_loss_mlp": 0.01042083, + "balance_loss_clip": 1.04258084, + "balance_loss_mlp": 1.02433228, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.7117345902908612, + "language_loss": 0.82803684, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84933913, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 2.6901817321777344 + }, + { + "auxiliary_loss_clip": 0.01118462, + "auxiliary_loss_mlp": 0.0103965, + "balance_loss_clip": 1.03913546, + "balance_loss_mlp": 1.02291346, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 1.8188243431986495, + "language_loss": 0.81836933, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.83995044, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 4.011701345443726 + }, + { + "auxiliary_loss_clip": 0.01110611, + "auxiliary_loss_mlp": 0.01045112, + "balance_loss_clip": 1.03855777, + "balance_loss_mlp": 1.02731395, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.854377725023631, + "language_loss": 0.78657049, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.8081277, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.5411131381988525 + }, + { + "auxiliary_loss_clip": 0.01137275, + "auxiliary_loss_mlp": 0.0105073, + "balance_loss_clip": 1.04306388, + "balance_loss_mlp": 1.03413653, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 13.278932655382663, + "language_loss": 0.79861027, + "learning_rate": 3.768371587287296e-06, + "loss": 0.8204903, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.4975576400756836 + }, + { + "auxiliary_loss_clip": 0.01123589, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.04160869, + "balance_loss_mlp": 1.03091574, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.5935369946730862, + "language_loss": 0.84734344, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86905396, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.531449794769287 + }, + { + "auxiliary_loss_clip": 0.01103587, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.0459199, + "balance_loss_mlp": 1.02157462, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.1274880724655123, + "language_loss": 0.87926626, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90067923, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.6428234577178955 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.0104999, + "balance_loss_clip": 1.03760958, + "balance_loss_mlp": 1.03151309, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 1.714218971603758, + "language_loss": 0.85287184, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87449419, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.596363067626953 + }, + { + "auxiliary_loss_clip": 0.01136326, + "auxiliary_loss_mlp": 0.0104183, + "balance_loss_clip": 1.04546165, + "balance_loss_mlp": 1.025141, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.6013797548517026, + "language_loss": 0.84259957, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86438119, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.607123851776123 + }, + { + "auxiliary_loss_clip": 0.01116216, + "auxiliary_loss_mlp": 0.01043214, + "balance_loss_clip": 1.03739738, + "balance_loss_mlp": 1.02597618, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7166585235015843, + "language_loss": 0.74734366, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76893795, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 4.026150465011597 + }, + { + "auxiliary_loss_clip": 0.01108832, + "auxiliary_loss_mlp": 0.00750647, + "balance_loss_clip": 1.0404135, + "balance_loss_mlp": 0.99993992, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 1.5651400517778375, + "language_loss": 0.71133935, + "learning_rate": 3.76727879248177e-06, + "loss": 0.7299341, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 4.113340854644775 + }, + { + "auxiliary_loss_clip": 0.01126977, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.04202318, + "balance_loss_mlp": 1.02839446, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.031851518499873, + "language_loss": 0.88138568, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90311635, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.5734148025512695 + }, + { + "auxiliary_loss_clip": 0.01133459, + "auxiliary_loss_mlp": 0.010452, + "balance_loss_clip": 1.04163146, + "balance_loss_mlp": 1.02833152, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6509810878765352, + "language_loss": 0.80471122, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.82649779, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.508190393447876 + }, + { + "auxiliary_loss_clip": 0.01136431, + "auxiliary_loss_mlp": 0.01042607, + "balance_loss_clip": 1.04250407, + "balance_loss_mlp": 1.02612066, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 1.8503906233647778, + "language_loss": 0.67462772, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69641817, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.5337538719177246 + }, + { + "auxiliary_loss_clip": 0.01124724, + "auxiliary_loss_mlp": 0.01044427, + "balance_loss_clip": 1.0421586, + "balance_loss_mlp": 1.0270822, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.6699989737984893, + "language_loss": 0.85423458, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87592608, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 2.513702392578125 + }, + { + "auxiliary_loss_clip": 0.01118515, + "auxiliary_loss_mlp": 0.01036828, + "balance_loss_clip": 1.03937399, + "balance_loss_mlp": 1.02100873, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.4568069864847415, + "language_loss": 0.82908666, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85064012, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 2.5736801624298096 + }, + { + "auxiliary_loss_clip": 0.01100797, + "auxiliary_loss_mlp": 0.01044706, + "balance_loss_clip": 1.03624725, + "balance_loss_mlp": 1.02705121, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 2.1067071966019912, + "language_loss": 0.77094364, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79239869, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 2.6698179244995117 + }, + { + "auxiliary_loss_clip": 0.01031412, + "auxiliary_loss_mlp": 0.01005576, + "balance_loss_clip": 1.02063131, + "balance_loss_mlp": 1.00302505, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8183296108819842, + "language_loss": 0.57020003, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59056991, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 3.2844393253326416 + }, + { + "auxiliary_loss_clip": 0.01104963, + "auxiliary_loss_mlp": 0.01048436, + "balance_loss_clip": 1.03920341, + "balance_loss_mlp": 1.02999461, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.9236077254252195, + "language_loss": 0.67063028, + "learning_rate": 3.765817980138021e-06, + "loss": 0.6921643, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 2.5899460315704346 + }, + { + "auxiliary_loss_clip": 0.01138437, + "auxiliary_loss_mlp": 0.0104216, + "balance_loss_clip": 1.04436159, + "balance_loss_mlp": 1.02613795, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.7566044979945494, + "language_loss": 0.75339526, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77520126, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 2.555095672607422 + }, + { + "auxiliary_loss_clip": 0.01103665, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.04077578, + "balance_loss_mlp": 1.01968706, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.5166947691894188, + "language_loss": 0.66822875, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.68961614, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 2.587496042251587 + }, + { + "auxiliary_loss_clip": 0.01079291, + "auxiliary_loss_mlp": 0.00750598, + "balance_loss_clip": 1.03347778, + "balance_loss_mlp": 1.00001669, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 1.5411286738566154, + "language_loss": 0.70978636, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.72808528, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 2.8836653232574463 + }, + { + "auxiliary_loss_clip": 0.0110787, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.04221332, + "balance_loss_mlp": 1.03127265, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.1182782155360522, + "language_loss": 0.62710023, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64865601, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 2.7100303173065186 + }, + { + "auxiliary_loss_clip": 0.01104115, + "auxiliary_loss_mlp": 0.01047515, + "balance_loss_clip": 1.03897429, + "balance_loss_mlp": 1.03138566, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.945514977647083, + "language_loss": 0.76117063, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78268683, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 2.609713077545166 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01044398, + "balance_loss_clip": 1.04319787, + "balance_loss_mlp": 1.02577758, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.7485882032231441, + "language_loss": 0.66042447, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68224716, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.553556203842163 + }, + { + "auxiliary_loss_clip": 0.01107526, + "auxiliary_loss_mlp": 0.00750579, + "balance_loss_clip": 1.04108691, + "balance_loss_mlp": 0.99997032, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7650061270078383, + "language_loss": 0.78041363, + "learning_rate": 3.764536253816785e-06, + "loss": 0.79899466, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.607293128967285 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.01047871, + "balance_loss_clip": 1.04124045, + "balance_loss_mlp": 1.03026342, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 2.298336688007914, + "language_loss": 0.83403724, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85568255, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.547255277633667 + }, + { + "auxiliary_loss_clip": 0.0111203, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.03801751, + "balance_loss_mlp": 1.02155399, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 1.8493866229418359, + "language_loss": 0.67320275, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69470429, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 2.658862829208374 + }, + { + "auxiliary_loss_clip": 0.01121509, + "auxiliary_loss_mlp": 0.00750586, + "balance_loss_clip": 1.04068005, + "balance_loss_mlp": 1.00000298, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 1.88068593581612, + "language_loss": 0.75779003, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77651095, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.5470099449157715 + }, + { + "auxiliary_loss_clip": 0.01103907, + "auxiliary_loss_mlp": 0.01039926, + "balance_loss_clip": 1.04628587, + "balance_loss_mlp": 1.02155566, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 3.2594055729356906, + "language_loss": 0.81162071, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83305907, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 2.679638385772705 + }, + { + "auxiliary_loss_clip": 0.01111493, + "auxiliary_loss_mlp": 0.01041031, + "balance_loss_clip": 1.04194641, + "balance_loss_mlp": 1.02292275, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 1.9727203460556049, + "language_loss": 0.77278018, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79430538, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 2.64363169670105 + }, + { + "auxiliary_loss_clip": 0.01115898, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.03718853, + "balance_loss_mlp": 1.02266896, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.7843610487820012, + "language_loss": 0.84898263, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87053454, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.566826581954956 + }, + { + "auxiliary_loss_clip": 0.01113154, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.046193, + "balance_loss_mlp": 1.02522576, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.871824814311685, + "language_loss": 0.6937865, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71535516, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.6350677013397217 + }, + { + "auxiliary_loss_clip": 0.01099048, + "auxiliary_loss_mlp": 0.01042105, + "balance_loss_clip": 1.03368497, + "balance_loss_mlp": 1.02553439, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.8717640264990285, + "language_loss": 0.73631227, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.75772387, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 2.5851645469665527 + }, + { + "auxiliary_loss_clip": 0.0112036, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.04180813, + "balance_loss_mlp": 1.02331233, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.1181096330541163, + "language_loss": 0.88275164, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90436059, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 2.550258159637451 + }, + { + "auxiliary_loss_clip": 0.01108865, + "auxiliary_loss_mlp": 0.010481, + "balance_loss_clip": 1.03937364, + "balance_loss_mlp": 1.030159, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 1.6797819610095144, + "language_loss": 0.78861672, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81018639, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 2.5574865341186523 + }, + { + "auxiliary_loss_clip": 0.01110801, + "auxiliary_loss_mlp": 0.0105224, + "balance_loss_clip": 1.04337859, + "balance_loss_mlp": 1.03485894, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6367196234209886, + "language_loss": 0.75649977, + "learning_rate": 3.762515489146692e-06, + "loss": 0.77813017, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.609140634536743 + }, + { + "auxiliary_loss_clip": 0.01136291, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_clip": 1.04100835, + "balance_loss_mlp": 1.02837014, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.8683362481847812, + "language_loss": 0.85429037, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87612581, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.4931676387786865 + }, + { + "auxiliary_loss_clip": 0.01129457, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.04027104, + "balance_loss_mlp": 1.02350879, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.6968114128044482, + "language_loss": 0.82679284, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84848905, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 2.526780366897583 + }, + { + "auxiliary_loss_clip": 0.01083485, + "auxiliary_loss_mlp": 0.01045481, + "balance_loss_clip": 1.03554308, + "balance_loss_mlp": 1.02621651, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.038787301116759, + "language_loss": 0.77948493, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80077457, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.582751512527466 + }, + { + "auxiliary_loss_clip": 0.01109491, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.03732944, + "balance_loss_mlp": 1.02563, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.907108769176067, + "language_loss": 0.84670186, + "learning_rate": 3.761778660099352e-06, + "loss": 0.86823666, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 4.133713006973267 + }, + { + "auxiliary_loss_clip": 0.01091334, + "auxiliary_loss_mlp": 0.00750622, + "balance_loss_clip": 1.0351516, + "balance_loss_mlp": 1.00000906, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8434776400568007, + "language_loss": 0.79668087, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81510043, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.60807466506958 + }, + { + "auxiliary_loss_clip": 0.01137843, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.04336357, + "balance_loss_mlp": 1.0271678, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.108376190336617, + "language_loss": 0.81533861, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83716476, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.5365004539489746 + }, + { + "auxiliary_loss_clip": 0.01007981, + "auxiliary_loss_mlp": 0.01004785, + "balance_loss_clip": 1.02526426, + "balance_loss_mlp": 1.00247276, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8771170426941074, + "language_loss": 0.63496965, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65509731, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 4.571329832077026 + }, + { + "auxiliary_loss_clip": 0.01104786, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.0422976, + "balance_loss_mlp": 1.02521837, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 1.8626420750961188, + "language_loss": 0.80014205, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.82161182, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.6142184734344482 + }, + { + "auxiliary_loss_clip": 0.01110871, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.04240358, + "balance_loss_mlp": 1.02043509, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 1.6419498144194582, + "language_loss": 0.84847653, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.86995518, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.62164044380188 + }, + { + "auxiliary_loss_clip": 0.01116446, + "auxiliary_loss_mlp": 0.01039633, + "balance_loss_clip": 1.04103327, + "balance_loss_mlp": 1.02325368, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.412193812332619, + "language_loss": 0.80157965, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82314038, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.5754823684692383 + }, + { + "auxiliary_loss_clip": 0.01114022, + "auxiliary_loss_mlp": 0.0075071, + "balance_loss_clip": 1.04371715, + "balance_loss_mlp": 1.00000036, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.2974492580354178, + "language_loss": 0.79372507, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81237245, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 2.5949795246124268 + }, + { + "auxiliary_loss_clip": 0.01099946, + "auxiliary_loss_mlp": 0.01044883, + "balance_loss_clip": 1.03899693, + "balance_loss_mlp": 1.02679896, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 1.7737969173817814, + "language_loss": 0.67811286, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69956124, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 2.671804428100586 + }, + { + "auxiliary_loss_clip": 0.01102637, + "auxiliary_loss_mlp": 0.01039729, + "balance_loss_clip": 1.03838944, + "balance_loss_mlp": 1.02179968, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.679861704358855, + "language_loss": 0.73413849, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75556219, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 4.361302614212036 + }, + { + "auxiliary_loss_clip": 0.01118491, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.03928638, + "balance_loss_mlp": 1.02454305, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 2.0187830717465998, + "language_loss": 0.60150313, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62311399, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.616715908050537 + }, + { + "auxiliary_loss_clip": 0.01092068, + "auxiliary_loss_mlp": 0.01045567, + "balance_loss_clip": 1.03766084, + "balance_loss_mlp": 1.02807879, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.665051698055534, + "language_loss": 0.59830505, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.61968136, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 4.317780017852783 + }, + { + "auxiliary_loss_clip": 0.01097444, + "auxiliary_loss_mlp": 0.01045673, + "balance_loss_clip": 1.03866172, + "balance_loss_mlp": 1.02890015, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.7108502129671863, + "language_loss": 0.87573707, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89716828, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 2.647686004638672 + }, + { + "auxiliary_loss_clip": 0.01024617, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_clip": 1.03089881, + "balance_loss_mlp": 1.02755976, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 2.050379907978102, + "language_loss": 0.71258205, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73329198, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 2.8747506141662598 + }, + { + "auxiliary_loss_clip": 0.01077504, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03730714, + "balance_loss_mlp": 1.02983308, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 3.6024835159068473, + "language_loss": 0.63973325, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66100013, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 3.2314937114715576 + }, + { + "auxiliary_loss_clip": 0.01133824, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.04393244, + "balance_loss_mlp": 1.02666926, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 3.0274510599507303, + "language_loss": 0.79272377, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81449425, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 2.5406494140625 + }, + { + "auxiliary_loss_clip": 0.01096289, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.03611588, + "balance_loss_mlp": 1.02373242, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 1.8922690075268742, + "language_loss": 0.78807646, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.8094573, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 2.6104953289031982 + }, + { + "auxiliary_loss_clip": 0.0112205, + "auxiliary_loss_mlp": 0.01041384, + "balance_loss_clip": 1.04360747, + "balance_loss_mlp": 1.02436054, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.4579125116602425, + "language_loss": 0.80912352, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83075774, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 2.6730451583862305 + }, + { + "auxiliary_loss_clip": 0.01115089, + "auxiliary_loss_mlp": 0.010483, + "balance_loss_clip": 1.04111314, + "balance_loss_mlp": 1.02933383, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.9049602125421037, + "language_loss": 0.86629969, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88793361, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.5552315711975098 + }, + { + "auxiliary_loss_clip": 0.01122104, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.04327965, + "balance_loss_mlp": 1.02673769, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.588334942571281, + "language_loss": 0.77052927, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79221815, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.544618844985962 + }, + { + "auxiliary_loss_clip": 0.01101262, + "auxiliary_loss_mlp": 0.01040366, + "balance_loss_clip": 1.03594947, + "balance_loss_mlp": 1.02336645, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.220088161199844, + "language_loss": 0.9922955, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01371181, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 2.5817887783050537 + }, + { + "auxiliary_loss_clip": 0.01100351, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.03906953, + "balance_loss_mlp": 1.02432334, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.688169773956819, + "language_loss": 0.86237061, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88380569, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.6719717979431152 + }, + { + "auxiliary_loss_clip": 0.01132845, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_clip": 1.04281688, + "balance_loss_mlp": 1.02811944, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.866843421068398, + "language_loss": 0.73199117, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75377256, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.48823618888855 + }, + { + "auxiliary_loss_clip": 0.01137956, + "auxiliary_loss_mlp": 0.01044695, + "balance_loss_clip": 1.04530537, + "balance_loss_mlp": 1.02651525, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.5900793289587227, + "language_loss": 0.61368924, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.63551575, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.5783514976501465 + }, + { + "auxiliary_loss_clip": 0.01091327, + "auxiliary_loss_mlp": 0.01041614, + "balance_loss_clip": 1.0433681, + "balance_loss_mlp": 1.02471018, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.00558111236293, + "language_loss": 0.78293729, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80426669, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.7689931392669678 + }, + { + "auxiliary_loss_clip": 0.01075092, + "auxiliary_loss_mlp": 0.01065268, + "balance_loss_clip": 1.03840959, + "balance_loss_mlp": 1.0465045, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.8801427951326348, + "language_loss": 0.69918567, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72058922, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.6957955360412598 + }, + { + "auxiliary_loss_clip": 0.01124045, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.04313302, + "balance_loss_mlp": 1.02476859, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.5669834310019115, + "language_loss": 0.80538189, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82703328, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 2.5598819255828857 + }, + { + "auxiliary_loss_clip": 0.01131408, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.04272223, + "balance_loss_mlp": 1.02510929, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.0685244320019205, + "language_loss": 0.8229714, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84474146, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 2.550485849380493 + }, + { + "auxiliary_loss_clip": 0.01095441, + "auxiliary_loss_mlp": 0.00750934, + "balance_loss_clip": 1.03750134, + "balance_loss_mlp": 1.00002718, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 3.1742681986621886, + "language_loss": 0.8577463, + "learning_rate": 3.756590952429017e-06, + "loss": 0.87621003, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 2.646416187286377 + }, + { + "auxiliary_loss_clip": 0.01131906, + "auxiliary_loss_mlp": 0.00750731, + "balance_loss_clip": 1.04139245, + "balance_loss_mlp": 1.0000484, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 2.4482891532001316, + "language_loss": 0.72940779, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74823415, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 2.6406848430633545 + }, + { + "auxiliary_loss_clip": 0.01127817, + "auxiliary_loss_mlp": 0.01041351, + "balance_loss_clip": 1.04284322, + "balance_loss_mlp": 1.02279031, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.7583643082107347, + "language_loss": 0.73159647, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75328815, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 2.6138081550598145 + }, + { + "auxiliary_loss_clip": 0.01114134, + "auxiliary_loss_mlp": 0.0105135, + "balance_loss_clip": 1.04091442, + "balance_loss_mlp": 1.0316925, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.632503924407971, + "language_loss": 0.8161028, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83775765, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.5481953620910645 + }, + { + "auxiliary_loss_clip": 0.01127966, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.04568624, + "balance_loss_mlp": 1.02172327, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.8258180593739572, + "language_loss": 0.72613287, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.7478137, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 2.601954221725464 + }, + { + "auxiliary_loss_clip": 0.01119792, + "auxiliary_loss_mlp": 0.01038362, + "balance_loss_clip": 1.04364312, + "balance_loss_mlp": 1.02231669, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 1.7883237997032462, + "language_loss": 0.65701038, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.67859185, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 2.5922625064849854 + }, + { + "auxiliary_loss_clip": 0.01124038, + "auxiliary_loss_mlp": 0.01041083, + "balance_loss_clip": 1.04333937, + "balance_loss_mlp": 1.02327299, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 1.8053898821421297, + "language_loss": 0.68466669, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70631796, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 2.582876205444336 + }, + { + "auxiliary_loss_clip": 0.01117043, + "auxiliary_loss_mlp": 0.01042415, + "balance_loss_clip": 1.04248452, + "balance_loss_mlp": 1.02368736, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 2.857592485198829, + "language_loss": 0.72907412, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.7506687, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.665175199508667 + }, + { + "auxiliary_loss_clip": 0.01102084, + "auxiliary_loss_mlp": 0.01045197, + "balance_loss_clip": 1.03830254, + "balance_loss_mlp": 1.02773261, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 1.9085712224037696, + "language_loss": 0.82014769, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.8416205, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 2.5523102283477783 + }, + { + "auxiliary_loss_clip": 0.01048722, + "auxiliary_loss_mlp": 0.00748186, + "balance_loss_clip": 1.01838887, + "balance_loss_mlp": 0.99978554, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7954124953377797, + "language_loss": 0.59658384, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61455297, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 2.931931257247925 + }, + { + "auxiliary_loss_clip": 0.01114229, + "auxiliary_loss_mlp": 0.01042322, + "balance_loss_clip": 1.04827428, + "balance_loss_mlp": 1.02497673, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.6496409868603161, + "language_loss": 0.76372445, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78529, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.7015912532806396 + }, + { + "auxiliary_loss_clip": 0.01123641, + "auxiliary_loss_mlp": 0.01039614, + "balance_loss_clip": 1.0404582, + "balance_loss_mlp": 1.02175641, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7013116871411298, + "language_loss": 0.84699655, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.8686291, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 2.572760581970215 + }, + { + "auxiliary_loss_clip": 0.01106059, + "auxiliary_loss_mlp": 0.01043555, + "balance_loss_clip": 1.04092574, + "balance_loss_mlp": 1.02451754, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 2.978953602868966, + "language_loss": 0.77495921, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79645538, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 4.292303085327148 + }, + { + "auxiliary_loss_clip": 0.01095067, + "auxiliary_loss_mlp": 0.01050441, + "balance_loss_clip": 1.04479945, + "balance_loss_mlp": 1.03142715, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.2381958469792016, + "language_loss": 0.77218032, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79363537, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.648333787918091 + }, + { + "auxiliary_loss_clip": 0.01116629, + "auxiliary_loss_mlp": 0.01044052, + "balance_loss_clip": 1.03943491, + "balance_loss_mlp": 1.02518129, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 3.4254631710074976, + "language_loss": 0.86537075, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88697755, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.5205607414245605 + }, + { + "auxiliary_loss_clip": 0.0113881, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.0463208, + "balance_loss_mlp": 1.02537966, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 1.9817887247778445, + "language_loss": 0.91326928, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.93506956, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 3.996953248977661 + }, + { + "auxiliary_loss_clip": 0.01074379, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.03347766, + "balance_loss_mlp": 1.02412069, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.6137524152515903, + "language_loss": 0.64950609, + "learning_rate": 3.75360309139087e-06, + "loss": 0.6706841, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.678548812866211 + }, + { + "auxiliary_loss_clip": 0.01116978, + "auxiliary_loss_mlp": 0.010473, + "balance_loss_clip": 1.04431045, + "balance_loss_mlp": 1.02984798, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.8615165593705003, + "language_loss": 0.72953874, + "learning_rate": 3.753415784551761e-06, + "loss": 0.75118148, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 2.5452780723571777 + }, + { + "auxiliary_loss_clip": 0.0110737, + "auxiliary_loss_mlp": 0.01048556, + "balance_loss_clip": 1.04700959, + "balance_loss_mlp": 1.03115165, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.329570644115121, + "language_loss": 0.80805326, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.82961249, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.623051643371582 + }, + { + "auxiliary_loss_clip": 0.01110792, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_clip": 1.04394186, + "balance_loss_mlp": 1.02832222, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.785403698318736, + "language_loss": 0.7927469, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81431478, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.5978238582611084 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.0104732, + "balance_loss_clip": 1.04410934, + "balance_loss_mlp": 1.03048778, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 2.234279298606365, + "language_loss": 0.77456921, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79641318, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 2.5776398181915283 + }, + { + "auxiliary_loss_clip": 0.01100266, + "auxiliary_loss_mlp": 0.01040907, + "balance_loss_clip": 1.03919172, + "balance_loss_mlp": 1.02302539, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 1.7470839684763713, + "language_loss": 0.81721961, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83863139, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 2.7680232524871826 + }, + { + "auxiliary_loss_clip": 0.01098918, + "auxiliary_loss_mlp": 0.01045164, + "balance_loss_clip": 1.04210842, + "balance_loss_mlp": 1.02657962, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 4.112203155988036, + "language_loss": 0.74293661, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76437747, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 4.3113298416137695 + }, + { + "auxiliary_loss_clip": 0.01111335, + "auxiliary_loss_mlp": 0.0105299, + "balance_loss_clip": 1.04545093, + "balance_loss_mlp": 1.03377342, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.1803912494117856, + "language_loss": 0.72168851, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.74333179, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 4.120116233825684 + }, + { + "auxiliary_loss_clip": 0.01107527, + "auxiliary_loss_mlp": 0.010483, + "balance_loss_clip": 1.04590297, + "balance_loss_mlp": 1.02897608, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.0146111658349857, + "language_loss": 0.6954875, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71704578, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 2.6273133754730225 + }, + { + "auxiliary_loss_clip": 0.01103785, + "auxiliary_loss_mlp": 0.01046202, + "balance_loss_clip": 1.04141557, + "balance_loss_mlp": 1.02876163, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 2.030099329497089, + "language_loss": 0.68892413, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71042401, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 2.736300468444824 + }, + { + "auxiliary_loss_clip": 0.01133269, + "auxiliary_loss_mlp": 0.01038546, + "balance_loss_clip": 1.04202735, + "balance_loss_mlp": 1.02220249, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5351189928336788, + "language_loss": 0.77616751, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.79788572, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.6135191917419434 + }, + { + "auxiliary_loss_clip": 0.01131311, + "auxiliary_loss_mlp": 0.01045469, + "balance_loss_clip": 1.04069841, + "balance_loss_mlp": 1.02916133, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.8296407763863218, + "language_loss": 0.73722982, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75899768, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.5881006717681885 + }, + { + "auxiliary_loss_clip": 0.0112228, + "auxiliary_loss_mlp": 0.01047027, + "balance_loss_clip": 1.0421102, + "balance_loss_mlp": 1.02913368, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 2.25631077457392, + "language_loss": 0.70256984, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72426295, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 2.541466236114502 + }, + { + "auxiliary_loss_clip": 0.01095438, + "auxiliary_loss_mlp": 0.01049046, + "balance_loss_clip": 1.04171813, + "balance_loss_mlp": 1.02971053, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.483087529083249, + "language_loss": 0.72361952, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74506438, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.5914528369903564 + }, + { + "auxiliary_loss_clip": 0.01101873, + "auxiliary_loss_mlp": 0.01040889, + "balance_loss_clip": 1.03793168, + "balance_loss_mlp": 1.02392566, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.6691241392156497, + "language_loss": 0.9152528, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.93668044, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.613239288330078 + }, + { + "auxiliary_loss_clip": 0.01084636, + "auxiliary_loss_mlp": 0.01050421, + "balance_loss_clip": 1.04270983, + "balance_loss_mlp": 1.03284979, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.82369729476225, + "language_loss": 0.57810211, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59945273, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 2.695655584335327 + }, + { + "auxiliary_loss_clip": 0.01099563, + "auxiliary_loss_mlp": 0.01042979, + "balance_loss_clip": 1.03643584, + "balance_loss_mlp": 1.02564621, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.7284723852562713, + "language_loss": 0.81880224, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84022772, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.58355975151062 + }, + { + "auxiliary_loss_clip": 0.01079688, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.04149413, + "balance_loss_mlp": 1.03145587, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.267364444112997, + "language_loss": 0.83841282, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85970032, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.7353012561798096 + }, + { + "auxiliary_loss_clip": 0.01112519, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.04008603, + "balance_loss_mlp": 1.0283823, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 2.1059848805460057, + "language_loss": 0.93031895, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95190293, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.545478343963623 + }, + { + "auxiliary_loss_clip": 0.01107942, + "auxiliary_loss_mlp": 0.01043143, + "balance_loss_clip": 1.04346097, + "balance_loss_mlp": 1.02569127, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.010983888761685, + "language_loss": 0.77420342, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79571426, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.5956332683563232 + }, + { + "auxiliary_loss_clip": 0.01077425, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.04338455, + "balance_loss_mlp": 1.02931798, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.579158140965593, + "language_loss": 0.70244712, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7236774, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 2.9451048374176025 + }, + { + "auxiliary_loss_clip": 0.01089008, + "auxiliary_loss_mlp": 0.01051087, + "balance_loss_clip": 1.04006481, + "balance_loss_mlp": 1.03141701, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.818665998829751, + "language_loss": 0.80232978, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82373071, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.5951924324035645 + }, + { + "auxiliary_loss_clip": 0.01123416, + "auxiliary_loss_mlp": 0.01041737, + "balance_loss_clip": 1.04183388, + "balance_loss_mlp": 1.02393925, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.4349040381961156, + "language_loss": 0.75127602, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77292752, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 2.580125331878662 + }, + { + "auxiliary_loss_clip": 0.01111854, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.04541218, + "balance_loss_mlp": 1.02531755, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.63828135006847, + "language_loss": 0.66349053, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68502957, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.5906996726989746 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01048922, + "balance_loss_clip": 1.04353857, + "balance_loss_mlp": 1.03038502, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6115867183252706, + "language_loss": 0.69559813, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.71745634, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 2.5694658756256104 + }, + { + "auxiliary_loss_clip": 0.0112597, + "auxiliary_loss_mlp": 0.01045704, + "balance_loss_clip": 1.04471338, + "balance_loss_mlp": 1.02793026, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.5654659366267312, + "language_loss": 0.71920139, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.74091816, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.526266098022461 + }, + { + "auxiliary_loss_clip": 0.01109673, + "auxiliary_loss_mlp": 0.01050022, + "balance_loss_clip": 1.04116511, + "balance_loss_mlp": 1.03149688, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.202723259645847, + "language_loss": 0.80014098, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82173795, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.6869328022003174 + }, + { + "auxiliary_loss_clip": 0.01087639, + "auxiliary_loss_mlp": 0.01045704, + "balance_loss_clip": 1.04098415, + "balance_loss_mlp": 1.02968264, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 1.697566066130912, + "language_loss": 0.76725233, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78858578, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 2.684255599975586 + }, + { + "auxiliary_loss_clip": 0.01128053, + "auxiliary_loss_mlp": 0.01039186, + "balance_loss_clip": 1.04445672, + "balance_loss_mlp": 1.02241302, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.100823184291572, + "language_loss": 0.76550502, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.78717744, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 2.5440876483917236 + }, + { + "auxiliary_loss_clip": 0.01113238, + "auxiliary_loss_mlp": 0.01043482, + "balance_loss_clip": 1.04367018, + "balance_loss_mlp": 1.0269475, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 2.152861871414175, + "language_loss": 0.79058808, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81215525, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 2.568240165710449 + }, + { + "auxiliary_loss_clip": 0.01078048, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.0378201, + "balance_loss_mlp": 1.02727199, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.8172416886859544, + "language_loss": 0.85557908, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87681198, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 2.643192768096924 + }, + { + "auxiliary_loss_clip": 0.01099534, + "auxiliary_loss_mlp": 0.01048555, + "balance_loss_clip": 1.04090738, + "balance_loss_mlp": 1.02995801, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 2.089302231566735, + "language_loss": 0.86880809, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89028895, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 2.636624336242676 + }, + { + "auxiliary_loss_clip": 0.01123126, + "auxiliary_loss_mlp": 0.01042053, + "balance_loss_clip": 1.041875, + "balance_loss_mlp": 1.02544737, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.8844978239270271, + "language_loss": 0.78037965, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80203152, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 2.559540271759033 + }, + { + "auxiliary_loss_clip": 0.01127892, + "auxiliary_loss_mlp": 0.0105191, + "balance_loss_clip": 1.04315114, + "balance_loss_mlp": 1.03387403, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.964284301476147, + "language_loss": 0.74129105, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76308906, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 2.637016773223877 + }, + { + "auxiliary_loss_clip": 0.0108535, + "auxiliary_loss_mlp": 0.0104242, + "balance_loss_clip": 1.03983688, + "balance_loss_mlp": 1.02481246, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.4969642919538584, + "language_loss": 0.74561393, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76689166, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 2.626772880554199 + }, + { + "auxiliary_loss_clip": 0.01121442, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.04197192, + "balance_loss_mlp": 1.02516365, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.921899574927718, + "language_loss": 0.84502006, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86665857, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 4.11370062828064 + }, + { + "auxiliary_loss_clip": 0.01125445, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.04666841, + "balance_loss_mlp": 1.02160811, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.6759434155845616, + "language_loss": 0.84459287, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86623561, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.5439019203186035 + }, + { + "auxiliary_loss_clip": 0.01107439, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.04121006, + "balance_loss_mlp": 1.02151883, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8504428461090554, + "language_loss": 0.76674592, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.7882024, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 2.6079046726226807 + }, + { + "auxiliary_loss_clip": 0.01124384, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.0291177, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 2.534344717410584, + "language_loss": 0.64406961, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66576922, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 2.598055839538574 + }, + { + "auxiliary_loss_clip": 0.01128653, + "auxiliary_loss_mlp": 0.01041183, + "balance_loss_clip": 1.04360926, + "balance_loss_mlp": 1.02356434, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.096279301524285, + "language_loss": 0.81647229, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83817065, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 4.100393772125244 + }, + { + "auxiliary_loss_clip": 0.0108422, + "auxiliary_loss_mlp": 0.01049199, + "balance_loss_clip": 1.04114258, + "balance_loss_mlp": 1.03074527, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2201274999640583, + "language_loss": 0.57477856, + "learning_rate": 3.74605902628851e-06, + "loss": 0.59611273, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.7789793014526367 + }, + { + "auxiliary_loss_clip": 0.01097132, + "auxiliary_loss_mlp": 0.01052836, + "balance_loss_clip": 1.04313862, + "balance_loss_mlp": 1.03460884, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.6563517548979894, + "language_loss": 0.7101444, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73164409, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.612992286682129 + }, + { + "auxiliary_loss_clip": 0.01127396, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.03914857, + "balance_loss_mlp": 1.01414728, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 2.8796147985957825, + "language_loss": 0.79103124, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81260812, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.492182731628418 + }, + { + "auxiliary_loss_clip": 0.01114719, + "auxiliary_loss_mlp": 0.01045074, + "balance_loss_clip": 1.04432285, + "balance_loss_mlp": 1.02783585, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.6241114227592157, + "language_loss": 0.84066927, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86226726, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 2.6552929878234863 + }, + { + "auxiliary_loss_clip": 0.01125334, + "auxiliary_loss_mlp": 0.01045514, + "balance_loss_clip": 1.04390836, + "balance_loss_mlp": 1.02907491, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7504808769818705, + "language_loss": 0.7624287, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78413719, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 2.571861505508423 + }, + { + "auxiliary_loss_clip": 0.01135483, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.04261684, + "balance_loss_mlp": 1.02748358, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.5398614873458671, + "language_loss": 0.81803113, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.83982772, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 4.040276527404785 + }, + { + "auxiliary_loss_clip": 0.01108985, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.04019666, + "balance_loss_mlp": 1.02380776, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.6480724304190648, + "language_loss": 0.84845722, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.86994541, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.624608039855957 + }, + { + "auxiliary_loss_clip": 0.01072453, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.03640449, + "balance_loss_mlp": 1.0244627, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 1.7885848029697557, + "language_loss": 0.70003575, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72117072, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 4.225854873657227 + }, + { + "auxiliary_loss_clip": 0.01130848, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.04206061, + "balance_loss_mlp": 1.02482235, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.9974702180472264, + "language_loss": 0.70536911, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72710776, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.5202999114990234 + }, + { + "auxiliary_loss_clip": 0.01118998, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.04097354, + "balance_loss_mlp": 1.0271765, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 1.8602844450009477, + "language_loss": 0.73769903, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.75932288, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.531806707382202 + }, + { + "auxiliary_loss_clip": 0.01133032, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.04149842, + "balance_loss_mlp": 1.03064477, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.5833193216162265, + "language_loss": 0.80584908, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82766151, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.667344331741333 + }, + { + "auxiliary_loss_clip": 0.01006015, + "auxiliary_loss_mlp": 0.01006672, + "balance_loss_clip": 1.0153569, + "balance_loss_mlp": 1.00439537, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9350942724753845, + "language_loss": 0.63652891, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65665579, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 3.2494471073150635 + }, + { + "auxiliary_loss_clip": 0.01107195, + "auxiliary_loss_mlp": 0.01036914, + "balance_loss_clip": 1.04380548, + "balance_loss_mlp": 1.02049863, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.6197374298578002, + "language_loss": 0.80984139, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.8312825, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.669757604598999 + }, + { + "auxiliary_loss_clip": 0.01048403, + "auxiliary_loss_mlp": 0.01009239, + "balance_loss_clip": 1.01849532, + "balance_loss_mlp": 1.00704598, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.8583134904462751, + "language_loss": 0.61930883, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63988531, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.167961597442627 + }, + { + "auxiliary_loss_clip": 0.01082587, + "auxiliary_loss_mlp": 0.01052219, + "balance_loss_clip": 1.037323, + "balance_loss_mlp": 1.0333128, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.440535697362403, + "language_loss": 0.71184129, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73318934, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 2.7530243396759033 + }, + { + "auxiliary_loss_clip": 0.01132804, + "auxiliary_loss_mlp": 0.01050293, + "balance_loss_clip": 1.04217482, + "balance_loss_mlp": 1.03281689, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.712287995470163, + "language_loss": 0.85242516, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87425613, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.5256283283233643 + }, + { + "auxiliary_loss_clip": 0.0108772, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_clip": 1.03847814, + "balance_loss_mlp": 1.02706861, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 2.2844338218808464, + "language_loss": 0.76453876, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.78587496, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.9200596809387207 + }, + { + "auxiliary_loss_clip": 0.01101995, + "auxiliary_loss_mlp": 0.01045851, + "balance_loss_clip": 1.04271221, + "balance_loss_mlp": 1.02805293, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.9423987114105679, + "language_loss": 0.80822945, + "learning_rate": 3.74282069289017e-06, + "loss": 0.82970786, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.812554359436035 + }, + { + "auxiliary_loss_clip": 0.0106875, + "auxiliary_loss_mlp": 0.00750908, + "balance_loss_clip": 1.03679132, + "balance_loss_mlp": 1.00014329, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 1.6889897660519844, + "language_loss": 0.79378277, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81197941, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.7244625091552734 + }, + { + "auxiliary_loss_clip": 0.0109761, + "auxiliary_loss_mlp": 0.01052476, + "balance_loss_clip": 1.04286194, + "balance_loss_mlp": 1.03398633, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 1.7382693581514612, + "language_loss": 0.8253988, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.84689963, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.6708884239196777 + }, + { + "auxiliary_loss_clip": 0.01107959, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.03973174, + "balance_loss_mlp": 1.02568328, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.492879814253326, + "language_loss": 0.82925278, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8507691, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.634096622467041 + }, + { + "auxiliary_loss_clip": 0.01120553, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_clip": 1.04109609, + "balance_loss_mlp": 1.0300684, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.745827721137657, + "language_loss": 0.78725052, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80893016, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 2.702432870864868 + }, + { + "auxiliary_loss_clip": 0.01109644, + "auxiliary_loss_mlp": 0.01044537, + "balance_loss_clip": 1.04101753, + "balance_loss_mlp": 1.02619052, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 2.546941520034091, + "language_loss": 0.80903244, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83057427, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.6037096977233887 + }, + { + "auxiliary_loss_clip": 0.01137959, + "auxiliary_loss_mlp": 0.01049861, + "balance_loss_clip": 1.04638958, + "balance_loss_mlp": 1.03391099, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.6339243239772236, + "language_loss": 0.81204474, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83392298, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 2.5480077266693115 + }, + { + "auxiliary_loss_clip": 0.01127737, + "auxiliary_loss_mlp": 0.01052337, + "balance_loss_clip": 1.04388595, + "balance_loss_mlp": 1.03497982, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 3.151228235780881, + "language_loss": 0.63561296, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.65741372, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 2.723144769668579 + }, + { + "auxiliary_loss_clip": 0.01132382, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.04103804, + "balance_loss_mlp": 1.02450037, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 1.9315445868987995, + "language_loss": 0.71487653, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73662686, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 2.5511178970336914 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_clip": 1.04019582, + "balance_loss_mlp": 1.02442753, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 3.482197829549567, + "language_loss": 0.87337816, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89513195, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 2.4783413410186768 + }, + { + "auxiliary_loss_clip": 0.01107255, + "auxiliary_loss_mlp": 0.01043509, + "balance_loss_clip": 1.04005277, + "balance_loss_mlp": 1.024948, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.1685325276697327, + "language_loss": 0.77240086, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79390848, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.58113431930542 + }, + { + "auxiliary_loss_clip": 0.01109443, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.04380584, + "balance_loss_mlp": 1.02297807, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.811478598132262, + "language_loss": 0.78475082, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80623245, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 2.731675386428833 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01048653, + "balance_loss_clip": 1.03890824, + "balance_loss_mlp": 1.03111732, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.0240991400056445, + "language_loss": 0.70757616, + "learning_rate": 3.740523309097912e-06, + "loss": 0.72902519, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 2.6522936820983887 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.03973222, + "balance_loss_mlp": 1.0262537, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.312598411342106, + "language_loss": 0.739618, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.76107347, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 2.60085129737854 + }, + { + "auxiliary_loss_clip": 0.01085983, + "auxiliary_loss_mlp": 0.0104256, + "balance_loss_clip": 1.03615189, + "balance_loss_mlp": 1.02602565, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.110705637194767, + "language_loss": 0.76398152, + "learning_rate": 3.740139487448616e-06, + "loss": 0.785267, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.699741840362549 + }, + { + "auxiliary_loss_clip": 0.01069364, + "auxiliary_loss_mlp": 0.01052049, + "balance_loss_clip": 1.03503966, + "balance_loss_mlp": 1.03297532, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.726167497028595, + "language_loss": 0.78725851, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80847263, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.660367488861084 + }, + { + "auxiliary_loss_clip": 0.01121411, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_clip": 1.04124475, + "balance_loss_mlp": 1.02812481, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 2.3233822816544683, + "language_loss": 0.67138088, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69304729, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.548205614089966 + }, + { + "auxiliary_loss_clip": 0.01090741, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.03681946, + "balance_loss_mlp": 1.0158236, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 2.237947595083816, + "language_loss": 0.7581442, + "learning_rate": 3.739563260095902e-06, + "loss": 0.77938473, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 2.6451621055603027 + }, + { + "auxiliary_loss_clip": 0.01108505, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.04252589, + "balance_loss_mlp": 1.02833021, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.017418074853098, + "language_loss": 0.80248761, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.82401496, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 4.0588743686676025 + }, + { + "auxiliary_loss_clip": 0.01113973, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.04253852, + "balance_loss_mlp": 1.03141201, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.162874264305945, + "language_loss": 0.85429919, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87592906, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 2.5535244941711426 + }, + { + "auxiliary_loss_clip": 0.01100427, + "auxiliary_loss_mlp": 0.01048848, + "balance_loss_clip": 1.04092133, + "balance_loss_mlp": 1.03183722, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.866922446867482, + "language_loss": 0.74541259, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76690537, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 2.7480902671813965 + }, + { + "auxiliary_loss_clip": 0.0109428, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.03858435, + "balance_loss_mlp": 1.02518702, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.8092947759941556, + "language_loss": 0.75510061, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77646971, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.637298345565796 + }, + { + "auxiliary_loss_clip": 0.0113511, + "auxiliary_loss_mlp": 0.0104379, + "balance_loss_clip": 1.04342937, + "balance_loss_mlp": 1.02766109, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 3.0582651988938014, + "language_loss": 0.79333556, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81512451, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 4.018051862716675 + }, + { + "auxiliary_loss_clip": 0.010922, + "auxiliary_loss_mlp": 0.01046023, + "balance_loss_clip": 1.03689742, + "balance_loss_mlp": 1.02731943, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.8967169158607016, + "language_loss": 0.727988, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74937028, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 2.6063342094421387 + }, + { + "auxiliary_loss_clip": 0.0110379, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.04014528, + "balance_loss_mlp": 1.02356648, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.647868819264261, + "language_loss": 0.74261177, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76404738, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 2.642500877380371 + }, + { + "auxiliary_loss_clip": 0.01133995, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_clip": 1.04317617, + "balance_loss_mlp": 1.02746129, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.6939742395909287, + "language_loss": 0.6797893, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70156515, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 2.6213138103485107 + }, + { + "auxiliary_loss_clip": 0.01093781, + "auxiliary_loss_mlp": 0.01039613, + "balance_loss_clip": 1.0383743, + "balance_loss_mlp": 1.02289963, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.859045959933542, + "language_loss": 0.79539567, + "learning_rate": 3.737831016747176e-06, + "loss": 0.81672961, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.6837713718414307 + }, + { + "auxiliary_loss_clip": 0.01138917, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.0444808, + "balance_loss_mlp": 1.02151489, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 2.2789336486360416, + "language_loss": 0.71893072, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74071229, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 2.559122085571289 + }, + { + "auxiliary_loss_clip": 0.01125119, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.04446793, + "balance_loss_mlp": 1.03248191, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.885066667240579, + "language_loss": 0.84929669, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87104881, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 2.5298025608062744 + }, + { + "auxiliary_loss_clip": 0.01108249, + "auxiliary_loss_mlp": 0.010436, + "balance_loss_clip": 1.04174745, + "balance_loss_mlp": 1.02816296, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.8990247271284364, + "language_loss": 0.73561585, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75713432, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 4.129668951034546 + }, + { + "auxiliary_loss_clip": 0.01115632, + "auxiliary_loss_mlp": 0.0105007, + "balance_loss_clip": 1.04334533, + "balance_loss_mlp": 1.03206944, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 2.3860641521251713, + "language_loss": 0.80822837, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.82988536, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 4.166968107223511 + }, + { + "auxiliary_loss_clip": 0.0113457, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_clip": 1.04499149, + "balance_loss_mlp": 1.0247525, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 3.0957123247883964, + "language_loss": 0.75214642, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77391088, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.495431661605835 + }, + { + "auxiliary_loss_clip": 0.0107876, + "auxiliary_loss_mlp": 0.01047354, + "balance_loss_clip": 1.04111743, + "balance_loss_mlp": 1.02965176, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.5842168146312117, + "language_loss": 0.74398333, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76524448, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.7871906757354736 + }, + { + "auxiliary_loss_clip": 0.01121204, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.04354739, + "balance_loss_mlp": 1.01923895, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.625731930066241, + "language_loss": 0.67020184, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69177389, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 2.8997371196746826 + }, + { + "auxiliary_loss_clip": 0.01123111, + "auxiliary_loss_mlp": 0.01045356, + "balance_loss_clip": 1.04329848, + "balance_loss_mlp": 1.02762985, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.05465431849877, + "language_loss": 0.74565339, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76733804, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.5265207290649414 + }, + { + "auxiliary_loss_clip": 0.01022688, + "auxiliary_loss_mlp": 0.01013152, + "balance_loss_clip": 1.02577102, + "balance_loss_mlp": 1.01087558, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.7847111908102988, + "language_loss": 0.50321341, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52357185, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 3.1977295875549316 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01043842, + "balance_loss_clip": 1.04308188, + "balance_loss_mlp": 1.02724767, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.812746638092246, + "language_loss": 0.74516952, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76678276, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.6295166015625 + }, + { + "auxiliary_loss_clip": 0.01003052, + "auxiliary_loss_mlp": 0.01018537, + "balance_loss_clip": 1.01345348, + "balance_loss_mlp": 1.01614141, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8628153290655859, + "language_loss": 0.60056567, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62078154, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.109490156173706 + }, + { + "auxiliary_loss_clip": 0.01080447, + "auxiliary_loss_mlp": 0.0104871, + "balance_loss_clip": 1.04241502, + "balance_loss_mlp": 1.03156805, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.91346839911398, + "language_loss": 0.7836712, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80496275, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.7185192108154297 + }, + { + "auxiliary_loss_clip": 0.01119866, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.04180884, + "balance_loss_mlp": 1.02569556, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.6953102015479116, + "language_loss": 0.78434849, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80597138, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.5784378051757812 + }, + { + "auxiliary_loss_clip": 0.01136054, + "auxiliary_loss_mlp": 0.0103921, + "balance_loss_clip": 1.04242539, + "balance_loss_mlp": 1.02184153, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 2.780821510422536, + "language_loss": 0.78559053, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80734313, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.5933971405029297 + }, + { + "auxiliary_loss_clip": 0.01119547, + "auxiliary_loss_mlp": 0.01048051, + "balance_loss_clip": 1.04052281, + "balance_loss_mlp": 1.03132629, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.5473575094904455, + "language_loss": 0.79619932, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.81787533, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.525890588760376 + }, + { + "auxiliary_loss_clip": 0.01088729, + "auxiliary_loss_mlp": 0.00750716, + "balance_loss_clip": 1.03933227, + "balance_loss_mlp": 1.00011587, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 3.5106558014156417, + "language_loss": 0.78933227, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80772668, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.652834177017212 + }, + { + "auxiliary_loss_clip": 0.0108197, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.0389179, + "balance_loss_mlp": 1.02350616, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.896123452196187, + "language_loss": 0.80564129, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82686424, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 2.6330127716064453 + }, + { + "auxiliary_loss_clip": 0.0105587, + "auxiliary_loss_mlp": 0.010539, + "balance_loss_clip": 1.03584218, + "balance_loss_mlp": 1.03580403, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.47979201961474, + "language_loss": 0.85555536, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87665308, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 2.662468194961548 + }, + { + "auxiliary_loss_clip": 0.01114476, + "auxiliary_loss_mlp": 0.0105057, + "balance_loss_clip": 1.04587913, + "balance_loss_mlp": 1.03180623, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.12224564338438, + "language_loss": 0.81544113, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83709157, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.6384825706481934 + }, + { + "auxiliary_loss_clip": 0.010969, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.03732026, + "balance_loss_mlp": 1.02287722, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 2.81811033027082, + "language_loss": 0.7460497, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76742011, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 2.7139663696289062 + }, + { + "auxiliary_loss_clip": 0.01126039, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.04545522, + "balance_loss_mlp": 1.01836228, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.7101118137534352, + "language_loss": 0.81230873, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.8339169, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 2.6006877422332764 + }, + { + "auxiliary_loss_clip": 0.01127022, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_clip": 1.04648113, + "balance_loss_mlp": 1.0257237, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.4691312927612157, + "language_loss": 0.79518831, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81688309, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 2.5304901599884033 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01045774, + "balance_loss_clip": 1.04461217, + "balance_loss_mlp": 1.02813053, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.2204036851132414, + "language_loss": 0.79371673, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81527865, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.01124292, + "auxiliary_loss_mlp": 0.01050816, + "balance_loss_clip": 1.04518604, + "balance_loss_mlp": 1.0344131, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7488666884388884, + "language_loss": 0.73808318, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.75983429, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 2.5648038387298584 + }, + { + "auxiliary_loss_clip": 0.01108184, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.04512787, + "balance_loss_mlp": 1.02513671, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.6442361692046634, + "language_loss": 0.64364004, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.66514516, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 2.6398699283599854 + }, + { + "auxiliary_loss_clip": 0.01110309, + "auxiliary_loss_mlp": 0.01045121, + "balance_loss_clip": 1.04136968, + "balance_loss_mlp": 1.02729869, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.5447349826085401, + "language_loss": 0.73153663, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75309098, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.623460054397583 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01046299, + "balance_loss_clip": 1.04252648, + "balance_loss_mlp": 1.02722573, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 1.7094204050211854, + "language_loss": 0.87981105, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90130424, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.588808536529541 + }, + { + "auxiliary_loss_clip": 0.01132996, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.0436306, + "balance_loss_mlp": 1.02362144, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 1.9989752949715547, + "language_loss": 0.72586626, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.74760401, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.557135581970215 + }, + { + "auxiliary_loss_clip": 0.01114021, + "auxiliary_loss_mlp": 0.01039546, + "balance_loss_clip": 1.0458957, + "balance_loss_mlp": 1.01997232, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.2182416650050873, + "language_loss": 0.83468008, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85621572, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.6401658058166504 + }, + { + "auxiliary_loss_clip": 0.01043501, + "auxiliary_loss_mlp": 0.01011794, + "balance_loss_clip": 1.02339363, + "balance_loss_mlp": 1.00948131, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.9583595498256764, + "language_loss": 0.5584538, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57900667, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 3.1476802825927734 + }, + { + "auxiliary_loss_clip": 0.01119189, + "auxiliary_loss_mlp": 0.01049982, + "balance_loss_clip": 1.04364192, + "balance_loss_mlp": 1.0328517, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.7795438238740549, + "language_loss": 0.70221347, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72390521, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 2.615292549133301 + }, + { + "auxiliary_loss_clip": 0.01098167, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.03912878, + "balance_loss_mlp": 1.01843107, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 1.9307151261292608, + "language_loss": 0.7382828, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.75960928, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.631300926208496 + }, + { + "auxiliary_loss_clip": 0.0108823, + "auxiliary_loss_mlp": 0.01061767, + "balance_loss_clip": 1.03848052, + "balance_loss_mlp": 1.04355204, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 2.115073697220784, + "language_loss": 0.83947998, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86097991, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 4.164889097213745 + }, + { + "auxiliary_loss_clip": 0.01104486, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_clip": 1.03919125, + "balance_loss_mlp": 1.02641654, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.9446525075505496, + "language_loss": 0.89633209, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91780174, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.607733964920044 + }, + { + "auxiliary_loss_clip": 0.01107605, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_clip": 1.04506826, + "balance_loss_mlp": 1.03248775, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 1.7055927164710263, + "language_loss": 0.74776053, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.76934803, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 2.6342806816101074 + }, + { + "auxiliary_loss_clip": 0.01105708, + "auxiliary_loss_mlp": 0.00750811, + "balance_loss_clip": 1.04135442, + "balance_loss_mlp": 1.00011969, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.797198852998035, + "language_loss": 0.74771738, + "learning_rate": 3.730848718849612e-06, + "loss": 0.76628262, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 4.110499620437622 + }, + { + "auxiliary_loss_clip": 0.01032817, + "auxiliary_loss_mlp": 0.01006996, + "balance_loss_clip": 1.01370072, + "balance_loss_mlp": 1.0047307, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7997869422005153, + "language_loss": 0.68487108, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70526922, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 3.021894931793213 + }, + { + "auxiliary_loss_clip": 0.01094632, + "auxiliary_loss_mlp": 0.0106093, + "balance_loss_clip": 1.04273367, + "balance_loss_mlp": 1.04171312, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 2.265426016448669, + "language_loss": 0.72993052, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75148612, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 2.597203254699707 + }, + { + "auxiliary_loss_clip": 0.0111274, + "auxiliary_loss_mlp": 0.01051613, + "balance_loss_clip": 1.04536176, + "balance_loss_mlp": 1.03407705, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.7900369583897708, + "language_loss": 0.83769691, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85934043, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 2.60237979888916 + }, + { + "auxiliary_loss_clip": 0.01068024, + "auxiliary_loss_mlp": 0.01053101, + "balance_loss_clip": 1.04069018, + "balance_loss_mlp": 1.03293109, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.700006219795214, + "language_loss": 0.80249631, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82370758, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 2.6891562938690186 + }, + { + "auxiliary_loss_clip": 0.01113897, + "auxiliary_loss_mlp": 0.01057382, + "balance_loss_clip": 1.04257584, + "balance_loss_mlp": 1.03896451, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 2.042106793031977, + "language_loss": 0.78805339, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80976617, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 2.5699024200439453 + }, + { + "auxiliary_loss_clip": 0.01088417, + "auxiliary_loss_mlp": 0.0105399, + "balance_loss_clip": 1.03839266, + "balance_loss_mlp": 1.03564382, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.636534946369555, + "language_loss": 0.83792245, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85934651, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 4.0467894077301025 + }, + { + "auxiliary_loss_clip": 0.01134267, + "auxiliary_loss_mlp": 0.01051047, + "balance_loss_clip": 1.0455091, + "balance_loss_mlp": 1.03485882, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 1.7190314437334129, + "language_loss": 0.7940625, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81591558, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.484703302383423 + }, + { + "auxiliary_loss_clip": 0.01078712, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_clip": 1.038728, + "balance_loss_mlp": 1.03423667, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.099597874834986, + "language_loss": 0.69337338, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71467197, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 4.172295570373535 + }, + { + "auxiliary_loss_clip": 0.01108117, + "auxiliary_loss_mlp": 0.01037769, + "balance_loss_clip": 1.0419085, + "balance_loss_mlp": 1.02107406, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 2.0840131230505676, + "language_loss": 0.9101032, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93156201, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.5401771068573 + }, + { + "auxiliary_loss_clip": 0.01125506, + "auxiliary_loss_mlp": 0.01047525, + "balance_loss_clip": 1.04461539, + "balance_loss_mlp": 1.03033495, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.668995601710422, + "language_loss": 0.81427407, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83600438, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 2.542445421218872 + }, + { + "auxiliary_loss_clip": 0.01094729, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.04093325, + "balance_loss_mlp": 1.0249505, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 1.9020907982157038, + "language_loss": 0.75740099, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77876049, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.6253647804260254 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.045609, + "balance_loss_mlp": 1.02726007, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.8676111572127447, + "language_loss": 0.8373971, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85899556, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 2.5641138553619385 + }, + { + "auxiliary_loss_clip": 0.01029307, + "auxiliary_loss_mlp": 0.01003273, + "balance_loss_clip": 1.01846504, + "balance_loss_mlp": 1.0010798, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8590846715826663, + "language_loss": 0.60592413, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62624991, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 2.9712014198303223 + }, + { + "auxiliary_loss_clip": 0.01096859, + "auxiliary_loss_mlp": 0.01040865, + "balance_loss_clip": 1.04083753, + "balance_loss_mlp": 1.02475429, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.3615572239133575, + "language_loss": 0.75457978, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77595705, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.618411064147949 + }, + { + "auxiliary_loss_clip": 0.01122155, + "auxiliary_loss_mlp": 0.00750687, + "balance_loss_clip": 1.04053235, + "balance_loss_mlp": 1.0000633, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.8074781162467692, + "language_loss": 0.60179919, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62052763, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.5450894832611084 + }, + { + "auxiliary_loss_clip": 0.01132737, + "auxiliary_loss_mlp": 0.01052306, + "balance_loss_clip": 1.04141724, + "balance_loss_mlp": 1.03390026, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9065022551774002, + "language_loss": 0.80241358, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82426405, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.66982102394104 + }, + { + "auxiliary_loss_clip": 0.01092087, + "auxiliary_loss_mlp": 0.01039919, + "balance_loss_clip": 1.03754091, + "balance_loss_mlp": 1.02412438, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.213258306992403, + "language_loss": 0.82803774, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84935784, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.7024264335632324 + }, + { + "auxiliary_loss_clip": 0.01045479, + "auxiliary_loss_mlp": 0.01005361, + "balance_loss_clip": 1.0162077, + "balance_loss_mlp": 1.00291729, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9671836862746699, + "language_loss": 0.63653815, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65704656, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 2.938995838165283 + }, + { + "auxiliary_loss_clip": 0.01111254, + "auxiliary_loss_mlp": 0.01045366, + "balance_loss_clip": 1.04578865, + "balance_loss_mlp": 1.02908254, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.8278581763377424, + "language_loss": 0.76087892, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78244507, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 2.614035129547119 + }, + { + "auxiliary_loss_clip": 0.01100978, + "auxiliary_loss_mlp": 0.01042028, + "balance_loss_clip": 1.04189813, + "balance_loss_mlp": 1.02501655, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 2.2860553812295663, + "language_loss": 0.71028131, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73171139, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.596869468688965 + }, + { + "auxiliary_loss_clip": 0.0112677, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.03886032, + "balance_loss_mlp": 1.02252114, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 3.961232100704492, + "language_loss": 0.75478148, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77644604, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.01089431, + "auxiliary_loss_mlp": 0.01044315, + "balance_loss_clip": 1.03763819, + "balance_loss_mlp": 1.02842426, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.1971058032625432, + "language_loss": 0.882972, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90430945, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 2.651374101638794 + }, + { + "auxiliary_loss_clip": 0.01128136, + "auxiliary_loss_mlp": 0.01046401, + "balance_loss_clip": 1.04147363, + "balance_loss_mlp": 1.03122544, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.7914258723683052, + "language_loss": 0.79743391, + "learning_rate": 3.726343252048485e-06, + "loss": 0.8191793, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.497971534729004 + }, + { + "auxiliary_loss_clip": 0.01117798, + "auxiliary_loss_mlp": 0.01045733, + "balance_loss_clip": 1.04482222, + "balance_loss_mlp": 1.0273751, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.4764163798344936, + "language_loss": 0.62033308, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.64196849, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 2.549285888671875 + }, + { + "auxiliary_loss_clip": 0.01130582, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.04174328, + "balance_loss_mlp": 1.02431953, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 4.0400485829233626, + "language_loss": 0.80308878, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82480109, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 2.5394043922424316 + }, + { + "auxiliary_loss_clip": 0.01077884, + "auxiliary_loss_mlp": 0.01045424, + "balance_loss_clip": 1.03879523, + "balance_loss_mlp": 1.02778149, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 1.9988607862288683, + "language_loss": 0.85074949, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87198257, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 2.701775312423706 + }, + { + "auxiliary_loss_clip": 0.01123078, + "auxiliary_loss_mlp": 0.01035868, + "balance_loss_clip": 1.04001927, + "balance_loss_mlp": 1.02105021, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.028038740843663, + "language_loss": 0.83830422, + "learning_rate": 3.725556155051766e-06, + "loss": 0.85989368, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 2.5646536350250244 + }, + { + "auxiliary_loss_clip": 0.01117503, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.04163265, + "balance_loss_mlp": 1.02663374, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 5.684696526592835, + "language_loss": 0.86184537, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88343835, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 2.555492639541626 + }, + { + "auxiliary_loss_clip": 0.0104497, + "auxiliary_loss_mlp": 0.01040844, + "balance_loss_clip": 1.03241348, + "balance_loss_mlp": 1.02361798, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 1.7924115386440924, + "language_loss": 0.78245109, + "learning_rate": 3.72516221392398e-06, + "loss": 0.8033092, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 3.026317834854126 + }, + { + "auxiliary_loss_clip": 0.01116611, + "auxiliary_loss_mlp": 0.01040587, + "balance_loss_clip": 1.04084754, + "balance_loss_mlp": 1.02445793, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.742369880990871, + "language_loss": 0.75108594, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77265793, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.8404271602630615 + }, + { + "auxiliary_loss_clip": 0.01073039, + "auxiliary_loss_mlp": 0.010445, + "balance_loss_clip": 1.03534257, + "balance_loss_mlp": 1.02648711, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.890393456017088, + "language_loss": 0.70749497, + "learning_rate": 3.7247680111229e-06, + "loss": 0.72867036, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 2.901716709136963 + }, + { + "auxiliary_loss_clip": 0.01088202, + "auxiliary_loss_mlp": 0.01040325, + "balance_loss_clip": 1.03531671, + "balance_loss_mlp": 1.02464938, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.2790468347161372, + "language_loss": 0.69341075, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71469605, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 2.5909953117370605 + }, + { + "auxiliary_loss_clip": 0.01096526, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.04055262, + "balance_loss_mlp": 1.02039075, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.4848768971236916, + "language_loss": 0.76157528, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78291386, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 2.6598830223083496 + }, + { + "auxiliary_loss_clip": 0.01094854, + "auxiliary_loss_mlp": 0.01037878, + "balance_loss_clip": 1.04028583, + "balance_loss_mlp": 1.02216625, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 2.608274620812265, + "language_loss": 0.69516563, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71649295, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.608980178833008 + }, + { + "auxiliary_loss_clip": 0.01121075, + "auxiliary_loss_mlp": 0.01042494, + "balance_loss_clip": 1.04216814, + "balance_loss_mlp": 1.02632928, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.8511638395808214, + "language_loss": 0.74246502, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76410073, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.5955100059509277 + }, + { + "auxiliary_loss_clip": 0.01092799, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.03902483, + "balance_loss_mlp": 1.02433181, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.9499941396062581, + "language_loss": 0.65293849, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.6742714, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.632005453109741 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.00750298, + "balance_loss_clip": 1.03948069, + "balance_loss_mlp": 1.00008011, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8273868322424829, + "language_loss": 0.8186295, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83713245, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 4.105808734893799 + }, + { + "auxiliary_loss_clip": 0.01103334, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_clip": 1.04036617, + "balance_loss_mlp": 1.02378774, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.7798487931820848, + "language_loss": 0.86912853, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89057773, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 2.5828890800476074 + }, + { + "auxiliary_loss_clip": 0.01069797, + "auxiliary_loss_mlp": 0.01050116, + "balance_loss_clip": 1.03602886, + "balance_loss_mlp": 1.03231812, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5888151051057264, + "language_loss": 0.85309362, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87429273, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 2.727095127105713 + }, + { + "auxiliary_loss_clip": 0.01122708, + "auxiliary_loss_mlp": 0.01049446, + "balance_loss_clip": 1.04246593, + "balance_loss_mlp": 1.03329337, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.8388094591320574, + "language_loss": 0.89267498, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91439652, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 2.547940492630005 + }, + { + "auxiliary_loss_clip": 0.01107987, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.03834057, + "balance_loss_mlp": 1.02121162, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.646829467074615, + "language_loss": 0.78510654, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80657029, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 4.201201677322388 + }, + { + "auxiliary_loss_clip": 0.01110164, + "auxiliary_loss_mlp": 0.01041702, + "balance_loss_clip": 1.04329932, + "balance_loss_mlp": 1.02644897, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 1.8427554505304717, + "language_loss": 0.79227793, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81379652, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 2.5958540439605713 + }, + { + "auxiliary_loss_clip": 0.01128767, + "auxiliary_loss_mlp": 0.01037969, + "balance_loss_clip": 1.04243755, + "balance_loss_mlp": 1.02114916, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 1.6027782163505193, + "language_loss": 0.75811905, + "learning_rate": 3.72239730252843e-06, + "loss": 0.77978647, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 2.5558598041534424 + }, + { + "auxiliary_loss_clip": 0.0113247, + "auxiliary_loss_mlp": 0.01043413, + "balance_loss_clip": 1.04155731, + "balance_loss_mlp": 1.02751052, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 1.6272779711560361, + "language_loss": 0.74912, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77087879, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.5503337383270264 + }, + { + "auxiliary_loss_clip": 0.01079409, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.03513765, + "balance_loss_mlp": 1.02476466, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8189711254136778, + "language_loss": 0.73347878, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75469255, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 2.654848575592041 + }, + { + "auxiliary_loss_clip": 0.01104189, + "auxiliary_loss_mlp": 0.01049061, + "balance_loss_clip": 1.03878975, + "balance_loss_mlp": 1.03205538, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.807214745900871, + "language_loss": 0.736238, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75777054, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 4.099200487136841 + }, + { + "auxiliary_loss_clip": 0.01105536, + "auxiliary_loss_mlp": 0.01039849, + "balance_loss_clip": 1.04320478, + "balance_loss_mlp": 1.02385139, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8844241667799662, + "language_loss": 0.66479027, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68624413, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.6225357055664062 + }, + { + "auxiliary_loss_clip": 0.01098644, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.03761256, + "balance_loss_mlp": 1.02224183, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.5656679020190425, + "language_loss": 0.83154249, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85291332, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 4.122512340545654 + }, + { + "auxiliary_loss_clip": 0.01042454, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.01366758, + "balance_loss_mlp": 1.02820897, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8298580130629474, + "language_loss": 0.57484704, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59557396, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.1173317432403564 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.010455, + "balance_loss_clip": 1.03807437, + "balance_loss_mlp": 1.02751112, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 1.7830667395669735, + "language_loss": 0.83301592, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85455531, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 2.5570037364959717 + }, + { + "auxiliary_loss_clip": 0.0112028, + "auxiliary_loss_mlp": 0.01043584, + "balance_loss_clip": 1.0448786, + "balance_loss_mlp": 1.02760386, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.7228955323520518, + "language_loss": 0.77158403, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79322273, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.600945472717285 + }, + { + "auxiliary_loss_clip": 0.01120011, + "auxiliary_loss_mlp": 0.01038971, + "balance_loss_clip": 1.04261756, + "balance_loss_mlp": 1.02185285, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.0555048607532087, + "language_loss": 0.83912027, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86071014, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.608497381210327 + }, + { + "auxiliary_loss_clip": 0.01121156, + "auxiliary_loss_mlp": 0.00750558, + "balance_loss_clip": 1.04189849, + "balance_loss_mlp": 1.00002289, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.382937063674583, + "language_loss": 0.76501518, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.7837323, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 2.588644027709961 + }, + { + "auxiliary_loss_clip": 0.01097232, + "auxiliary_loss_mlp": 0.01039912, + "balance_loss_clip": 1.04287815, + "balance_loss_mlp": 1.02368796, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.704147768399019, + "language_loss": 0.75423825, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77560973, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.7100937366485596 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.01043872, + "balance_loss_clip": 1.04084003, + "balance_loss_mlp": 1.02736759, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 3.575329003922402, + "language_loss": 0.78089809, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80263555, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.62276291847229 + }, + { + "auxiliary_loss_clip": 0.01121669, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.04202986, + "balance_loss_mlp": 1.02742195, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.5393172876377172, + "language_loss": 0.73010302, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75176013, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 2.6177139282226562 + }, + { + "auxiliary_loss_clip": 0.01074153, + "auxiliary_loss_mlp": 0.01040163, + "balance_loss_clip": 1.03864813, + "balance_loss_mlp": 1.02374816, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 2.212234391988997, + "language_loss": 0.7958138, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81695694, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.6956992149353027 + }, + { + "auxiliary_loss_clip": 0.0113022, + "auxiliary_loss_mlp": 0.01041108, + "balance_loss_clip": 1.04168856, + "balance_loss_mlp": 1.02437139, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 2.2095643377736334, + "language_loss": 0.83977199, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86148524, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.5572896003723145 + }, + { + "auxiliary_loss_clip": 0.01112445, + "auxiliary_loss_mlp": 0.0105539, + "balance_loss_clip": 1.0380547, + "balance_loss_mlp": 1.03470683, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.6629739185300059, + "language_loss": 0.7358377, + "learning_rate": 3.719221729768117e-06, + "loss": 0.75751609, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.624150276184082 + }, + { + "auxiliary_loss_clip": 0.01074254, + "auxiliary_loss_mlp": 0.0104603, + "balance_loss_clip": 1.03574216, + "balance_loss_mlp": 1.02861381, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.7775180521494565, + "language_loss": 0.76504242, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78624523, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 2.6863348484039307 + }, + { + "auxiliary_loss_clip": 0.01008143, + "auxiliary_loss_mlp": 0.01005495, + "balance_loss_clip": 1.02226448, + "balance_loss_mlp": 1.00331342, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7764285114605568, + "language_loss": 0.55354041, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57367682, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.2339375019073486 + }, + { + "auxiliary_loss_clip": 0.01110031, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.04281569, + "balance_loss_mlp": 1.02403831, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.557650169259149, + "language_loss": 0.70162368, + "learning_rate": 3.718624450942688e-06, + "loss": 0.72313237, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.5723555088043213 + }, + { + "auxiliary_loss_clip": 0.01127337, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.0400151, + "balance_loss_mlp": 1.02420259, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.5396739429421027, + "language_loss": 0.80400604, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82568419, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 2.5162720680236816 + }, + { + "auxiliary_loss_clip": 0.01082657, + "auxiliary_loss_mlp": 0.01044852, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.02856851, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.999849391648492, + "language_loss": 0.75179082, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77306592, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 2.70535945892334 + }, + { + "auxiliary_loss_clip": 0.0105789, + "auxiliary_loss_mlp": 0.01050492, + "balance_loss_clip": 1.03440666, + "balance_loss_mlp": 1.03203869, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.6838822788920895, + "language_loss": 0.7341184, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.75520223, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 2.7342634201049805 + }, + { + "auxiliary_loss_clip": 0.01109402, + "auxiliary_loss_mlp": 0.01044818, + "balance_loss_clip": 1.04165459, + "balance_loss_mlp": 1.02673352, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.268943129985135, + "language_loss": 0.76770568, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.78924787, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 2.614396572113037 + }, + { + "auxiliary_loss_clip": 0.01121486, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.0408324, + "balance_loss_mlp": 1.02645278, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.2873838360045164, + "language_loss": 0.81983447, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84148049, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 2.5556390285491943 + }, + { + "auxiliary_loss_clip": 0.01096066, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.04163802, + "balance_loss_mlp": 1.02646291, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.8764671467455105, + "language_loss": 0.76767725, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78907055, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.664200782775879 + }, + { + "auxiliary_loss_clip": 0.01124088, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_clip": 1.04651821, + "balance_loss_mlp": 1.03576267, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.9414515965258419, + "language_loss": 0.86511266, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88686991, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.6540558338165283 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01050212, + "balance_loss_clip": 1.04020476, + "balance_loss_mlp": 1.03381491, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.6226718772183166, + "language_loss": 0.74051964, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76209152, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 2.6272265911102295 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01048605, + "balance_loss_clip": 1.04458928, + "balance_loss_mlp": 1.03311968, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 1.9748081068084833, + "language_loss": 0.78379279, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.8054691, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.5566301345825195 + }, + { + "auxiliary_loss_clip": 0.009995, + "auxiliary_loss_mlp": 0.01063126, + "balance_loss_clip": 1.01092744, + "balance_loss_mlp": 1.06095636, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7981151548936691, + "language_loss": 0.5348773, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55550349, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.215195894241333 + }, + { + "auxiliary_loss_clip": 0.01106585, + "auxiliary_loss_mlp": 0.00750528, + "balance_loss_clip": 1.04244506, + "balance_loss_mlp": 1.00008833, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9651843612776718, + "language_loss": 0.80068469, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.81925583, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 2.5893194675445557 + }, + { + "auxiliary_loss_clip": 0.01107745, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.04233682, + "balance_loss_mlp": 1.03008091, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.0150323206391483, + "language_loss": 0.86555421, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88709176, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 2.601182222366333 + }, + { + "auxiliary_loss_clip": 0.01083455, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.0436219, + "balance_loss_mlp": 1.02181673, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 1.9586189011731756, + "language_loss": 0.69123405, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71244729, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 2.6504580974578857 + }, + { + "auxiliary_loss_clip": 0.01091259, + "auxiliary_loss_mlp": 0.01051966, + "balance_loss_clip": 1.04177999, + "balance_loss_mlp": 1.03414416, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 1.984737056790609, + "language_loss": 0.80512512, + "learning_rate": 3.715829397778135e-06, + "loss": 0.8265574, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 4.139296054840088 + }, + { + "auxiliary_loss_clip": 0.01116466, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.04068506, + "balance_loss_mlp": 1.02634454, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 2.084029654580782, + "language_loss": 0.8418504, + "learning_rate": 3.715629262894028e-06, + "loss": 0.8634308, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.6606316566467285 + }, + { + "auxiliary_loss_clip": 0.01116493, + "auxiliary_loss_mlp": 0.01048343, + "balance_loss_clip": 1.04254019, + "balance_loss_mlp": 1.03203511, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 1.938401694619814, + "language_loss": 0.80175936, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82340771, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 2.597388744354248 + }, + { + "auxiliary_loss_clip": 0.01094256, + "auxiliary_loss_mlp": 0.01049678, + "balance_loss_clip": 1.0396924, + "balance_loss_mlp": 1.02988899, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 2.0888817726146627, + "language_loss": 0.80532229, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82676166, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 2.632211685180664 + }, + { + "auxiliary_loss_clip": 0.01119933, + "auxiliary_loss_mlp": 0.01050479, + "balance_loss_clip": 1.04158711, + "balance_loss_mlp": 1.03413582, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 1.801944023739764, + "language_loss": 0.77620065, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.79790485, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.6277501583099365 + }, + { + "auxiliary_loss_clip": 0.01119993, + "auxiliary_loss_mlp": 0.01044537, + "balance_loss_clip": 1.04418635, + "balance_loss_mlp": 1.02665544, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 4.530011653106325, + "language_loss": 0.81184435, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83348966, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 4.09277606010437 + }, + { + "auxiliary_loss_clip": 0.01093348, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.04159081, + "balance_loss_mlp": 1.02865481, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 2.5215988550085013, + "language_loss": 0.80929661, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83069968, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 2.5983471870422363 + }, + { + "auxiliary_loss_clip": 0.01119934, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.04291177, + "balance_loss_mlp": 1.02191722, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.01737328230027, + "language_loss": 0.89501727, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91660386, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 2.6008901596069336 + }, + { + "auxiliary_loss_clip": 0.01087141, + "auxiliary_loss_mlp": 0.0104635, + "balance_loss_clip": 1.04111421, + "balance_loss_mlp": 1.02702618, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.1985655526819774, + "language_loss": 0.62244296, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64377785, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.7378969192504883 + }, + { + "auxiliary_loss_clip": 0.01091651, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.04034543, + "balance_loss_mlp": 1.03446555, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 2.539484814827424, + "language_loss": 0.73644066, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75788265, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.671778440475464 + }, + { + "auxiliary_loss_clip": 0.01121819, + "auxiliary_loss_mlp": 0.01041382, + "balance_loss_clip": 1.04210675, + "balance_loss_mlp": 1.02468038, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6331276795038174, + "language_loss": 0.82448757, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84611958, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 4.141616106033325 + }, + { + "auxiliary_loss_clip": 0.0108544, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.04020905, + "balance_loss_mlp": 1.02758753, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.8182492464843216, + "language_loss": 0.77937913, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80069101, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 4.079297780990601 + }, + { + "auxiliary_loss_clip": 0.01099674, + "auxiliary_loss_mlp": 0.0104103, + "balance_loss_clip": 1.04185224, + "balance_loss_mlp": 1.02517557, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 2.5850514522222934, + "language_loss": 0.79408741, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81549442, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.6501095294952393 + }, + { + "auxiliary_loss_clip": 0.01098757, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.0501802, + "balance_loss_mlp": 1.0218823, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 3.2561216884781934, + "language_loss": 0.71371377, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.73509002, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 2.7166855335235596 + }, + { + "auxiliary_loss_clip": 0.01121334, + "auxiliary_loss_mlp": 0.01041078, + "balance_loss_clip": 1.04613936, + "balance_loss_mlp": 1.02448368, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.6757485390234368, + "language_loss": 0.78206289, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.80368698, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.5466747283935547 + }, + { + "auxiliary_loss_clip": 0.01102495, + "auxiliary_loss_mlp": 0.00750573, + "balance_loss_clip": 1.04007649, + "balance_loss_mlp": 1.00015688, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.330822269556139, + "language_loss": 0.86124599, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.8797766, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.6418330669403076 + }, + { + "auxiliary_loss_clip": 0.01098395, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.04624438, + "balance_loss_mlp": 1.03284097, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 3.0450778669429313, + "language_loss": 0.88015056, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90163726, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.6475107669830322 + }, + { + "auxiliary_loss_clip": 0.01096127, + "auxiliary_loss_mlp": 0.01052288, + "balance_loss_clip": 1.04213214, + "balance_loss_mlp": 1.03109264, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.0330431074329387, + "language_loss": 0.77829206, + "learning_rate": 3.712418262187102e-06, + "loss": 0.7997762, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.59535551071167 + }, + { + "auxiliary_loss_clip": 0.01110547, + "auxiliary_loss_mlp": 0.01047523, + "balance_loss_clip": 1.04520345, + "balance_loss_mlp": 1.02862811, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 3.0800949652508303, + "language_loss": 0.81131905, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83289981, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 2.5703306198120117 + }, + { + "auxiliary_loss_clip": 0.01108246, + "auxiliary_loss_mlp": 0.0105048, + "balance_loss_clip": 1.04035997, + "balance_loss_mlp": 1.03303993, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.6307820557198014, + "language_loss": 0.73058516, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75217235, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.5532305240631104 + }, + { + "auxiliary_loss_clip": 0.01111245, + "auxiliary_loss_mlp": 0.0104543, + "balance_loss_clip": 1.04316902, + "balance_loss_mlp": 1.02756047, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 1.6497880266259863, + "language_loss": 0.79620564, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81777239, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.634014368057251 + }, + { + "auxiliary_loss_clip": 0.010236, + "auxiliary_loss_mlp": 0.01006172, + "balance_loss_clip": 1.01472998, + "balance_loss_mlp": 1.00389481, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.8964017677270254, + "language_loss": 0.6036644, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62396216, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.1876142024993896 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.04390264, + "balance_loss_mlp": 1.02765441, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.1758483650583975, + "language_loss": 0.81523168, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83705777, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 2.618751049041748 + }, + { + "auxiliary_loss_clip": 0.01095376, + "auxiliary_loss_mlp": 0.00750735, + "balance_loss_clip": 1.04064715, + "balance_loss_mlp": 1.0001359, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.176882111772063, + "language_loss": 0.81313199, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.83159316, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 2.586224317550659 + }, + { + "auxiliary_loss_clip": 0.01114405, + "auxiliary_loss_mlp": 0.0104722, + "balance_loss_clip": 1.04342675, + "balance_loss_mlp": 1.02818251, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.2942793497577885, + "language_loss": 0.61390191, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63551819, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.5654165744781494 + }, + { + "auxiliary_loss_clip": 0.01111983, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.04311264, + "balance_loss_mlp": 1.02728295, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 1.8799034309719507, + "language_loss": 0.87062925, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89219081, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 2.638258934020996 + }, + { + "auxiliary_loss_clip": 0.01086255, + "auxiliary_loss_mlp": 0.01050631, + "balance_loss_clip": 1.0383718, + "balance_loss_mlp": 1.03433478, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 2.0263702475977907, + "language_loss": 0.8072772, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82864606, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 2.676352024078369 + }, + { + "auxiliary_loss_clip": 0.01100899, + "auxiliary_loss_mlp": 0.01047116, + "balance_loss_clip": 1.03904867, + "balance_loss_mlp": 1.02752948, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 1.833514621051824, + "language_loss": 0.6778428, + "learning_rate": 3.710402943207354e-06, + "loss": 0.69932294, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 2.6701183319091797 + }, + { + "auxiliary_loss_clip": 0.0112927, + "auxiliary_loss_mlp": 0.0104383, + "balance_loss_clip": 1.04445386, + "balance_loss_mlp": 1.0278796, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.725912199374012, + "language_loss": 0.81322104, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83495206, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 2.5125482082366943 + }, + { + "auxiliary_loss_clip": 0.01113646, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.04428196, + "balance_loss_mlp": 1.02474296, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 9.572630483372233, + "language_loss": 0.84932184, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87090123, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.5952019691467285 + }, + { + "auxiliary_loss_clip": 0.01011085, + "auxiliary_loss_mlp": 0.01004378, + "balance_loss_clip": 1.01273477, + "balance_loss_mlp": 1.00202966, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7668277536904156, + "language_loss": 0.53255033, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55270493, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 3.141986608505249 + }, + { + "auxiliary_loss_clip": 0.01073292, + "auxiliary_loss_mlp": 0.01069083, + "balance_loss_clip": 1.03471446, + "balance_loss_mlp": 1.04820895, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.5718397129464157, + "language_loss": 0.7350353, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.756459, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 2.6135761737823486 + }, + { + "auxiliary_loss_clip": 0.01095063, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.04109764, + "balance_loss_mlp": 1.02533484, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.24111685484387, + "language_loss": 0.87991989, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90128464, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.6226534843444824 + }, + { + "auxiliary_loss_clip": 0.01104152, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_clip": 1.04702342, + "balance_loss_mlp": 1.02839494, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.8529878742786587, + "language_loss": 0.73934394, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76083946, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 2.8230104446411133 + }, + { + "auxiliary_loss_clip": 0.01120082, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_clip": 1.04395103, + "balance_loss_mlp": 1.02708697, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 2.02118434220711, + "language_loss": 0.75075036, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.77239275, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.6602180004119873 + }, + { + "auxiliary_loss_clip": 0.01111183, + "auxiliary_loss_mlp": 0.01036068, + "balance_loss_clip": 1.04218698, + "balance_loss_mlp": 1.01965892, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.6158958377339485, + "language_loss": 0.8607378, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88221025, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.599161386489868 + }, + { + "auxiliary_loss_clip": 0.01104496, + "auxiliary_loss_mlp": 0.01038274, + "balance_loss_clip": 1.038131, + "balance_loss_mlp": 1.02191889, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.6841412681903072, + "language_loss": 0.68476021, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70618796, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 2.6199452877044678 + }, + { + "auxiliary_loss_clip": 0.01093232, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.03722239, + "balance_loss_mlp": 1.02476048, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.5768980240203796, + "language_loss": 0.76603574, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78737217, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 2.6111128330230713 + }, + { + "auxiliary_loss_clip": 0.01131988, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.04514158, + "balance_loss_mlp": 1.02705598, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 2.0529416102831495, + "language_loss": 0.75538152, + "learning_rate": 3.708178601452737e-06, + "loss": 0.77713084, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 2.5960469245910645 + }, + { + "auxiliary_loss_clip": 0.01080994, + "auxiliary_loss_mlp": 0.01039825, + "balance_loss_clip": 1.04029751, + "balance_loss_mlp": 1.02250385, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.5657360590360228, + "language_loss": 0.75814593, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7793541, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 2.64620304107666 + }, + { + "auxiliary_loss_clip": 0.01117944, + "auxiliary_loss_mlp": 0.01044528, + "balance_loss_clip": 1.04113686, + "balance_loss_mlp": 1.02729058, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.485388023760089, + "language_loss": 0.87759846, + "learning_rate": 3.707773333313917e-06, + "loss": 0.89922315, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 4.085539817810059 + }, + { + "auxiliary_loss_clip": 0.01125782, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.03936911, + "balance_loss_mlp": 1.01871312, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 2.0170833078107173, + "language_loss": 0.64124572, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66285455, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01085364, + "auxiliary_loss_mlp": 0.01039467, + "balance_loss_clip": 1.03596783, + "balance_loss_mlp": 1.02330232, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.160155872779794, + "language_loss": 0.7385655, + "learning_rate": 3.707367806139355e-06, + "loss": 0.75981379, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.5993263721466064 + }, + { + "auxiliary_loss_clip": 0.01117385, + "auxiliary_loss_mlp": 0.01039296, + "balance_loss_clip": 1.04098487, + "balance_loss_mlp": 1.0237031, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 1.7689630818247188, + "language_loss": 0.83764052, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.85920739, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.531954050064087 + }, + { + "auxiliary_loss_clip": 0.01121445, + "auxiliary_loss_mlp": 0.01039565, + "balance_loss_clip": 1.04434037, + "balance_loss_mlp": 1.02353144, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.2929077187263998, + "language_loss": 0.80769438, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.82930446, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 4.204597234725952 + }, + { + "auxiliary_loss_clip": 0.0108237, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.03394115, + "balance_loss_mlp": 1.02417517, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.4569716081363182, + "language_loss": 0.87332821, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.8945483, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 2.5646214485168457 + }, + { + "auxiliary_loss_clip": 0.01098736, + "auxiliary_loss_mlp": 0.0075047, + "balance_loss_clip": 1.04672253, + "balance_loss_mlp": 1.00018525, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5295713932512827, + "language_loss": 0.70837712, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.72686923, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 2.7487595081329346 + }, + { + "auxiliary_loss_clip": 0.00992166, + "auxiliary_loss_mlp": 0.01021052, + "balance_loss_clip": 1.0136292, + "balance_loss_mlp": 1.01900125, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8380277358367235, + "language_loss": 0.66312671, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68325889, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.3102786540985107 + }, + { + "auxiliary_loss_clip": 0.01121538, + "auxiliary_loss_mlp": 0.01044564, + "balance_loss_clip": 1.04086924, + "balance_loss_mlp": 1.02789831, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 1.9634800013901959, + "language_loss": 0.74227834, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76393938, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.625912666320801 + }, + { + "auxiliary_loss_clip": 0.01093562, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.03944898, + "balance_loss_mlp": 1.02310288, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.8320066978509704, + "language_loss": 0.78897452, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81029463, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.7498745918273926 + }, + { + "auxiliary_loss_clip": 0.01112479, + "auxiliary_loss_mlp": 0.01039536, + "balance_loss_clip": 1.04378366, + "balance_loss_mlp": 1.02175033, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0412235697147683, + "language_loss": 0.75769472, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77921486, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 4.339300155639648 + }, + { + "auxiliary_loss_clip": 0.01098888, + "auxiliary_loss_mlp": 0.01038591, + "balance_loss_clip": 1.04161942, + "balance_loss_mlp": 1.02246237, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 1.6951305792284757, + "language_loss": 0.80259883, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82397366, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 2.721064567565918 + }, + { + "auxiliary_loss_clip": 0.01007345, + "auxiliary_loss_mlp": 0.01000078, + "balance_loss_clip": 1.00958323, + "balance_loss_mlp": 0.99792081, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.8627098705311883, + "language_loss": 0.65160608, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67168033, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 4.41469407081604 + }, + { + "auxiliary_loss_clip": 0.01023427, + "auxiliary_loss_mlp": 0.01005383, + "balance_loss_clip": 1.01850438, + "balance_loss_mlp": 1.00312984, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7932315502133743, + "language_loss": 0.56918412, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.58947223, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.24200701713562 + }, + { + "auxiliary_loss_clip": 0.0110675, + "auxiliary_loss_mlp": 0.00750208, + "balance_loss_clip": 1.04122329, + "balance_loss_mlp": 1.00014567, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.8288067145517706, + "language_loss": 0.80699778, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82556731, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.5798304080963135 + }, + { + "auxiliary_loss_clip": 0.0110644, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.03652322, + "balance_loss_mlp": 1.02430081, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.681177311045145, + "language_loss": 0.54261267, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56409132, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.602579116821289 + }, + { + "auxiliary_loss_clip": 0.01104317, + "auxiliary_loss_mlp": 0.01040871, + "balance_loss_clip": 1.03932953, + "balance_loss_mlp": 1.02515936, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.633891639875862, + "language_loss": 0.8584975, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.87994945, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 2.610675096511841 + }, + { + "auxiliary_loss_clip": 0.01125637, + "auxiliary_loss_mlp": 0.01039515, + "balance_loss_clip": 1.04153895, + "balance_loss_mlp": 1.02419686, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 2.06820211193314, + "language_loss": 0.71624207, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.73789358, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.500899076461792 + }, + { + "auxiliary_loss_clip": 0.01104935, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.03911662, + "balance_loss_mlp": 1.02206159, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 2.0660264978392098, + "language_loss": 0.76610643, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78755558, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.552947521209717 + }, + { + "auxiliary_loss_clip": 0.0108977, + "auxiliary_loss_mlp": 0.01037379, + "balance_loss_clip": 1.0346756, + "balance_loss_mlp": 1.02286577, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 1.6619574090952678, + "language_loss": 0.69301009, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.71428156, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 2.8853065967559814 + }, + { + "auxiliary_loss_clip": 0.010725, + "auxiliary_loss_mlp": 0.01056774, + "balance_loss_clip": 1.03377938, + "balance_loss_mlp": 1.0362463, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.5962293044827895, + "language_loss": 0.81603229, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83732504, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.7003555297851562 + }, + { + "auxiliary_loss_clip": 0.01114618, + "auxiliary_loss_mlp": 0.01042703, + "balance_loss_clip": 1.04063213, + "balance_loss_mlp": 1.02683568, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.1997247774829787, + "language_loss": 0.76508939, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78666264, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.586819648742676 + }, + { + "auxiliary_loss_clip": 0.0106514, + "auxiliary_loss_mlp": 0.01054811, + "balance_loss_clip": 1.03349864, + "balance_loss_mlp": 1.03555822, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.7766793012606157, + "language_loss": 0.78874356, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.80994308, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 2.7873642444610596 + }, + { + "auxiliary_loss_clip": 0.01027587, + "auxiliary_loss_mlp": 0.01060761, + "balance_loss_clip": 1.00897479, + "balance_loss_mlp": 1.05879402, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9490896406787869, + "language_loss": 0.61980474, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64068818, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 3.0460751056671143 + }, + { + "auxiliary_loss_clip": 0.01093125, + "auxiliary_loss_mlp": 0.00750519, + "balance_loss_clip": 1.04084492, + "balance_loss_mlp": 1.00022769, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1622936418706318, + "language_loss": 0.81568277, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.8341192, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 2.788318634033203 + }, + { + "auxiliary_loss_clip": 0.0107548, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.03957582, + "balance_loss_mlp": 1.02838063, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 1.8861722574492337, + "language_loss": 0.74416333, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76537728, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 2.796689033508301 + }, + { + "auxiliary_loss_clip": 0.01128352, + "auxiliary_loss_mlp": 0.01054761, + "balance_loss_clip": 1.04856169, + "balance_loss_mlp": 1.0384773, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 2.784888010739719, + "language_loss": 0.7961635, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.81799471, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.6120810508728027 + }, + { + "auxiliary_loss_clip": 0.01094418, + "auxiliary_loss_mlp": 0.01043958, + "balance_loss_clip": 1.0412724, + "balance_loss_mlp": 1.02530205, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.989579921144257, + "language_loss": 0.77797484, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.79935861, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 2.719838857650757 + }, + { + "auxiliary_loss_clip": 0.0113352, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.04599512, + "balance_loss_mlp": 1.0225625, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 2.435328172935235, + "language_loss": 0.69168746, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71342456, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 2.638399362564087 + }, + { + "auxiliary_loss_clip": 0.01092449, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.04128838, + "balance_loss_mlp": 1.03235817, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 1.819791158731816, + "language_loss": 0.69257098, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71398401, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 2.822042942047119 + }, + { + "auxiliary_loss_clip": 0.01103327, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.05058849, + "balance_loss_mlp": 1.02223587, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.234536229394914, + "language_loss": 0.66614938, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68756473, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 2.836164951324463 + }, + { + "auxiliary_loss_clip": 0.01120242, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.04214215, + "balance_loss_mlp": 1.01961684, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.343558998030287, + "language_loss": 0.74167758, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76325202, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 2.6057722568511963 + }, + { + "auxiliary_loss_clip": 0.01085501, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_clip": 1.03998423, + "balance_loss_mlp": 1.0242579, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 1.8300616199198947, + "language_loss": 0.71621251, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73747462, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.731165647506714 + }, + { + "auxiliary_loss_clip": 0.01087726, + "auxiliary_loss_mlp": 0.01042579, + "balance_loss_clip": 1.0402292, + "balance_loss_mlp": 1.02674842, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.9768516788452313, + "language_loss": 0.72594649, + "learning_rate": 3.701049056727384e-06, + "loss": 0.7472496, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 2.7448301315307617 + }, + { + "auxiliary_loss_clip": 0.0108936, + "auxiliary_loss_mlp": 0.01043321, + "balance_loss_clip": 1.03767514, + "balance_loss_mlp": 1.0259161, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.93584234357114, + "language_loss": 0.80784798, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.82917482, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 2.7038841247558594 + }, + { + "auxiliary_loss_clip": 0.01128695, + "auxiliary_loss_mlp": 0.0103912, + "balance_loss_clip": 1.04115784, + "balance_loss_mlp": 1.02291369, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.262698985776644, + "language_loss": 0.83477777, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85645592, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 2.628894805908203 + }, + { + "auxiliary_loss_clip": 0.01070263, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.03689957, + "balance_loss_mlp": 1.02048957, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.6162890061638135, + "language_loss": 0.67728436, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.69834346, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.696432590484619 + }, + { + "auxiliary_loss_clip": 0.01086689, + "auxiliary_loss_mlp": 0.01042378, + "balance_loss_clip": 1.03664041, + "balance_loss_mlp": 1.02667832, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.2014949917817375, + "language_loss": 0.73567557, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75696623, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 2.6577162742614746 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01043915, + "balance_loss_clip": 1.04165137, + "balance_loss_mlp": 1.02904987, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.6244702573140135, + "language_loss": 0.86729479, + "learning_rate": 3.70002409219765e-06, + "loss": 0.88890159, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 2.5836281776428223 + }, + { + "auxiliary_loss_clip": 0.01076728, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.03822923, + "balance_loss_mlp": 1.0197978, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.761553620730627, + "language_loss": 0.70964849, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73078638, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 4.207524061203003 + }, + { + "auxiliary_loss_clip": 0.01095444, + "auxiliary_loss_mlp": 0.01043205, + "balance_loss_clip": 1.04057693, + "balance_loss_mlp": 1.02631295, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.6781460146141758, + "language_loss": 0.71198219, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73336864, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.6534264087677 + }, + { + "auxiliary_loss_clip": 0.01101291, + "auxiliary_loss_mlp": 0.01042405, + "balance_loss_clip": 1.04026306, + "balance_loss_mlp": 1.02284276, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 2.338409198438272, + "language_loss": 0.75968301, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78111994, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.619387149810791 + }, + { + "auxiliary_loss_clip": 0.01108016, + "auxiliary_loss_mlp": 0.0104424, + "balance_loss_clip": 1.04007697, + "balance_loss_mlp": 1.02638292, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.5411109470434459, + "language_loss": 0.80554831, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82707089, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.647158145904541 + }, + { + "auxiliary_loss_clip": 0.01118185, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.04108465, + "balance_loss_mlp": 1.01940298, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.7791924358509474, + "language_loss": 0.80182654, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82336801, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 2.669154167175293 + }, + { + "auxiliary_loss_clip": 0.01099589, + "auxiliary_loss_mlp": 0.01043922, + "balance_loss_clip": 1.03938365, + "balance_loss_mlp": 1.02781034, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 1.9365921863072733, + "language_loss": 0.8966428, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91807795, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 4.1286914348602295 + }, + { + "auxiliary_loss_clip": 0.01026416, + "auxiliary_loss_mlp": 0.00747786, + "balance_loss_clip": 1.01691437, + "balance_loss_mlp": 0.99978334, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8231874853690803, + "language_loss": 0.55896986, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57671183, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.113588333129883 + }, + { + "auxiliary_loss_clip": 0.01102712, + "auxiliary_loss_mlp": 0.00750393, + "balance_loss_clip": 1.0398438, + "balance_loss_mlp": 1.00013387, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.51919056058839, + "language_loss": 0.84136736, + "learning_rate": 3.698380797170751e-06, + "loss": 0.85989839, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.6312756538391113 + }, + { + "auxiliary_loss_clip": 0.01097491, + "auxiliary_loss_mlp": 0.01046802, + "balance_loss_clip": 1.03889561, + "balance_loss_mlp": 1.0254041, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.420391248844403, + "language_loss": 0.69624257, + "learning_rate": 3.698175095398085e-06, + "loss": 0.7176854, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 2.6313204765319824 + }, + { + "auxiliary_loss_clip": 0.01107827, + "auxiliary_loss_mlp": 0.01041325, + "balance_loss_clip": 1.04065633, + "balance_loss_mlp": 1.02358651, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.822861782546531, + "language_loss": 0.72048432, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.74197584, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.7133631706237793 + }, + { + "auxiliary_loss_clip": 0.01109252, + "auxiliary_loss_mlp": 0.01042918, + "balance_loss_clip": 1.03641701, + "balance_loss_mlp": 1.02811241, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 1.6800277567874917, + "language_loss": 0.83257616, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85409778, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 2.6518137454986572 + }, + { + "auxiliary_loss_clip": 0.01029455, + "auxiliary_loss_mlp": 0.01000649, + "balance_loss_clip": 1.01097941, + "balance_loss_mlp": 0.99850291, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.785608354231934, + "language_loss": 0.58915412, + "learning_rate": 3.697557603741482e-06, + "loss": 0.60945511, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 4.620852708816528 + }, + { + "auxiliary_loss_clip": 0.01073832, + "auxiliary_loss_mlp": 0.01043676, + "balance_loss_clip": 1.04225159, + "balance_loss_mlp": 1.02611661, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 1.9951383953430635, + "language_loss": 0.62398279, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64515787, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 4.2837748527526855 + }, + { + "auxiliary_loss_clip": 0.01094709, + "auxiliary_loss_mlp": 0.01068618, + "balance_loss_clip": 1.0420239, + "balance_loss_mlp": 1.05099857, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 7.535758291357901, + "language_loss": 0.75480574, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77643901, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.6706225872039795 + }, + { + "auxiliary_loss_clip": 0.01110448, + "auxiliary_loss_mlp": 0.0075045, + "balance_loss_clip": 1.03914952, + "balance_loss_mlp": 1.00019205, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.8086391542978646, + "language_loss": 0.76759923, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78620821, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.6285178661346436 + }, + { + "auxiliary_loss_clip": 0.0111657, + "auxiliary_loss_mlp": 0.01047361, + "balance_loss_clip": 1.04021549, + "balance_loss_mlp": 1.03193569, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 1.4639410959521897, + "language_loss": 0.7512244, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77286375, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 2.5971763134002686 + }, + { + "auxiliary_loss_clip": 0.01084136, + "auxiliary_loss_mlp": 0.010477, + "balance_loss_clip": 1.03933084, + "balance_loss_mlp": 1.03024793, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 1.984621094216667, + "language_loss": 0.71193647, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73325479, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.727745771408081 + }, + { + "auxiliary_loss_clip": 0.01083741, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_clip": 1.04113543, + "balance_loss_mlp": 1.034711, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.9459392782425184, + "language_loss": 0.85782546, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87917113, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.6626522541046143 + }, + { + "auxiliary_loss_clip": 0.01088479, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_clip": 1.03808784, + "balance_loss_mlp": 1.02626133, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.537560933516592, + "language_loss": 0.69859052, + "learning_rate": 3.696114537236335e-06, + "loss": 0.71990132, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 2.7370452880859375 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01048211, + "balance_loss_clip": 1.0401485, + "balance_loss_mlp": 1.02920949, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 1.9350921762061664, + "language_loss": 0.68450743, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70616502, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.6871087551116943 + }, + { + "auxiliary_loss_clip": 0.01098693, + "auxiliary_loss_mlp": 0.01048413, + "balance_loss_clip": 1.04318738, + "balance_loss_mlp": 1.03148556, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 2.0788924635586894, + "language_loss": 0.77293193, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79440296, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.683777332305908 + }, + { + "auxiliary_loss_clip": 0.01107458, + "auxiliary_loss_mlp": 0.01053857, + "balance_loss_clip": 1.03857017, + "balance_loss_mlp": 1.03659558, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 2.4498044876759137, + "language_loss": 0.64434415, + "learning_rate": 3.695495115253795e-06, + "loss": 0.66595733, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.6683413982391357 + }, + { + "auxiliary_loss_clip": 0.0102887, + "auxiliary_loss_mlp": 0.01003504, + "balance_loss_clip": 1.01070917, + "balance_loss_mlp": 1.00145376, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6800632009008981, + "language_loss": 0.58162928, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60195303, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 3.2527620792388916 + }, + { + "auxiliary_loss_clip": 0.01091847, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.03720427, + "balance_loss_mlp": 1.02183485, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.7528583591160396, + "language_loss": 0.91489547, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.93620265, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 2.685203790664673 + }, + { + "auxiliary_loss_clip": 0.01109191, + "auxiliary_loss_mlp": 0.01051684, + "balance_loss_clip": 1.03942204, + "balance_loss_mlp": 1.0324074, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.7533548258020313, + "language_loss": 0.78630781, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80791652, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 2.655963897705078 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01044939, + "balance_loss_clip": 1.03585577, + "balance_loss_mlp": 1.02598464, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.8371162841823026, + "language_loss": 0.71785212, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73894113, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 2.914680004119873 + }, + { + "auxiliary_loss_clip": 0.01018597, + "auxiliary_loss_mlp": 0.0100066, + "balance_loss_clip": 1.01008081, + "balance_loss_mlp": 0.99855042, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9808803328081163, + "language_loss": 0.62534583, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64553833, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.1415600776672363 + }, + { + "auxiliary_loss_clip": 0.01127444, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.04255176, + "balance_loss_mlp": 1.0282433, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.5455833778563308, + "language_loss": 0.82610089, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84782249, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 2.5233888626098633 + }, + { + "auxiliary_loss_clip": 0.0111638, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.04147375, + "balance_loss_mlp": 1.01750886, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 1.9890533985146281, + "language_loss": 0.82053864, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.84205359, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.615246057510376 + }, + { + "auxiliary_loss_clip": 0.01101852, + "auxiliary_loss_mlp": 0.01045676, + "balance_loss_clip": 1.04237866, + "balance_loss_mlp": 1.02852154, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 1.8364050930581381, + "language_loss": 0.77430356, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79577887, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 2.61084246635437 + }, + { + "auxiliary_loss_clip": 0.01064705, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.04025304, + "balance_loss_mlp": 1.02212846, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.682850228662642, + "language_loss": 0.79965127, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.82072604, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 2.6995060443878174 + }, + { + "auxiliary_loss_clip": 0.01110535, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.03960049, + "balance_loss_mlp": 1.02029037, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.656618670137134, + "language_loss": 0.86756736, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.88903093, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 2.7250702381134033 + }, + { + "auxiliary_loss_clip": 0.01131029, + "auxiliary_loss_mlp": 0.01044836, + "balance_loss_clip": 1.04533458, + "balance_loss_mlp": 1.02727675, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 2.026845297423963, + "language_loss": 0.75188911, + "learning_rate": 3.693218952340186e-06, + "loss": 0.77364773, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.636561155319214 + }, + { + "auxiliary_loss_clip": 0.01088745, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.03466141, + "balance_loss_mlp": 1.02434206, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.6154352099102804, + "language_loss": 0.7909525, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81226164, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 2.5975353717803955 + }, + { + "auxiliary_loss_clip": 0.0109061, + "auxiliary_loss_mlp": 0.00750732, + "balance_loss_clip": 1.03848791, + "balance_loss_mlp": 1.00014615, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.694179403789869, + "language_loss": 0.80325544, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82166886, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 2.6814701557159424 + }, + { + "auxiliary_loss_clip": 0.01072041, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.03473914, + "balance_loss_mlp": 1.02110195, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 1.7861073711340276, + "language_loss": 0.74489301, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76600504, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 2.7207415103912354 + }, + { + "auxiliary_loss_clip": 0.01122922, + "auxiliary_loss_mlp": 0.01042167, + "balance_loss_clip": 1.04281664, + "balance_loss_mlp": 1.02340376, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.0748806631709953, + "language_loss": 0.76585847, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78750938, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 2.640056610107422 + }, + { + "auxiliary_loss_clip": 0.0108523, + "auxiliary_loss_mlp": 0.01052723, + "balance_loss_clip": 1.03962612, + "balance_loss_mlp": 1.03486562, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.8151372210425905, + "language_loss": 0.68493962, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70631909, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.722614049911499 + }, + { + "auxiliary_loss_clip": 0.0107876, + "auxiliary_loss_mlp": 0.01055006, + "balance_loss_clip": 1.03825653, + "balance_loss_mlp": 1.03680313, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 2.028710025133076, + "language_loss": 0.80836272, + "learning_rate": 3.691974133706947e-06, + "loss": 0.82970041, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.756019115447998 + }, + { + "auxiliary_loss_clip": 0.01100316, + "auxiliary_loss_mlp": 0.01037853, + "balance_loss_clip": 1.04194629, + "balance_loss_mlp": 1.02122331, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.086054400420601, + "language_loss": 0.79821765, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.81959939, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 2.5820388793945312 + }, + { + "auxiliary_loss_clip": 0.0112918, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.04238999, + "balance_loss_mlp": 1.02173424, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.6285810036993058, + "language_loss": 0.72083551, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74252129, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 4.0998899936676025 + }, + { + "auxiliary_loss_clip": 0.01115103, + "auxiliary_loss_mlp": 0.01045056, + "balance_loss_clip": 1.04155719, + "balance_loss_mlp": 1.0288552, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.8773977724097946, + "language_loss": 0.87246668, + "learning_rate": 3.691350858126404e-06, + "loss": 0.8940683, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.584667444229126 + }, + { + "auxiliary_loss_clip": 0.01093901, + "auxiliary_loss_mlp": 0.01044311, + "balance_loss_clip": 1.03748631, + "balance_loss_mlp": 1.02548766, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 2.1678550921114312, + "language_loss": 0.70969349, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73107564, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 2.6365301609039307 + }, + { + "auxiliary_loss_clip": 0.01095451, + "auxiliary_loss_mlp": 0.01044492, + "balance_loss_clip": 1.04139686, + "balance_loss_mlp": 1.02817202, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.11434583380033, + "language_loss": 0.86126691, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88266629, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 2.6785733699798584 + }, + { + "auxiliary_loss_clip": 0.01115985, + "auxiliary_loss_mlp": 0.01044451, + "balance_loss_clip": 1.03921652, + "balance_loss_mlp": 1.02815485, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.4911890453666792, + "language_loss": 0.80997157, + "learning_rate": 3.69072700532013e-06, + "loss": 0.83157593, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 2.6184635162353516 + }, + { + "auxiliary_loss_clip": 0.01089343, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.03396225, + "balance_loss_mlp": 1.02033281, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.641481256606824, + "language_loss": 0.8584097, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.87966323, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 4.115063667297363 + }, + { + "auxiliary_loss_clip": 0.01112598, + "auxiliary_loss_mlp": 0.01039662, + "balance_loss_clip": 1.04203939, + "balance_loss_mlp": 1.02425992, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.080639844931294, + "language_loss": 0.84318697, + "learning_rate": 3.69031078287345e-06, + "loss": 0.86470962, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 2.5625884532928467 + }, + { + "auxiliary_loss_clip": 0.0111536, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.03927684, + "balance_loss_mlp": 1.01544833, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.178605103719431, + "language_loss": 0.83825678, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85973167, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 2.5530712604522705 + }, + { + "auxiliary_loss_clip": 0.01084059, + "auxiliary_loss_mlp": 0.01037421, + "balance_loss_clip": 1.03604853, + "balance_loss_mlp": 1.02046943, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 3.00359052140992, + "language_loss": 0.77088296, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79209775, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.718071937561035 + }, + { + "auxiliary_loss_clip": 0.01101287, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.03911161, + "balance_loss_mlp": 1.02497673, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.627738440382253, + "language_loss": 0.87297726, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89439011, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 2.624823570251465 + }, + { + "auxiliary_loss_clip": 0.01094332, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_clip": 1.0404017, + "balance_loss_mlp": 1.02746606, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.003474755506645, + "language_loss": 0.78051651, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8019017, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 4.131056070327759 + }, + { + "auxiliary_loss_clip": 0.01113704, + "auxiliary_loss_mlp": 0.01039015, + "balance_loss_clip": 1.03860092, + "balance_loss_mlp": 1.02296984, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 2.0295391517701162, + "language_loss": 0.76362503, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78515226, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 4.096084356307983 + }, + { + "auxiliary_loss_clip": 0.01092462, + "auxiliary_loss_mlp": 0.00750257, + "balance_loss_clip": 1.0395422, + "balance_loss_mlp": 1.00016809, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.607527698748718, + "language_loss": 0.79695427, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81538147, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.7413954734802246 + }, + { + "auxiliary_loss_clip": 0.01102689, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.03683853, + "balance_loss_mlp": 1.01946306, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.6221499206651977, + "language_loss": 0.69613242, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71752322, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 2.778585433959961 + }, + { + "auxiliary_loss_clip": 0.01091087, + "auxiliary_loss_mlp": 0.01041165, + "balance_loss_clip": 1.03804159, + "balance_loss_mlp": 1.02422571, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 1.8016283259855224, + "language_loss": 0.80977416, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83109665, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.6446242332458496 + }, + { + "auxiliary_loss_clip": 0.01112338, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.03875208, + "balance_loss_mlp": 1.02085519, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 2.920094322007067, + "language_loss": 0.8304373, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85192978, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.5858750343322754 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.03571081, + "balance_loss_mlp": 1.02583265, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.6983516874722193, + "language_loss": 0.8585937, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88008094, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.668668270111084 + }, + { + "auxiliary_loss_clip": 0.01088546, + "auxiliary_loss_mlp": 0.01039343, + "balance_loss_clip": 1.03806114, + "balance_loss_mlp": 1.02349997, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.036255639417239, + "language_loss": 0.84529281, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86657172, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 2.6214067935943604 + }, + { + "auxiliary_loss_clip": 0.01123634, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.0412004, + "balance_loss_mlp": 1.01887643, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0802582884659366, + "language_loss": 0.67766601, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.6992408, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.562832832336426 + }, + { + "auxiliary_loss_clip": 0.01121019, + "auxiliary_loss_mlp": 0.01039957, + "balance_loss_clip": 1.03896999, + "balance_loss_mlp": 1.02456737, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.1583241939092317, + "language_loss": 0.84273189, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86434168, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.5731301307678223 + }, + { + "auxiliary_loss_clip": 0.01129043, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.04343367, + "balance_loss_mlp": 1.02010381, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.0816162476710436, + "language_loss": 0.63823992, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.65988779, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.558980941772461 + }, + { + "auxiliary_loss_clip": 0.011098, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.03806317, + "balance_loss_mlp": 1.01919532, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.3902698621934413, + "language_loss": 0.80147088, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82291526, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 2.5962750911712646 + }, + { + "auxiliary_loss_clip": 0.01073267, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.04062629, + "balance_loss_mlp": 1.01829982, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 5.5173657567740895, + "language_loss": 0.7625671, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78363574, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.7504818439483643 + }, + { + "auxiliary_loss_clip": 0.01104907, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.0408783, + "balance_loss_mlp": 1.01694953, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 1.8716414259881418, + "language_loss": 0.73454249, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75591332, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 2.591526746749878 + }, + { + "auxiliary_loss_clip": 0.01094825, + "auxiliary_loss_mlp": 0.01048455, + "balance_loss_clip": 1.0356878, + "balance_loss_mlp": 1.03143215, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.0112665568271617, + "language_loss": 0.77729124, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.79872406, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 2.6564090251922607 + }, + { + "auxiliary_loss_clip": 0.0107605, + "auxiliary_loss_mlp": 0.01040172, + "balance_loss_clip": 1.03677726, + "balance_loss_mlp": 1.02350605, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.8734426636867711, + "language_loss": 0.85087699, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.8720392, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.65069580078125 + }, + { + "auxiliary_loss_clip": 0.01112918, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.03877306, + "balance_loss_mlp": 1.0195992, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 2.0301052139294735, + "language_loss": 0.80687815, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.8283571, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.647967576980591 + }, + { + "auxiliary_loss_clip": 0.01054928, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.0343219, + "balance_loss_mlp": 1.02081549, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.9231378162859407, + "language_loss": 0.7253378, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.74624002, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 2.7657270431518555 + }, + { + "auxiliary_loss_clip": 0.01113982, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.03949594, + "balance_loss_mlp": 1.02335155, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.0731741111673627, + "language_loss": 0.78805077, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80958223, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 2.6386122703552246 + }, + { + "auxiliary_loss_clip": 0.01114389, + "auxiliary_loss_mlp": 0.01040299, + "balance_loss_clip": 1.03912377, + "balance_loss_mlp": 1.02385998, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.087841316463662, + "language_loss": 0.87349921, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89504611, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 2.5873239040374756 + }, + { + "auxiliary_loss_clip": 0.01111484, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.04468536, + "balance_loss_mlp": 1.02057946, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.163959016885338, + "language_loss": 0.62093985, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64242113, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.6601340770721436 + }, + { + "auxiliary_loss_clip": 0.01109956, + "auxiliary_loss_mlp": 0.01044281, + "balance_loss_clip": 1.04290056, + "balance_loss_mlp": 1.02660239, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.401464766475683, + "language_loss": 0.86394966, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88549197, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 2.6258208751678467 + }, + { + "auxiliary_loss_clip": 0.01071431, + "auxiliary_loss_mlp": 0.00750763, + "balance_loss_clip": 1.03386998, + "balance_loss_mlp": 1.00021327, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 4.790390389863736, + "language_loss": 0.70964527, + "learning_rate": 3.684876582881668e-06, + "loss": 0.72786725, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 2.778926372528076 + }, + { + "auxiliary_loss_clip": 0.01121578, + "auxiliary_loss_mlp": 0.01036804, + "balance_loss_clip": 1.03811622, + "balance_loss_mlp": 1.02077055, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 1.9984265852738259, + "language_loss": 0.7102226, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.7318064, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 2.5796539783477783 + }, + { + "auxiliary_loss_clip": 0.01027581, + "auxiliary_loss_mlp": 0.01026762, + "balance_loss_clip": 1.01011443, + "balance_loss_mlp": 1.02494991, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7498471259607177, + "language_loss": 0.5553937, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57593715, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 3.1846237182617188 + }, + { + "auxiliary_loss_clip": 0.01079807, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.03919315, + "balance_loss_mlp": 1.02015281, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.7785430060722, + "language_loss": 0.71441764, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73557305, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 2.7762610912323 + }, + { + "auxiliary_loss_clip": 0.01092661, + "auxiliary_loss_mlp": 0.00750446, + "balance_loss_clip": 1.040501, + "balance_loss_mlp": 1.00027084, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.5217739665392627, + "language_loss": 0.75423503, + "learning_rate": 3.684036715178351e-06, + "loss": 0.7726661, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.672060489654541 + }, + { + "auxiliary_loss_clip": 0.01075819, + "auxiliary_loss_mlp": 0.010533, + "balance_loss_clip": 1.03978717, + "balance_loss_mlp": 1.03711152, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.8636020754919096, + "language_loss": 0.87931508, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90060627, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.7113356590270996 + }, + { + "auxiliary_loss_clip": 0.01112395, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.0417695, + "balance_loss_mlp": 1.02498114, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.6138413710930486, + "language_loss": 0.77012455, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.79164863, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.6640303134918213 + }, + { + "auxiliary_loss_clip": 0.01128189, + "auxiliary_loss_mlp": 0.01041197, + "balance_loss_clip": 1.04271817, + "balance_loss_mlp": 1.02460349, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.532538372177304, + "language_loss": 0.73909271, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76078665, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 4.11780309677124 + }, + { + "auxiliary_loss_clip": 0.01099931, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.0359261, + "balance_loss_mlp": 1.02490926, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 1.8237275263761454, + "language_loss": 0.73529297, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75670695, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.694748878479004 + }, + { + "auxiliary_loss_clip": 0.01108031, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.04133904, + "balance_loss_mlp": 1.02634346, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.7554303485182536, + "language_loss": 0.85039729, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87190861, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 2.592085361480713 + }, + { + "auxiliary_loss_clip": 0.01054515, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_clip": 1.03549969, + "balance_loss_mlp": 1.03038096, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.6644071246861016, + "language_loss": 0.69211882, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.71313751, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 2.7206966876983643 + }, + { + "auxiliary_loss_clip": 0.01001203, + "auxiliary_loss_mlp": 0.01002005, + "balance_loss_clip": 1.01235199, + "balance_loss_mlp": 0.9999432, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8094112874128373, + "language_loss": 0.60218883, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62222099, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.372553586959839 + }, + { + "auxiliary_loss_clip": 0.01115166, + "auxiliary_loss_mlp": 0.0103921, + "balance_loss_clip": 1.04279387, + "balance_loss_mlp": 1.02376091, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.5965745885019529, + "language_loss": 0.7267673, + "learning_rate": 3.682353915057679e-06, + "loss": 0.7483111, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 2.575315237045288 + }, + { + "auxiliary_loss_clip": 0.01063011, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.03505969, + "balance_loss_mlp": 1.02654862, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.8359262165165262, + "language_loss": 0.8721056, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.89318061, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 4.237945795059204 + }, + { + "auxiliary_loss_clip": 0.0111859, + "auxiliary_loss_mlp": 0.01036917, + "balance_loss_clip": 1.04034078, + "balance_loss_mlp": 1.02181339, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.7595851281795216, + "language_loss": 0.68964493, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.7112, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 2.612457036972046 + }, + { + "auxiliary_loss_clip": 0.01092405, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.03835702, + "balance_loss_mlp": 1.01974559, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7698780582756326, + "language_loss": 0.89595675, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91724485, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 2.64357328414917 + }, + { + "auxiliary_loss_clip": 0.01085269, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.03808284, + "balance_loss_mlp": 1.01997519, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 2.4236568768954627, + "language_loss": 0.7644701, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78569424, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.7204065322875977 + }, + { + "auxiliary_loss_clip": 0.01111371, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.03762674, + "balance_loss_mlp": 1.02007127, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 1.952678813180083, + "language_loss": 0.77352208, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79498869, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 2.5687954425811768 + }, + { + "auxiliary_loss_clip": 0.01029166, + "auxiliary_loss_mlp": 0.01003953, + "balance_loss_clip": 1.00984454, + "balance_loss_mlp": 1.00191498, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.834422987649088, + "language_loss": 0.67122698, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69155812, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 4.575462579727173 + }, + { + "auxiliary_loss_clip": 0.01115773, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.03921819, + "balance_loss_mlp": 1.01933861, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 1.9836143056170594, + "language_loss": 0.84337598, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86488545, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 2.5762174129486084 + }, + { + "auxiliary_loss_clip": 0.01114669, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.04039967, + "balance_loss_mlp": 1.02244151, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 2.0712360790823316, + "language_loss": 0.84804046, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.86955947, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 4.083492755889893 + }, + { + "auxiliary_loss_clip": 0.01077042, + "auxiliary_loss_mlp": 0.01041806, + "balance_loss_clip": 1.03903508, + "balance_loss_mlp": 1.02548647, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.7830054877888473, + "language_loss": 0.8589803, + "learning_rate": 3.680455884806959e-06, + "loss": 0.8801688, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 2.763984441757202 + }, + { + "auxiliary_loss_clip": 0.01045412, + "auxiliary_loss_mlp": 0.01043691, + "balance_loss_clip": 1.03735447, + "balance_loss_mlp": 1.02654839, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 2.1025676471974153, + "language_loss": 0.72767037, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.74856138, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 2.821634292602539 + }, + { + "auxiliary_loss_clip": 0.01096638, + "auxiliary_loss_mlp": 0.00750318, + "balance_loss_clip": 1.03821015, + "balance_loss_mlp": 1.00030971, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.94196778248747, + "language_loss": 0.85284877, + "learning_rate": 3.680033399147797e-06, + "loss": 0.8713184, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 2.6432230472564697 + }, + { + "auxiliary_loss_clip": 0.01001862, + "auxiliary_loss_mlp": 0.009994, + "balance_loss_clip": 1.01370788, + "balance_loss_mlp": 0.99744493, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.698897769835842, + "language_loss": 0.57148051, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59149313, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.228987216949463 + }, + { + "auxiliary_loss_clip": 0.01123432, + "auxiliary_loss_mlp": 0.00750422, + "balance_loss_clip": 1.03999829, + "balance_loss_mlp": 1.00035357, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 2.9487130903799956, + "language_loss": 0.78323853, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80197704, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.6204516887664795 + }, + { + "auxiliary_loss_clip": 0.01122173, + "auxiliary_loss_mlp": 0.01042756, + "balance_loss_clip": 1.0404551, + "balance_loss_mlp": 1.02390885, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.407419532893214, + "language_loss": 0.62670177, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64835107, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.6813230514526367 + }, + { + "auxiliary_loss_clip": 0.01070649, + "auxiliary_loss_mlp": 0.01049561, + "balance_loss_clip": 1.03593004, + "balance_loss_mlp": 1.03162026, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.6887864640293369, + "language_loss": 0.85868967, + "learning_rate": 3.679187663409184e-06, + "loss": 0.87989175, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.7202341556549072 + }, + { + "auxiliary_loss_clip": 0.01100869, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.03648698, + "balance_loss_mlp": 1.02656257, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 2.0644350639088285, + "language_loss": 0.75641221, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77786541, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 2.6743218898773193 + }, + { + "auxiliary_loss_clip": 0.01111195, + "auxiliary_loss_mlp": 0.0104225, + "balance_loss_clip": 1.03956771, + "balance_loss_mlp": 1.02461863, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.9186914815165952, + "language_loss": 0.76343435, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78496879, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 2.6132583618164062 + }, + { + "auxiliary_loss_clip": 0.01097016, + "auxiliary_loss_mlp": 0.01046308, + "balance_loss_clip": 1.03812861, + "balance_loss_mlp": 1.02970195, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.4976966702913044, + "language_loss": 0.81887734, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84031057, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.7396161556243896 + }, + { + "auxiliary_loss_clip": 0.01037498, + "auxiliary_loss_mlp": 0.01001414, + "balance_loss_clip": 1.00875616, + "balance_loss_mlp": 0.99948311, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.8015107587294535, + "language_loss": 0.56654251, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58693159, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 3.003572463989258 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.00750592, + "balance_loss_clip": 1.03968823, + "balance_loss_mlp": 1.0002923, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 2.7827892269988648, + "language_loss": 0.88193786, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90041482, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.701530933380127 + }, + { + "auxiliary_loss_clip": 0.01119314, + "auxiliary_loss_mlp": 0.01042939, + "balance_loss_clip": 1.04300821, + "balance_loss_mlp": 1.02554655, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.5228172564950746, + "language_loss": 0.80059612, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82221866, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 2.6177923679351807 + }, + { + "auxiliary_loss_clip": 0.01072503, + "auxiliary_loss_mlp": 0.00750559, + "balance_loss_clip": 1.03272641, + "balance_loss_mlp": 1.00031364, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 2.5831410450473147, + "language_loss": 0.76938629, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78761685, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 2.7125587463378906 + }, + { + "auxiliary_loss_clip": 0.0108532, + "auxiliary_loss_mlp": 0.01043121, + "balance_loss_clip": 1.03539073, + "balance_loss_mlp": 1.0269686, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.6330351298950416, + "language_loss": 0.80374002, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82502443, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.653411388397217 + }, + { + "auxiliary_loss_clip": 0.01083469, + "auxiliary_loss_mlp": 0.00750654, + "balance_loss_clip": 1.04010379, + "balance_loss_mlp": 1.00041807, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5688233208937172, + "language_loss": 0.77738488, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.79572606, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 2.7528486251831055 + }, + { + "auxiliary_loss_clip": 0.01052174, + "auxiliary_loss_mlp": 0.01048135, + "balance_loss_clip": 1.0337975, + "balance_loss_mlp": 1.02838206, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 1.8071354984605026, + "language_loss": 0.83548796, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85649109, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 2.7713987827301025 + }, + { + "auxiliary_loss_clip": 0.0111342, + "auxiliary_loss_mlp": 0.00750505, + "balance_loss_clip": 1.03921485, + "balance_loss_mlp": 1.00046825, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.991968367279679, + "language_loss": 0.75995755, + "learning_rate": 3.676856638489272e-06, + "loss": 0.77859682, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 2.6334269046783447 + }, + { + "auxiliary_loss_clip": 0.01063667, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.03559375, + "balance_loss_mlp": 1.01989341, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.16869567733261, + "language_loss": 0.77782595, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79883003, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 2.7483503818511963 + }, + { + "auxiliary_loss_clip": 0.01066021, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_clip": 1.03802156, + "balance_loss_mlp": 1.0246439, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 1.8197822317471333, + "language_loss": 0.75903332, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.78011537, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.783855438232422 + }, + { + "auxiliary_loss_clip": 0.01096158, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.0350678, + "balance_loss_mlp": 1.01910555, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 3.6666696115762023, + "language_loss": 0.88015532, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90148473, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.641824245452881 + }, + { + "auxiliary_loss_clip": 0.00998854, + "auxiliary_loss_mlp": 0.00747639, + "balance_loss_clip": 1.00889587, + "balance_loss_mlp": 0.99969858, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.8106576054871029, + "language_loss": 0.59022868, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.60769361, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 3.329336643218994 + }, + { + "auxiliary_loss_clip": 0.01106979, + "auxiliary_loss_mlp": 0.01046482, + "balance_loss_clip": 1.03727341, + "balance_loss_mlp": 1.02815962, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.518071463743213, + "language_loss": 0.66139936, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68293399, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.6164722442626953 + }, + { + "auxiliary_loss_clip": 0.01098949, + "auxiliary_loss_mlp": 0.01044486, + "balance_loss_clip": 1.03964543, + "balance_loss_mlp": 1.02547264, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.185312540617892, + "language_loss": 0.83803052, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85946482, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.6633963584899902 + }, + { + "auxiliary_loss_clip": 0.01073861, + "auxiliary_loss_mlp": 0.01039007, + "balance_loss_clip": 1.03580391, + "balance_loss_mlp": 1.0217452, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.1885687792572774, + "language_loss": 0.81975031, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84087896, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.6587114334106445 + }, + { + "auxiliary_loss_clip": 0.01119661, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.04494119, + "balance_loss_mlp": 1.02384675, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.7721168818435915, + "language_loss": 0.81998992, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84157264, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.60198974609375 + }, + { + "auxiliary_loss_clip": 0.01125939, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.04294765, + "balance_loss_mlp": 1.02253842, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 2.1280137687229503, + "language_loss": 0.81793118, + "learning_rate": 3.674943713009518e-06, + "loss": 0.83957595, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 4.058325529098511 + }, + { + "auxiliary_loss_clip": 0.0112197, + "auxiliary_loss_mlp": 0.01050266, + "balance_loss_clip": 1.04288125, + "balance_loss_mlp": 1.03045344, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.8659511925176504, + "language_loss": 0.89936554, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92108786, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.628497838973999 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.04775286, + "balance_loss_mlp": 1.0269171, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 2.5894994304665855, + "language_loss": 0.76820844, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78972709, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 2.7503182888031006 + }, + { + "auxiliary_loss_clip": 0.01109997, + "auxiliary_loss_mlp": 0.01040132, + "balance_loss_clip": 1.04382479, + "balance_loss_mlp": 1.02285874, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.5921037005750973, + "language_loss": 0.760885, + "learning_rate": 3.674304927640011e-06, + "loss": 0.7823863, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.6375732421875 + }, + { + "auxiliary_loss_clip": 0.01098172, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.03741097, + "balance_loss_mlp": 1.02713132, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.6360929684998773, + "language_loss": 0.75949669, + "learning_rate": 3.67409187219312e-06, + "loss": 0.78093261, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 2.6757900714874268 + }, + { + "auxiliary_loss_clip": 0.01116688, + "auxiliary_loss_mlp": 0.01038566, + "balance_loss_clip": 1.04159296, + "balance_loss_mlp": 1.02203202, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.8857582160055173, + "language_loss": 0.84296489, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86451745, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 2.5042552947998047 + }, + { + "auxiliary_loss_clip": 0.01012752, + "auxiliary_loss_mlp": 0.01021634, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.01940525, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.891729631565096, + "language_loss": 0.63757467, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65791857, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 4.400430679321289 + }, + { + "auxiliary_loss_clip": 0.01110071, + "auxiliary_loss_mlp": 0.01043119, + "balance_loss_clip": 1.04127407, + "balance_loss_mlp": 1.0251894, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.9015526011409003, + "language_loss": 0.69868934, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72022128, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 2.906733274459839 + }, + { + "auxiliary_loss_clip": 0.01133868, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.0452534, + "balance_loss_mlp": 1.02808499, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.5541263842243733, + "language_loss": 0.70024347, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72203577, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.6050779819488525 + }, + { + "auxiliary_loss_clip": 0.01108091, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.04187155, + "balance_loss_mlp": 1.0216608, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 1.6693462872002347, + "language_loss": 0.89210773, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91357815, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.6137189865112305 + }, + { + "auxiliary_loss_clip": 0.01067198, + "auxiliary_loss_mlp": 0.01043779, + "balance_loss_clip": 1.03562438, + "balance_loss_mlp": 1.02599299, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.2006101346075457, + "language_loss": 0.6765281, + "learning_rate": 3.672812206678344e-06, + "loss": 0.69763786, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 2.681880235671997 + }, + { + "auxiliary_loss_clip": 0.01066394, + "auxiliary_loss_mlp": 0.0104482, + "balance_loss_clip": 1.03248203, + "balance_loss_mlp": 1.02608025, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 1.9943391924512688, + "language_loss": 0.844504, + "learning_rate": 3.672598707029127e-06, + "loss": 0.8656162, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 4.038580656051636 + }, + { + "auxiliary_loss_clip": 0.01086862, + "auxiliary_loss_mlp": 0.01059117, + "balance_loss_clip": 1.03920054, + "balance_loss_mlp": 1.03870833, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.474765906358119, + "language_loss": 0.74604142, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76750118, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.6160967350006104 + }, + { + "auxiliary_loss_clip": 0.01083484, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.03580236, + "balance_loss_mlp": 1.02356911, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.021203368116578, + "language_loss": 0.75806904, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77929193, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 4.097612380981445 + }, + { + "auxiliary_loss_clip": 0.01063845, + "auxiliary_loss_mlp": 0.01044562, + "balance_loss_clip": 1.03585744, + "balance_loss_mlp": 1.02682376, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 2.0373935979784554, + "language_loss": 0.85087526, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87195933, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.6771340370178223 + }, + { + "auxiliary_loss_clip": 0.01076721, + "auxiliary_loss_mlp": 0.01042634, + "balance_loss_clip": 1.04097259, + "balance_loss_mlp": 1.02499056, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 1.9825833698659974, + "language_loss": 0.7076695, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.72886306, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 2.7684836387634277 + }, + { + "auxiliary_loss_clip": 0.01105699, + "auxiliary_loss_mlp": 0.01047989, + "balance_loss_clip": 1.04071152, + "balance_loss_mlp": 1.03048944, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.8686082612421866, + "language_loss": 0.74978894, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77132583, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 2.613276720046997 + }, + { + "auxiliary_loss_clip": 0.01090935, + "auxiliary_loss_mlp": 0.01042618, + "balance_loss_clip": 1.04035389, + "balance_loss_mlp": 1.02447474, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.6724509072998182, + "language_loss": 0.70212114, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.72345662, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.6684534549713135 + }, + { + "auxiliary_loss_clip": 0.01058119, + "auxiliary_loss_mlp": 0.00750727, + "balance_loss_clip": 1.03615916, + "balance_loss_mlp": 1.00041914, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 2.098850898247411, + "language_loss": 0.83137906, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.84946746, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 2.6941537857055664 + }, + { + "auxiliary_loss_clip": 0.01114155, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.03914762, + "balance_loss_mlp": 1.03015971, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 1.82478788174612, + "language_loss": 0.87092799, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89253461, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 2.6599719524383545 + }, + { + "auxiliary_loss_clip": 0.01087396, + "auxiliary_loss_mlp": 0.01040141, + "balance_loss_clip": 1.0380379, + "balance_loss_mlp": 1.02200937, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 1.939781101488234, + "language_loss": 0.72551119, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74678659, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 2.667754888534546 + }, + { + "auxiliary_loss_clip": 0.01101924, + "auxiliary_loss_mlp": 0.01038761, + "balance_loss_clip": 1.04152179, + "balance_loss_mlp": 1.02186942, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.170192877840563, + "language_loss": 0.80243683, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.8238436, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 2.588466167449951 + }, + { + "auxiliary_loss_clip": 0.01126993, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.04005456, + "balance_loss_mlp": 1.02188516, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.733865666991512, + "language_loss": 0.72896624, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75061464, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 2.5202531814575195 + }, + { + "auxiliary_loss_clip": 0.01100384, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.04053056, + "balance_loss_mlp": 1.02882719, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 4.25325304105075, + "language_loss": 0.70363706, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72508967, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.607856512069702 + }, + { + "auxiliary_loss_clip": 0.01114859, + "auxiliary_loss_mlp": 0.00750634, + "balance_loss_clip": 1.03831482, + "balance_loss_mlp": 1.00041997, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 2.4463886864294775, + "language_loss": 0.79509664, + "learning_rate": 3.669817442854444e-06, + "loss": 0.81375158, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 2.572235107421875 + }, + { + "auxiliary_loss_clip": 0.01116309, + "auxiliary_loss_mlp": 0.00750505, + "balance_loss_clip": 1.04088306, + "balance_loss_mlp": 1.00035167, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.961946382520464, + "language_loss": 0.87008238, + "learning_rate": 3.669603055991502e-06, + "loss": 0.88875055, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 2.561971664428711 + }, + { + "auxiliary_loss_clip": 0.01081355, + "auxiliary_loss_mlp": 0.01038307, + "balance_loss_clip": 1.03360462, + "balance_loss_mlp": 1.02289319, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.8905731829992363, + "language_loss": 0.68954772, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.71074432, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 2.6772470474243164 + }, + { + "auxiliary_loss_clip": 0.01119924, + "auxiliary_loss_mlp": 0.0103689, + "balance_loss_clip": 1.04201937, + "balance_loss_mlp": 1.02068996, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 2.436260910216519, + "language_loss": 0.78839993, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.80996805, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 2.67271089553833 + }, + { + "auxiliary_loss_clip": 0.01093683, + "auxiliary_loss_mlp": 0.01041226, + "balance_loss_clip": 1.03779125, + "balance_loss_mlp": 1.0247159, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.6771170621431601, + "language_loss": 0.77166992, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79301906, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 2.6167185306549072 + }, + { + "auxiliary_loss_clip": 0.01114377, + "auxiliary_loss_mlp": 0.01047959, + "balance_loss_clip": 1.04369044, + "balance_loss_mlp": 1.03043485, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 1.8284496707603817, + "language_loss": 0.82134187, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84296525, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 2.6318612098693848 + }, + { + "auxiliary_loss_clip": 0.0112007, + "auxiliary_loss_mlp": 0.01046141, + "balance_loss_clip": 1.04208207, + "balance_loss_mlp": 1.02940452, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 1.663590443960079, + "language_loss": 0.6745593, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69622141, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 2.6216230392456055 + }, + { + "auxiliary_loss_clip": 0.01096503, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.03918636, + "balance_loss_mlp": 1.02272868, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 2.0948898614111586, + "language_loss": 0.80895245, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.8303141, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.675008773803711 + }, + { + "auxiliary_loss_clip": 0.01115761, + "auxiliary_loss_mlp": 0.01041002, + "balance_loss_clip": 1.04102325, + "balance_loss_mlp": 1.02515888, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 2.929444926325288, + "language_loss": 0.78770208, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80926967, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.6641416549682617 + }, + { + "auxiliary_loss_clip": 0.01103896, + "auxiliary_loss_mlp": 0.01037732, + "balance_loss_clip": 1.03982735, + "balance_loss_mlp": 1.02118587, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.6770941215232276, + "language_loss": 0.7411536, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.7625699, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.6747865676879883 + }, + { + "auxiliary_loss_clip": 0.01112713, + "auxiliary_loss_mlp": 0.01040017, + "balance_loss_clip": 1.03926086, + "balance_loss_mlp": 1.02381623, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.4710686475913384, + "language_loss": 0.7530269, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77455425, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 2.599876880645752 + }, + { + "auxiliary_loss_clip": 0.01070996, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.03677142, + "balance_loss_mlp": 1.01927495, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 2.005428644574367, + "language_loss": 0.77333975, + "learning_rate": 3.667455706571316e-06, + "loss": 0.79441124, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.7375528812408447 + }, + { + "auxiliary_loss_clip": 0.01073079, + "auxiliary_loss_mlp": 0.01047499, + "balance_loss_clip": 1.04024673, + "balance_loss_mlp": 1.02717352, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.292475444635804, + "language_loss": 0.78357494, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80478072, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.681873321533203 + }, + { + "auxiliary_loss_clip": 0.01095513, + "auxiliary_loss_mlp": 0.01045998, + "balance_loss_clip": 1.0395273, + "balance_loss_mlp": 1.02848613, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.6712789330877906, + "language_loss": 0.76817381, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.78958893, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.6640350818634033 + }, + { + "auxiliary_loss_clip": 0.01098152, + "auxiliary_loss_mlp": 0.01046468, + "balance_loss_clip": 1.04099393, + "balance_loss_mlp": 1.02971959, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.9428647435019826, + "language_loss": 0.63964963, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.6610958, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.6540801525115967 + }, + { + "auxiliary_loss_clip": 0.01114777, + "auxiliary_loss_mlp": 0.01041069, + "balance_loss_clip": 1.04072917, + "balance_loss_mlp": 1.02461815, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.5803339022044374, + "language_loss": 0.82038039, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84193885, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 2.6151282787323 + }, + { + "auxiliary_loss_clip": 0.01114031, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.03969431, + "balance_loss_mlp": 1.02857804, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 1.6928014557667936, + "language_loss": 0.75612462, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77771485, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 4.056120872497559 + }, + { + "auxiliary_loss_clip": 0.0113109, + "auxiliary_loss_mlp": 0.01038714, + "balance_loss_clip": 1.04258513, + "balance_loss_mlp": 1.02196538, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 3.3525389151519107, + "language_loss": 0.85475016, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87644821, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 2.4878692626953125 + }, + { + "auxiliary_loss_clip": 0.01099617, + "auxiliary_loss_mlp": 0.01039823, + "balance_loss_clip": 1.04334533, + "balance_loss_mlp": 1.02133322, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.6051855398781723, + "language_loss": 0.67703187, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.69842625, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.685898542404175 + }, + { + "auxiliary_loss_clip": 0.011319, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.04316568, + "balance_loss_mlp": 1.02431774, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.6522275927182748, + "language_loss": 0.72185779, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74358416, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.5973291397094727 + }, + { + "auxiliary_loss_clip": 0.01048433, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.03710055, + "balance_loss_mlp": 1.01910794, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.1116487577943537, + "language_loss": 0.69396013, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71482307, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 2.831831693649292 + }, + { + "auxiliary_loss_clip": 0.01120162, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.04163182, + "balance_loss_mlp": 1.02863061, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 2.197778486183978, + "language_loss": 0.73230946, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75398076, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 3.027637004852295 + }, + { + "auxiliary_loss_clip": 0.01096061, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.03829241, + "balance_loss_mlp": 1.01813209, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.8636426609544907, + "language_loss": 0.74342704, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76473367, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 4.208005905151367 + }, + { + "auxiliary_loss_clip": 0.01116178, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.04569292, + "balance_loss_mlp": 1.02142024, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7421019090616499, + "language_loss": 0.7649008, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78644568, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.617500066757202 + }, + { + "auxiliary_loss_clip": 0.01110033, + "auxiliary_loss_mlp": 0.01042812, + "balance_loss_clip": 1.04593432, + "balance_loss_mlp": 1.02545476, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 2.3561071232593376, + "language_loss": 0.68456447, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70609295, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 2.5715551376342773 + }, + { + "auxiliary_loss_clip": 0.01097592, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_clip": 1.04658866, + "balance_loss_mlp": 1.02856183, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.8801440952394, + "language_loss": 0.8496033, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87104857, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.6767795085906982 + }, + { + "auxiliary_loss_clip": 0.0110375, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.03892303, + "balance_loss_mlp": 1.02015996, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.5293291634044555, + "language_loss": 0.63133764, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65274179, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 4.149930715560913 + }, + { + "auxiliary_loss_clip": 0.01065632, + "auxiliary_loss_mlp": 0.0104602, + "balance_loss_clip": 1.03940153, + "balance_loss_mlp": 1.03029609, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.8259994002921405, + "language_loss": 0.89634037, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91745692, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.727644681930542 + }, + { + "auxiliary_loss_clip": 0.01111776, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_clip": 1.04296064, + "balance_loss_mlp": 1.02837062, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.684517422099727, + "language_loss": 0.81360906, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83518159, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.6383914947509766 + }, + { + "auxiliary_loss_clip": 0.01105104, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.04314184, + "balance_loss_mlp": 1.02629948, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.552829145904833, + "language_loss": 0.76232421, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78380167, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 4.167629241943359 + }, + { + "auxiliary_loss_clip": 0.01072278, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.03819966, + "balance_loss_mlp": 1.02368748, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 1.8871348444993628, + "language_loss": 0.75593126, + "learning_rate": 3.663358329538626e-06, + "loss": 0.77704394, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.712876081466675 + }, + { + "auxiliary_loss_clip": 0.01129319, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_clip": 1.04251289, + "balance_loss_mlp": 1.02642798, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 1.7673032643691189, + "language_loss": 0.70179439, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72352016, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.6811225414276123 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01042224, + "balance_loss_clip": 1.04073298, + "balance_loss_mlp": 1.02522516, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.129330855005296, + "language_loss": 0.7670455, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.78859735, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.5379183292388916 + }, + { + "auxiliary_loss_clip": 0.01107045, + "auxiliary_loss_mlp": 0.0104175, + "balance_loss_clip": 1.03857589, + "balance_loss_mlp": 1.02535868, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 1.8659825143395836, + "language_loss": 0.81409037, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83557832, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 2.667872428894043 + }, + { + "auxiliary_loss_clip": 0.01064949, + "auxiliary_loss_mlp": 0.01041179, + "balance_loss_clip": 1.03420568, + "balance_loss_mlp": 1.02382219, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 1.846076072093111, + "language_loss": 0.75421393, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77527523, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 2.764787197113037 + }, + { + "auxiliary_loss_clip": 0.01131737, + "auxiliary_loss_mlp": 0.01039457, + "balance_loss_clip": 1.04337227, + "balance_loss_mlp": 1.02251697, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.6752876913664194, + "language_loss": 0.77025247, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79196441, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.600381851196289 + }, + { + "auxiliary_loss_clip": 0.01127085, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.04114389, + "balance_loss_mlp": 1.02767575, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 2.0552937519332555, + "language_loss": 0.77812618, + "learning_rate": 3.662059687737528e-06, + "loss": 0.79985023, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 2.5251123905181885 + }, + { + "auxiliary_loss_clip": 0.01116183, + "auxiliary_loss_mlp": 0.0104582, + "balance_loss_clip": 1.04104495, + "balance_loss_mlp": 1.02914286, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.7343399682103906, + "language_loss": 0.81608135, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83770132, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 2.704082489013672 + }, + { + "auxiliary_loss_clip": 0.01109481, + "auxiliary_loss_mlp": 0.00750725, + "balance_loss_clip": 1.04081285, + "balance_loss_mlp": 1.00038671, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.6881771065914832, + "language_loss": 0.76580918, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78441125, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 2.662126302719116 + }, + { + "auxiliary_loss_clip": 0.01127602, + "auxiliary_loss_mlp": 0.01040936, + "balance_loss_clip": 1.04248202, + "balance_loss_mlp": 1.02500939, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 1.9962950074330168, + "language_loss": 0.82881427, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85049963, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 2.529059886932373 + }, + { + "auxiliary_loss_clip": 0.01096794, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.03992176, + "balance_loss_mlp": 1.02193654, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.6431660908973247, + "language_loss": 0.72875845, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75013453, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 2.578887701034546 + }, + { + "auxiliary_loss_clip": 0.01102242, + "auxiliary_loss_mlp": 0.01042456, + "balance_loss_clip": 1.04403281, + "balance_loss_mlp": 1.02424109, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.6268885324629236, + "language_loss": 0.73769832, + "learning_rate": 3.660975752961054e-06, + "loss": 0.75914532, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 2.629647970199585 + }, + { + "auxiliary_loss_clip": 0.0112365, + "auxiliary_loss_mlp": 0.01043008, + "balance_loss_clip": 1.04336858, + "balance_loss_mlp": 1.02604437, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.7590653685605118, + "language_loss": 0.71147215, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73313868, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 2.6802449226379395 + }, + { + "auxiliary_loss_clip": 0.01108468, + "auxiliary_loss_mlp": 0.01039072, + "balance_loss_clip": 1.04226017, + "balance_loss_mlp": 1.0213933, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9775236706074537, + "language_loss": 0.71918148, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.74065685, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 2.6985714435577393 + }, + { + "auxiliary_loss_clip": 0.01115749, + "auxiliary_loss_mlp": 0.01047477, + "balance_loss_clip": 1.04221392, + "balance_loss_mlp": 1.03120518, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 1.9035565182908218, + "language_loss": 0.70627695, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72790921, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 2.662415027618408 + }, + { + "auxiliary_loss_clip": 0.01131177, + "auxiliary_loss_mlp": 0.0104421, + "balance_loss_clip": 1.04244936, + "balance_loss_mlp": 1.0275563, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.7985062649426915, + "language_loss": 0.87626845, + "learning_rate": 3.660107471371981e-06, + "loss": 0.89802229, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 2.5294370651245117 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.0075054, + "balance_loss_clip": 1.03951812, + "balance_loss_mlp": 1.00037277, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.6922383971054675, + "language_loss": 0.80588889, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82453299, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.5799052715301514 + }, + { + "auxiliary_loss_clip": 0.0105137, + "auxiliary_loss_mlp": 0.01042329, + "balance_loss_clip": 1.03127122, + "balance_loss_mlp": 1.02494836, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.5368186508270039, + "language_loss": 0.87144339, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89238036, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.721684694290161 + }, + { + "auxiliary_loss_clip": 0.01095541, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.03763425, + "balance_loss_mlp": 1.02466667, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 35.588373061718904, + "language_loss": 0.57893026, + "learning_rate": 3.659455599161237e-06, + "loss": 0.60029781, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.5981252193450928 + }, + { + "auxiliary_loss_clip": 0.01128918, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.04226995, + "balance_loss_mlp": 1.02146924, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 1.7781717709258864, + "language_loss": 0.7571097, + "learning_rate": 3.659238182559888e-06, + "loss": 0.77878094, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 2.6161015033721924 + }, + { + "auxiliary_loss_clip": 0.01080746, + "auxiliary_loss_mlp": 0.01039087, + "balance_loss_clip": 1.03732705, + "balance_loss_mlp": 1.02289796, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.6209873363678216, + "language_loss": 0.69321454, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71441293, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.7049927711486816 + }, + { + "auxiliary_loss_clip": 0.01124827, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.04196048, + "balance_loss_mlp": 1.02037978, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.6859803659255057, + "language_loss": 0.76023793, + "learning_rate": 3.658803160610004e-06, + "loss": 0.78184319, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.5823636054992676 + }, + { + "auxiliary_loss_clip": 0.01104811, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.04321098, + "balance_loss_mlp": 1.01688242, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.7105755347560496, + "language_loss": 0.66336268, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68473697, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.5887107849121094 + }, + { + "auxiliary_loss_clip": 0.0109452, + "auxiliary_loss_mlp": 0.01038413, + "balance_loss_clip": 1.03924179, + "balance_loss_mlp": 1.02284408, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.7448006905851206, + "language_loss": 0.71237969, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73370904, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.6334447860717773 + }, + { + "auxiliary_loss_clip": 0.01104007, + "auxiliary_loss_mlp": 0.01043234, + "balance_loss_clip": 1.04222238, + "balance_loss_mlp": 1.02737916, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.6511254855865427, + "language_loss": 0.72010469, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74157703, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 2.6540398597717285 + }, + { + "auxiliary_loss_clip": 0.0108561, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.03972459, + "balance_loss_mlp": 1.02469432, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.702811645773169, + "language_loss": 0.79969788, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82096511, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.6751880645751953 + }, + { + "auxiliary_loss_clip": 0.01128832, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.04130554, + "balance_loss_mlp": 1.02168727, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 2.705493307680037, + "language_loss": 0.75027537, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.77194649, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 4.423128366470337 + }, + { + "auxiliary_loss_clip": 0.0109033, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_clip": 1.04134905, + "balance_loss_mlp": 1.03055215, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.0497789332041956, + "language_loss": 0.74210471, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76349187, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.618112087249756 + }, + { + "auxiliary_loss_clip": 0.01093999, + "auxiliary_loss_mlp": 0.01038234, + "balance_loss_clip": 1.04258204, + "balance_loss_mlp": 1.02216411, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 4.205558878978757, + "language_loss": 0.80360144, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82492375, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.636829376220703 + }, + { + "auxiliary_loss_clip": 0.01127227, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.04492688, + "balance_loss_mlp": 1.02469063, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 2.018194875544763, + "language_loss": 0.88144708, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90311581, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 2.5221498012542725 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.01041466, + "balance_loss_clip": 1.0425514, + "balance_loss_mlp": 1.02558768, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 1.8780608883999879, + "language_loss": 0.83670425, + "learning_rate": 3.656842449140983e-06, + "loss": 0.85838854, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.5479021072387695 + }, + { + "auxiliary_loss_clip": 0.01109556, + "auxiliary_loss_mlp": 0.01044796, + "balance_loss_clip": 1.03918171, + "balance_loss_mlp": 1.02848804, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 4.071248266976794, + "language_loss": 0.76997864, + "learning_rate": 3.656624278062713e-06, + "loss": 0.79152215, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.551323890686035 + }, + { + "auxiliary_loss_clip": 0.01112186, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.04027772, + "balance_loss_mlp": 1.01907897, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.565838236717142, + "language_loss": 0.72540987, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74686533, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 4.133237838745117 + }, + { + "auxiliary_loss_clip": 0.01068905, + "auxiliary_loss_mlp": 0.00750429, + "balance_loss_clip": 1.03662133, + "balance_loss_mlp": 1.00039697, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.9101123226887875, + "language_loss": 0.68015528, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69834864, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 2.727179765701294 + }, + { + "auxiliary_loss_clip": 0.01089785, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.04086185, + "balance_loss_mlp": 1.02087808, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 1.681241218704096, + "language_loss": 0.64997703, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.67125046, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.8078901767730713 + }, + { + "auxiliary_loss_clip": 0.01114787, + "auxiliary_loss_mlp": 0.01044776, + "balance_loss_clip": 1.04104853, + "balance_loss_mlp": 1.0277884, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 5.032959738417581, + "language_loss": 0.7257961, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.7473917, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.6052749156951904 + }, + { + "auxiliary_loss_clip": 0.01117514, + "auxiliary_loss_mlp": 0.00750439, + "balance_loss_clip": 1.04610145, + "balance_loss_mlp": 1.00036335, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6979517136891749, + "language_loss": 0.66684723, + "learning_rate": 3.655532480546528e-06, + "loss": 0.68552685, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 4.231218099594116 + }, + { + "auxiliary_loss_clip": 0.01132598, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.04224801, + "balance_loss_mlp": 1.01789272, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.7086375477706786, + "language_loss": 0.79508293, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81675375, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 2.588266611099243 + }, + { + "auxiliary_loss_clip": 0.01125084, + "auxiliary_loss_mlp": 0.01036093, + "balance_loss_clip": 1.04117513, + "balance_loss_mlp": 1.02118611, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.7592867201378724, + "language_loss": 0.68090785, + "learning_rate": 3.655095322036373e-06, + "loss": 0.70251966, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.615295171737671 + }, + { + "auxiliary_loss_clip": 0.01119097, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.04258299, + "balance_loss_mlp": 1.02205396, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8610610846324838, + "language_loss": 0.72905701, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75063276, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 4.087384223937988 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.0397296, + "balance_loss_mlp": 1.02398729, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.370143455442149, + "language_loss": 0.7720511, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79347974, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 2.6165809631347656 + }, + { + "auxiliary_loss_clip": 0.01125753, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.04254127, + "balance_loss_mlp": 1.01782894, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.4359372492252969, + "language_loss": 0.84443581, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.86602974, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.5477330684661865 + }, + { + "auxiliary_loss_clip": 0.01126972, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.04264569, + "balance_loss_mlp": 1.02164471, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.4551695210167592, + "language_loss": 0.76415521, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78579509, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 2.6286749839782715 + }, + { + "auxiliary_loss_clip": 0.01101413, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.04182577, + "balance_loss_mlp": 1.02458787, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 2.4867461871917667, + "language_loss": 0.88929975, + "learning_rate": 3.654001327581981e-06, + "loss": 0.91072208, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 2.5831356048583984 + }, + { + "auxiliary_loss_clip": 0.01022622, + "auxiliary_loss_mlp": 0.01014487, + "balance_loss_clip": 1.01656115, + "balance_loss_mlp": 1.01221013, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8354745708689861, + "language_loss": 0.5232532, + "learning_rate": 3.653782340498215e-06, + "loss": 0.5436244, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.132528305053711 + }, + { + "auxiliary_loss_clip": 0.01108768, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.03861284, + "balance_loss_mlp": 1.01942289, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 2.4628999037936423, + "language_loss": 0.67301214, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69443691, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.5296850204467773 + }, + { + "auxiliary_loss_clip": 0.01092202, + "auxiliary_loss_mlp": 0.01042186, + "balance_loss_clip": 1.03800964, + "balance_loss_mlp": 1.02596211, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.5315046953646818, + "language_loss": 0.73939866, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76074255, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 2.6901533603668213 + }, + { + "auxiliary_loss_clip": 0.01110863, + "auxiliary_loss_mlp": 0.01042122, + "balance_loss_clip": 1.03963923, + "balance_loss_mlp": 1.02699471, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.721055938572017, + "language_loss": 0.77817988, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79970968, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 2.513962745666504 + }, + { + "auxiliary_loss_clip": 0.01119299, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.0412569, + "balance_loss_mlp": 1.01997113, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.378119627185229, + "language_loss": 0.69993871, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.721515, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 2.646263837814331 + }, + { + "auxiliary_loss_clip": 0.01128417, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.04189277, + "balance_loss_mlp": 1.02571249, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 3.0619708843764966, + "language_loss": 0.79089069, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.81259578, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 2.490248680114746 + }, + { + "auxiliary_loss_clip": 0.0110872, + "auxiliary_loss_mlp": 0.01043942, + "balance_loss_clip": 1.04031563, + "balance_loss_mlp": 1.02472579, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 73.7466494898727, + "language_loss": 0.82450491, + "learning_rate": 3.652467101342991e-06, + "loss": 0.84603155, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.613605260848999 + }, + { + "auxiliary_loss_clip": 0.01108421, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.04336298, + "balance_loss_mlp": 1.02250838, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 4.459737062251317, + "language_loss": 0.64952201, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67099929, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 2.6403374671936035 + }, + { + "auxiliary_loss_clip": 0.01120013, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.03927755, + "balance_loss_mlp": 1.02261281, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 1.7996916968571, + "language_loss": 0.75549316, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77707511, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.505798101425171 + }, + { + "auxiliary_loss_clip": 0.01110732, + "auxiliary_loss_mlp": 0.01037542, + "balance_loss_clip": 1.03935719, + "balance_loss_mlp": 1.02136517, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.9190703979883692, + "language_loss": 0.72090137, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74238414, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 2.529635429382324 + }, + { + "auxiliary_loss_clip": 0.01097091, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.03861451, + "balance_loss_mlp": 1.01818681, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.9034080157401652, + "language_loss": 0.68472695, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70604223, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.538025140762329 + }, + { + "auxiliary_loss_clip": 0.01109587, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_clip": 1.03886139, + "balance_loss_mlp": 1.0242939, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 1.793971870558713, + "language_loss": 0.88465428, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90617728, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.4809060096740723 + }, + { + "auxiliary_loss_clip": 0.01021296, + "auxiliary_loss_mlp": 0.01011762, + "balance_loss_clip": 1.01076722, + "balance_loss_mlp": 1.01008117, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8026375902091953, + "language_loss": 0.56210911, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58243972, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 3.077653646469116 + }, + { + "auxiliary_loss_clip": 0.01114587, + "auxiliary_loss_mlp": 0.00750471, + "balance_loss_clip": 1.04093337, + "balance_loss_mlp": 1.00055671, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.739361268467863, + "language_loss": 0.88921314, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90786374, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.5337162017822266 + }, + { + "auxiliary_loss_clip": 0.01112979, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.03778386, + "balance_loss_mlp": 1.02717876, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 2.9692623203011776, + "language_loss": 0.78134751, + "learning_rate": 3.650709940390972e-06, + "loss": 0.8029114, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.5118954181671143 + }, + { + "auxiliary_loss_clip": 0.01117411, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.04259348, + "balance_loss_mlp": 1.02330208, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.888393921540466, + "language_loss": 0.72624266, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.74781477, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 2.5829339027404785 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.0407269, + "balance_loss_mlp": 1.02041936, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 1.9939125392547121, + "language_loss": 0.71391529, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.73542249, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.5833921432495117 + }, + { + "auxiliary_loss_clip": 0.01124904, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.03991342, + "balance_loss_mlp": 1.02068937, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 3.229023701782681, + "language_loss": 0.83609748, + "learning_rate": 3.650049971985889e-06, + "loss": 0.85772789, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.4981203079223633 + }, + { + "auxiliary_loss_clip": 0.0110666, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.03989172, + "balance_loss_mlp": 1.02491796, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 2.628607166284909, + "language_loss": 0.8287794, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85026193, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 2.680689573287964 + }, + { + "auxiliary_loss_clip": 0.01090201, + "auxiliary_loss_mlp": 0.00750504, + "balance_loss_clip": 1.03710079, + "balance_loss_mlp": 1.00050998, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.276433679518801, + "language_loss": 0.89781952, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.91622657, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 2.6882171630859375 + }, + { + "auxiliary_loss_clip": 0.01116199, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_clip": 1.04257035, + "balance_loss_mlp": 1.02572417, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 1.6767381497729081, + "language_loss": 0.7387864, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76037395, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 2.54594349861145 + }, + { + "auxiliary_loss_clip": 0.0109512, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.04466867, + "balance_loss_mlp": 1.02685809, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 1.6199511316981174, + "language_loss": 0.83244097, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85381871, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.7053306102752686 + }, + { + "auxiliary_loss_clip": 0.01082498, + "auxiliary_loss_mlp": 0.00750687, + "balance_loss_clip": 1.03799617, + "balance_loss_mlp": 1.00050306, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.7683686158961751, + "language_loss": 0.76278317, + "learning_rate": 3.648948773354224e-06, + "loss": 0.781115, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.8115978240966797 + }, + { + "auxiliary_loss_clip": 0.01108972, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.03762007, + "balance_loss_mlp": 1.02158582, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.8243088808001706, + "language_loss": 0.81117272, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83264732, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 4.213339328765869 + }, + { + "auxiliary_loss_clip": 0.01130607, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.04434848, + "balance_loss_mlp": 1.02009201, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 1.69091621089468, + "language_loss": 0.72305268, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74471855, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 2.55379581451416 + }, + { + "auxiliary_loss_clip": 0.01108741, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_clip": 1.03872728, + "balance_loss_mlp": 1.02407932, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 1.6751225232838372, + "language_loss": 0.84102595, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86253744, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.60249924659729 + }, + { + "auxiliary_loss_clip": 0.01101002, + "auxiliary_loss_mlp": 0.01045332, + "balance_loss_clip": 1.04274857, + "balance_loss_mlp": 1.02548361, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 1.7021972205537415, + "language_loss": 0.69317937, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71464276, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 2.711034059524536 + }, + { + "auxiliary_loss_clip": 0.01092556, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.03895485, + "balance_loss_mlp": 1.03028834, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 4.468808813086233, + "language_loss": 0.84165281, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86306244, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.647310733795166 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01045802, + "balance_loss_clip": 1.04033637, + "balance_loss_mlp": 1.02750373, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.4562003806676347, + "language_loss": 0.7517482, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.7732029, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 4.155431032180786 + }, + { + "auxiliary_loss_clip": 0.01118501, + "auxiliary_loss_mlp": 0.01040181, + "balance_loss_clip": 1.0435375, + "balance_loss_mlp": 1.02269268, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 1.5190957940747067, + "language_loss": 0.80795789, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82954466, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.6205039024353027 + }, + { + "auxiliary_loss_clip": 0.01100134, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.04256082, + "balance_loss_mlp": 1.0213871, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 1.9458454952250945, + "language_loss": 0.78953588, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81093091, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.606889486312866 + }, + { + "auxiliary_loss_clip": 0.01057949, + "auxiliary_loss_mlp": 0.01047363, + "balance_loss_clip": 1.03808641, + "balance_loss_mlp": 1.03094792, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 2.3947889355946956, + "language_loss": 0.83069944, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85175258, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.697970390319824 + }, + { + "auxiliary_loss_clip": 0.01109104, + "auxiliary_loss_mlp": 0.00750883, + "balance_loss_clip": 1.0417881, + "balance_loss_mlp": 1.00056243, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5498852198457873, + "language_loss": 0.80777389, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82637376, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 2.625718355178833 + }, + { + "auxiliary_loss_clip": 0.01093791, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.03696859, + "balance_loss_mlp": 1.02719414, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.6491324142990549, + "language_loss": 0.81931609, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.84071344, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 4.2379560470581055 + }, + { + "auxiliary_loss_clip": 0.01086744, + "auxiliary_loss_mlp": 0.00750802, + "balance_loss_clip": 1.03971243, + "balance_loss_mlp": 1.00044656, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 1.7025400486393063, + "language_loss": 0.76047635, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.77885187, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.6546123027801514 + }, + { + "auxiliary_loss_clip": 0.0108784, + "auxiliary_loss_mlp": 0.01048616, + "balance_loss_clip": 1.03967488, + "balance_loss_mlp": 1.03277326, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.7791411959985868, + "language_loss": 0.79794455, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.81930912, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 4.287313222885132 + }, + { + "auxiliary_loss_clip": 0.01130785, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.04319787, + "balance_loss_mlp": 1.03012395, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.6120634554320779, + "language_loss": 0.83144367, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85322797, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.554471492767334 + }, + { + "auxiliary_loss_clip": 0.01130548, + "auxiliary_loss_mlp": 0.01044996, + "balance_loss_clip": 1.04274499, + "balance_loss_mlp": 1.02786589, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 2.062599638207901, + "language_loss": 0.74557495, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76733041, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 2.505378246307373 + }, + { + "auxiliary_loss_clip": 0.01087154, + "auxiliary_loss_mlp": 0.01049718, + "balance_loss_clip": 1.03807855, + "balance_loss_mlp": 1.0314666, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 2.486107750992085, + "language_loss": 0.74100375, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76237249, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.649503469467163 + }, + { + "auxiliary_loss_clip": 0.01114841, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.04131186, + "balance_loss_mlp": 1.02325702, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 1.8067635891399598, + "language_loss": 0.80112171, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82266879, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.5603532791137695 + }, + { + "auxiliary_loss_clip": 0.01046307, + "auxiliary_loss_mlp": 0.01001728, + "balance_loss_clip": 1.01652861, + "balance_loss_mlp": 0.99939126, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6977522748435057, + "language_loss": 0.58349597, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60397637, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 3.1890852451324463 + }, + { + "auxiliary_loss_clip": 0.01132138, + "auxiliary_loss_mlp": 0.01042489, + "balance_loss_clip": 1.04271984, + "balance_loss_mlp": 1.02451229, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.099087125297662, + "language_loss": 0.73141491, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75316113, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 2.540947198867798 + }, + { + "auxiliary_loss_clip": 0.01111883, + "auxiliary_loss_mlp": 0.01046229, + "balance_loss_clip": 1.04239988, + "balance_loss_mlp": 1.02837157, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.017662826079439, + "language_loss": 0.76669019, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78827131, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 2.5457658767700195 + }, + { + "auxiliary_loss_clip": 0.01057181, + "auxiliary_loss_mlp": 0.01048836, + "balance_loss_clip": 1.0366869, + "balance_loss_mlp": 1.03201604, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.8185094336397774, + "language_loss": 0.74492168, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76598191, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 2.731537103652954 + }, + { + "auxiliary_loss_clip": 0.01107129, + "auxiliary_loss_mlp": 0.01047229, + "balance_loss_clip": 1.04076409, + "balance_loss_mlp": 1.02990842, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.9819532319429087, + "language_loss": 0.88465917, + "learning_rate": 3.6440849425579e-06, + "loss": 0.90620279, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 2.5394392013549805 + }, + { + "auxiliary_loss_clip": 0.01131386, + "auxiliary_loss_mlp": 0.01044884, + "balance_loss_clip": 1.04339767, + "balance_loss_mlp": 1.02756262, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 1.6071523335407174, + "language_loss": 0.7756083, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79737103, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 2.513493776321411 + }, + { + "auxiliary_loss_clip": 0.01062947, + "auxiliary_loss_mlp": 0.01049691, + "balance_loss_clip": 1.03655219, + "balance_loss_mlp": 1.03254914, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 1.9471615254217671, + "language_loss": 0.63572693, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65685326, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 2.6491634845733643 + }, + { + "auxiliary_loss_clip": 0.01062063, + "auxiliary_loss_mlp": 0.01052572, + "balance_loss_clip": 1.03415394, + "balance_loss_mlp": 1.03336763, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 2.010234949227331, + "language_loss": 0.75495565, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77610201, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.663651466369629 + }, + { + "auxiliary_loss_clip": 0.01080893, + "auxiliary_loss_mlp": 0.01055683, + "balance_loss_clip": 1.0396564, + "balance_loss_mlp": 1.03650224, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 1.9198861950320745, + "language_loss": 0.71337366, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73473942, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 2.5949761867523193 + }, + { + "auxiliary_loss_clip": 0.01117627, + "auxiliary_loss_mlp": 0.01045304, + "balance_loss_clip": 1.04242837, + "balance_loss_mlp": 1.02828074, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.7035419285191078, + "language_loss": 0.73245525, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75408459, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.5956015586853027 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_clip": 1.03872597, + "balance_loss_mlp": 1.02701521, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.179400926988446, + "language_loss": 0.90146977, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92308998, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.555488109588623 + }, + { + "auxiliary_loss_clip": 0.0107627, + "auxiliary_loss_mlp": 0.01040514, + "balance_loss_clip": 1.03703868, + "balance_loss_mlp": 1.02264488, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.2958958487374397, + "language_loss": 0.81665677, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83782464, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.651432991027832 + }, + { + "auxiliary_loss_clip": 0.01107932, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.04089582, + "balance_loss_mlp": 1.0249244, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.629537857225853, + "language_loss": 0.75665677, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77814865, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.6004083156585693 + }, + { + "auxiliary_loss_clip": 0.01123341, + "auxiliary_loss_mlp": 0.01046985, + "balance_loss_clip": 1.04294419, + "balance_loss_mlp": 1.02931762, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 2.0449006827679534, + "language_loss": 0.69466007, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71636331, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.584078788757324 + }, + { + "auxiliary_loss_clip": 0.01122887, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_clip": 1.04559255, + "balance_loss_mlp": 1.02616596, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.5989293501564419, + "language_loss": 0.78375775, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80542123, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 2.5549328327178955 + }, + { + "auxiliary_loss_clip": 0.0112449, + "auxiliary_loss_mlp": 0.01040478, + "balance_loss_clip": 1.04078209, + "balance_loss_mlp": 1.02451587, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 1.680301518806132, + "language_loss": 0.79541421, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81706393, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.524353504180908 + }, + { + "auxiliary_loss_clip": 0.01117728, + "auxiliary_loss_mlp": 0.01038004, + "balance_loss_clip": 1.04113078, + "balance_loss_mlp": 1.0221256, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.8471598891774335, + "language_loss": 0.87644714, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89800441, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.5663869380950928 + }, + { + "auxiliary_loss_clip": 0.01122162, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.04297233, + "balance_loss_mlp": 1.01974297, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 1.8958428717633389, + "language_loss": 0.76893508, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79054308, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 2.5335617065429688 + }, + { + "auxiliary_loss_clip": 0.01097009, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_clip": 1.04231775, + "balance_loss_mlp": 1.02995849, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 1.9350884562557544, + "language_loss": 0.84777272, + "learning_rate": 3.640974061218741e-06, + "loss": 0.86922407, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.555561065673828 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01049507, + "balance_loss_clip": 1.040555, + "balance_loss_mlp": 1.03216195, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.480029101265496, + "language_loss": 0.77601957, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79762101, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 2.537745237350464 + }, + { + "auxiliary_loss_clip": 0.0102579, + "auxiliary_loss_mlp": 0.01008997, + "balance_loss_clip": 1.00778484, + "balance_loss_mlp": 1.0071969, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8142526804300079, + "language_loss": 0.60731316, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62766111, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 3.1520345211029053 + }, + { + "auxiliary_loss_clip": 0.01098224, + "auxiliary_loss_mlp": 0.00750699, + "balance_loss_clip": 1.03793573, + "balance_loss_mlp": 1.00039148, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.9125120616677407, + "language_loss": 0.89969289, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.91818213, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.613678455352783 + }, + { + "auxiliary_loss_clip": 0.01072004, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.04099035, + "balance_loss_mlp": 1.02142143, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.5956262000314791, + "language_loss": 0.7344777, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75559247, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 2.7588233947753906 + }, + { + "auxiliary_loss_clip": 0.0112573, + "auxiliary_loss_mlp": 0.01038431, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.02130103, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 3.6279703407557258, + "language_loss": 0.77010882, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79175043, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 3.988513708114624 + }, + { + "auxiliary_loss_clip": 0.01117528, + "auxiliary_loss_mlp": 0.01039987, + "balance_loss_clip": 1.04178023, + "balance_loss_mlp": 1.02401257, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5824265641541106, + "language_loss": 0.71274865, + "learning_rate": 3.63963709145597e-06, + "loss": 0.7343238, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 2.6658263206481934 + }, + { + "auxiliary_loss_clip": 0.01060575, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.03621519, + "balance_loss_mlp": 1.02429891, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.8024615096338747, + "language_loss": 0.76869094, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78969061, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 2.730476140975952 + }, + { + "auxiliary_loss_clip": 0.01129019, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.04175413, + "balance_loss_mlp": 1.02326608, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.1620730621371163, + "language_loss": 0.75704885, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77873212, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 2.504986047744751 + }, + { + "auxiliary_loss_clip": 0.01125886, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.04073811, + "balance_loss_mlp": 1.02204514, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 2.20933746969271, + "language_loss": 0.83905792, + "learning_rate": 3.638967767095249e-06, + "loss": 0.86069155, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.5075063705444336 + }, + { + "auxiliary_loss_clip": 0.01096542, + "auxiliary_loss_mlp": 0.01045149, + "balance_loss_clip": 1.0428555, + "balance_loss_mlp": 1.02877009, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.752570412066604, + "language_loss": 0.81690526, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.8383221, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 4.084911346435547 + }, + { + "auxiliary_loss_clip": 0.01119874, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.04404521, + "balance_loss_mlp": 1.02393496, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 1.7323474173672049, + "language_loss": 0.75536019, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77696025, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 2.6261186599731445 + }, + { + "auxiliary_loss_clip": 0.0110157, + "auxiliary_loss_mlp": 0.01047848, + "balance_loss_clip": 1.0426023, + "balance_loss_mlp": 1.0313971, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 1.9366012550625342, + "language_loss": 0.88198733, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90348148, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.574587345123291 + }, + { + "auxiliary_loss_clip": 0.01097464, + "auxiliary_loss_mlp": 0.00750574, + "balance_loss_clip": 1.04470778, + "balance_loss_mlp": 1.0003829, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 1.9729296050472116, + "language_loss": 0.75709724, + "learning_rate": 3.638074464556311e-06, + "loss": 0.77557755, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 2.6232573986053467 + }, + { + "auxiliary_loss_clip": 0.01114398, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.04466534, + "balance_loss_mlp": 1.02135646, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 2.387676765640713, + "language_loss": 0.89466166, + "learning_rate": 3.63785098361053e-06, + "loss": 0.91619503, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 4.133086442947388 + }, + { + "auxiliary_loss_clip": 0.01114732, + "auxiliary_loss_mlp": 0.01047478, + "balance_loss_clip": 1.04067326, + "balance_loss_mlp": 1.03050232, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.2429691529284166, + "language_loss": 0.89612496, + "learning_rate": 3.637627440557275e-06, + "loss": 0.91774702, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.5168840885162354 + }, + { + "auxiliary_loss_clip": 0.01106555, + "auxiliary_loss_mlp": 0.00750545, + "balance_loss_clip": 1.04159832, + "balance_loss_mlp": 1.00031543, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.6346201068956332, + "language_loss": 0.79315966, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81173068, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 4.149393320083618 + }, + { + "auxiliary_loss_clip": 0.01125243, + "auxiliary_loss_mlp": 0.01046951, + "balance_loss_clip": 1.04809546, + "balance_loss_mlp": 1.02846169, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.0717163110353827, + "language_loss": 0.7169627, + "learning_rate": 3.637180168162255e-06, + "loss": 0.73868465, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.6343021392822266 + }, + { + "auxiliary_loss_clip": 0.01103935, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.04209232, + "balance_loss_mlp": 1.02618325, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 2.549937729479439, + "language_loss": 0.81086922, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83232641, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 2.633310556411743 + }, + { + "auxiliary_loss_clip": 0.01123246, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.04535198, + "balance_loss_mlp": 1.02205372, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 1.6875436108748212, + "language_loss": 0.71680069, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73842776, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 2.595346212387085 + }, + { + "auxiliary_loss_clip": 0.01127734, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.04212523, + "balance_loss_mlp": 1.02530456, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 1.8761578042464244, + "language_loss": 0.68420094, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70590359, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 2.7625300884246826 + }, + { + "auxiliary_loss_clip": 0.01130793, + "auxiliary_loss_mlp": 0.01039815, + "balance_loss_clip": 1.04227471, + "balance_loss_mlp": 1.0235672, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.046146501949901, + "language_loss": 0.77507269, + "learning_rate": 3.636284878455669e-06, + "loss": 0.7967788, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 2.554544687271118 + }, + { + "auxiliary_loss_clip": 0.0111313, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.04165745, + "balance_loss_mlp": 1.0273788, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.5531620595269655, + "language_loss": 0.82696486, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84852159, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 2.5468227863311768 + }, + { + "auxiliary_loss_clip": 0.01111457, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.04081523, + "balance_loss_mlp": 1.01920295, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.8477252954056385, + "language_loss": 0.83032858, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85178518, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 2.552093744277954 + }, + { + "auxiliary_loss_clip": 0.01120956, + "auxiliary_loss_mlp": 0.01042778, + "balance_loss_clip": 1.03852367, + "balance_loss_mlp": 1.02726889, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.634056928728185, + "language_loss": 0.72317994, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74481726, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.5619544982910156 + }, + { + "auxiliary_loss_clip": 0.01081905, + "auxiliary_loss_mlp": 0.01047998, + "balance_loss_clip": 1.03699589, + "balance_loss_mlp": 1.02822149, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.4977291521735814, + "language_loss": 0.74127662, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76257563, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 2.6439409255981445 + }, + { + "auxiliary_loss_clip": 0.01106796, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.03764153, + "balance_loss_mlp": 1.01926565, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 1.8623679476116872, + "language_loss": 0.86620367, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88761663, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 2.56012225151062 + }, + { + "auxiliary_loss_clip": 0.01098602, + "auxiliary_loss_mlp": 0.01038227, + "balance_loss_clip": 1.03563142, + "balance_loss_mlp": 1.02156186, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.090438473638531, + "language_loss": 0.83922803, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.86059636, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 2.666504383087158 + }, + { + "auxiliary_loss_clip": 0.01111946, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.03764582, + "balance_loss_mlp": 1.02365565, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 2.1298991487228327, + "language_loss": 0.7422359, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76375699, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.5284597873687744 + }, + { + "auxiliary_loss_clip": 0.00999927, + "auxiliary_loss_mlp": 0.01009485, + "balance_loss_clip": 1.01103544, + "balance_loss_mlp": 1.00752962, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7347188346783603, + "language_loss": 0.51518196, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53527606, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.2196667194366455 + }, + { + "auxiliary_loss_clip": 0.01093774, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.03837752, + "balance_loss_mlp": 1.03073859, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.886306278547655, + "language_loss": 0.75471503, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77612472, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 2.666224241256714 + }, + { + "auxiliary_loss_clip": 0.01121296, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.04414654, + "balance_loss_mlp": 1.0237186, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.7200592618737685, + "language_loss": 0.72647655, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74809772, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.576591968536377 + }, + { + "auxiliary_loss_clip": 0.01098452, + "auxiliary_loss_mlp": 0.01039405, + "balance_loss_clip": 1.04002166, + "balance_loss_mlp": 1.0234313, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 3.042502697294043, + "language_loss": 0.8047244, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.82610297, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.60160493850708 + }, + { + "auxiliary_loss_clip": 0.01087879, + "auxiliary_loss_mlp": 0.00750522, + "balance_loss_clip": 1.03980517, + "balance_loss_mlp": 1.00053406, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 1.8924138600372051, + "language_loss": 0.84940988, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86779392, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.642348527908325 + }, + { + "auxiliary_loss_clip": 0.01112292, + "auxiliary_loss_mlp": 0.01035573, + "balance_loss_clip": 1.04185176, + "balance_loss_mlp": 1.01881194, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.63877979003981, + "language_loss": 0.80630243, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82778102, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.6775596141815186 + }, + { + "auxiliary_loss_clip": 0.01009435, + "auxiliary_loss_mlp": 0.01007703, + "balance_loss_clip": 1.01007557, + "balance_loss_mlp": 1.00559282, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7793549100821885, + "language_loss": 0.58238399, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60255539, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 3.2230210304260254 + }, + { + "auxiliary_loss_clip": 0.01079218, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.03628135, + "balance_loss_mlp": 1.02137733, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.398906275424179, + "language_loss": 0.74273431, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76391059, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.6665923595428467 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01037241, + "balance_loss_clip": 1.04021287, + "balance_loss_mlp": 1.02086163, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.838844990269305, + "language_loss": 0.81379932, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83532214, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.595551013946533 + }, + { + "auxiliary_loss_clip": 0.01088775, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.03446734, + "balance_loss_mlp": 1.03025365, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.6577323618938682, + "language_loss": 0.73028952, + "learning_rate": 3.632468828196102e-06, + "loss": 0.7516517, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 2.6376359462738037 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01042304, + "balance_loss_clip": 1.04081821, + "balance_loss_mlp": 1.02805853, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6225417093318855, + "language_loss": 0.78076637, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80220854, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.6017725467681885 + }, + { + "auxiliary_loss_clip": 0.01109793, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.04190326, + "balance_loss_mlp": 1.02866554, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 1.8022675628366602, + "language_loss": 0.80179989, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82337421, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 2.5687105655670166 + }, + { + "auxiliary_loss_clip": 0.01105675, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.04096711, + "balance_loss_mlp": 1.02241468, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.9993589858669085, + "language_loss": 0.76454234, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.7860086, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.5534727573394775 + }, + { + "auxiliary_loss_clip": 0.01096963, + "auxiliary_loss_mlp": 0.01039516, + "balance_loss_clip": 1.0399406, + "balance_loss_mlp": 1.02416205, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.864783950409728, + "language_loss": 0.97926283, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00062764, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.5682854652404785 + }, + { + "auxiliary_loss_clip": 0.01110569, + "auxiliary_loss_mlp": 0.00750544, + "balance_loss_clip": 1.03794813, + "balance_loss_mlp": 1.00034571, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.6043795131181502, + "language_loss": 0.80828452, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82689571, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.686159610748291 + }, + { + "auxiliary_loss_clip": 0.01115235, + "auxiliary_loss_mlp": 0.01045303, + "balance_loss_clip": 1.04118681, + "balance_loss_mlp": 1.02738583, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.935558351489562, + "language_loss": 0.77137762, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79298306, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 4.160914421081543 + }, + { + "auxiliary_loss_clip": 0.01111822, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.04303217, + "balance_loss_mlp": 1.02487087, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.6449238153035568, + "language_loss": 0.71394432, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73547375, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.6214098930358887 + }, + { + "auxiliary_loss_clip": 0.01121915, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.03830624, + "balance_loss_mlp": 1.01914346, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 1.7082586553911367, + "language_loss": 0.85312068, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87468386, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 2.5097475051879883 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.03975523, + "balance_loss_mlp": 1.01779711, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.785332521250232, + "language_loss": 0.7689839, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79035085, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.7353193759918213 + }, + { + "auxiliary_loss_clip": 0.01098921, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.04052997, + "balance_loss_mlp": 1.02092373, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.073036882879438, + "language_loss": 0.80612326, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82748425, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 2.508004665374756 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.01045381, + "balance_loss_clip": 1.04235768, + "balance_loss_mlp": 1.02974045, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.9698447047124419, + "language_loss": 0.73085797, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75248003, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 4.05115818977356 + }, + { + "auxiliary_loss_clip": 0.01088964, + "auxiliary_loss_mlp": 0.01044485, + "balance_loss_clip": 1.04200149, + "balance_loss_mlp": 1.0271759, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9104307246087953, + "language_loss": 0.76426822, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78560269, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 2.726008653640747 + }, + { + "auxiliary_loss_clip": 0.01125087, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.04208302, + "balance_loss_mlp": 1.02368307, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 1.75705207153631, + "language_loss": 0.74022126, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76187313, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 2.4802300930023193 + }, + { + "auxiliary_loss_clip": 0.01124579, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.03999496, + "balance_loss_mlp": 1.02694154, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.6833463537219981, + "language_loss": 0.80226988, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82394624, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 2.5689268112182617 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.0104711, + "balance_loss_clip": 1.03712177, + "balance_loss_mlp": 1.03150558, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.9012407143334513, + "language_loss": 0.7536574, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77513713, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.520063638687134 + }, + { + "auxiliary_loss_clip": 0.01085695, + "auxiliary_loss_mlp": 0.01040767, + "balance_loss_clip": 1.03734255, + "balance_loss_mlp": 1.02486455, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 1.7611883784004643, + "language_loss": 0.83010489, + "learning_rate": 3.628860908251712e-06, + "loss": 0.8513695, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 4.135207891464233 + }, + { + "auxiliary_loss_clip": 0.01057749, + "auxiliary_loss_mlp": 0.01046245, + "balance_loss_clip": 1.03422689, + "balance_loss_mlp": 1.02950811, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.6066420539994293, + "language_loss": 0.89186084, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91290081, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 2.668301820755005 + }, + { + "auxiliary_loss_clip": 0.01119803, + "auxiliary_loss_mlp": 0.01047567, + "balance_loss_clip": 1.04301167, + "balance_loss_mlp": 1.03078282, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 5.388413661129423, + "language_loss": 0.8672626, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88893628, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 4.0445029735565186 + }, + { + "auxiliary_loss_clip": 0.01083783, + "auxiliary_loss_mlp": 0.01041446, + "balance_loss_clip": 1.03833258, + "balance_loss_mlp": 1.02573466, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.693185220240875, + "language_loss": 0.8161267, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.83737904, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.7014896869659424 + }, + { + "auxiliary_loss_clip": 0.01116791, + "auxiliary_loss_mlp": 0.00750279, + "balance_loss_clip": 1.03823113, + "balance_loss_mlp": 1.00045395, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.158483631231983, + "language_loss": 0.79752374, + "learning_rate": 3.62795645623335e-06, + "loss": 0.81619442, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 2.507254123687744 + }, + { + "auxiliary_loss_clip": 0.01091665, + "auxiliary_loss_mlp": 0.01043069, + "balance_loss_clip": 1.03691638, + "balance_loss_mlp": 1.02539027, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.5868796236960694, + "language_loss": 0.77795976, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79930711, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.614053249359131 + }, + { + "auxiliary_loss_clip": 0.01095598, + "auxiliary_loss_mlp": 0.01042787, + "balance_loss_clip": 1.03414118, + "balance_loss_mlp": 1.02717102, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.1143415938439087, + "language_loss": 0.7294488, + "learning_rate": 3.627503859796234e-06, + "loss": 0.75083268, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 2.622973680496216 + }, + { + "auxiliary_loss_clip": 0.01051698, + "auxiliary_loss_mlp": 0.01041971, + "balance_loss_clip": 1.03356528, + "balance_loss_mlp": 1.02517402, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 2.1585601620146737, + "language_loss": 0.80407596, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82501268, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 2.7398645877838135 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.03725445, + "balance_loss_mlp": 1.02574587, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 2.065165375237946, + "language_loss": 0.86997449, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.8915416, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 2.54781436920166 + }, + { + "auxiliary_loss_clip": 0.01107647, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.03744674, + "balance_loss_mlp": 1.02158368, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 2.0058923984530663, + "language_loss": 0.77424204, + "learning_rate": 3.626824502298707e-06, + "loss": 0.7956959, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 2.554309844970703 + }, + { + "auxiliary_loss_clip": 0.01091064, + "auxiliary_loss_mlp": 0.01047777, + "balance_loss_clip": 1.03546417, + "balance_loss_mlp": 1.02995551, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 2.0038310613670784, + "language_loss": 0.84963602, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87102437, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 2.5927751064300537 + }, + { + "auxiliary_loss_clip": 0.01078795, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.03772187, + "balance_loss_mlp": 1.02087581, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 1.857539720757142, + "language_loss": 0.81287968, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.8340466, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.6829452514648438 + }, + { + "auxiliary_loss_clip": 0.01096242, + "auxiliary_loss_mlp": 0.01039324, + "balance_loss_clip": 1.03611135, + "balance_loss_mlp": 1.0233146, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 1.8680183490727527, + "language_loss": 0.69713187, + "learning_rate": 3.626144589597061e-06, + "loss": 0.71848756, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 2.566787004470825 + }, + { + "auxiliary_loss_clip": 0.01118192, + "auxiliary_loss_mlp": 0.00750678, + "balance_loss_clip": 1.04031277, + "balance_loss_mlp": 1.00052011, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.642374483648128, + "language_loss": 0.72516429, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74385297, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.5401463508605957 + }, + { + "auxiliary_loss_clip": 0.01113208, + "auxiliary_loss_mlp": 0.01038375, + "balance_loss_clip": 1.04139245, + "balance_loss_mlp": 1.02139997, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 1.8693350744085868, + "language_loss": 0.71043456, + "learning_rate": 3.625691006130477e-06, + "loss": 0.7319504, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.5481619834899902 + }, + { + "auxiliary_loss_clip": 0.01114515, + "auxiliary_loss_mlp": 0.0104161, + "balance_loss_clip": 1.03901505, + "balance_loss_mlp": 1.02558231, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 1.8843589886577192, + "language_loss": 0.87223983, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89380109, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 2.5751287937164307 + }, + { + "auxiliary_loss_clip": 0.01112238, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.04076838, + "balance_loss_mlp": 1.01667285, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 2.2746360025250705, + "language_loss": 0.85912895, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.88056207, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.5181431770324707 + }, + { + "auxiliary_loss_clip": 0.01076606, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.03139687, + "balance_loss_mlp": 1.01955795, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 2.347049782496368, + "language_loss": 0.69336998, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.714499, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.617342472076416 + }, + { + "auxiliary_loss_clip": 0.01085759, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.03695834, + "balance_loss_mlp": 1.02130413, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4755053918831909, + "language_loss": 0.71383512, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73505533, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.6194067001342773 + }, + { + "auxiliary_loss_clip": 0.01113602, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.03900576, + "balance_loss_mlp": 1.02122259, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.6236847781921189, + "language_loss": 0.87352681, + "learning_rate": 3.624555968803217e-06, + "loss": 0.89503437, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.5504536628723145 + }, + { + "auxiliary_loss_clip": 0.01090055, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.03519917, + "balance_loss_mlp": 1.02472067, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.5421781347113326, + "language_loss": 0.66109437, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68238938, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.7194008827209473 + }, + { + "auxiliary_loss_clip": 0.01110795, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.03931427, + "balance_loss_mlp": 1.01881921, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.7798237036391376, + "language_loss": 0.82068324, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84214985, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 2.6468758583068848 + }, + { + "auxiliary_loss_clip": 0.01099428, + "auxiliary_loss_mlp": 0.01039174, + "balance_loss_clip": 1.03900433, + "balance_loss_mlp": 1.02263975, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.488191194035635, + "language_loss": 0.79592347, + "learning_rate": 3.62387420709809e-06, + "loss": 0.8173095, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.602853298187256 + }, + { + "auxiliary_loss_clip": 0.01081899, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_clip": 1.04023552, + "balance_loss_mlp": 1.02334118, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 2.0645053019160264, + "language_loss": 0.71945953, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74068546, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 2.8831870555877686 + }, + { + "auxiliary_loss_clip": 0.01109046, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.03690493, + "balance_loss_mlp": 1.02161837, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.6875654063414314, + "language_loss": 0.79753375, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.81899202, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 2.641239643096924 + }, + { + "auxiliary_loss_clip": 0.0110084, + "auxiliary_loss_mlp": 0.01036213, + "balance_loss_clip": 1.03406, + "balance_loss_mlp": 1.02057314, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 1.953428952805531, + "language_loss": 0.77570868, + "learning_rate": 3.623191891195723e-06, + "loss": 0.79707921, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.5506136417388916 + }, + { + "auxiliary_loss_clip": 0.01114272, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.03974342, + "balance_loss_mlp": 1.01894724, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.8390051595162171, + "language_loss": 0.7455318, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.7670421, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.6014950275421143 + }, + { + "auxiliary_loss_clip": 0.01077775, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.03845561, + "balance_loss_mlp": 1.02419007, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.9046738692534977, + "language_loss": 0.64863241, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66980571, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 2.8510935306549072 + }, + { + "auxiliary_loss_clip": 0.0101573, + "auxiliary_loss_mlp": 0.01015527, + "balance_loss_clip": 1.02516389, + "balance_loss_mlp": 1.0137862, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.1747945323684652, + "language_loss": 0.65086615, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.6711787, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 3.050626277923584 + }, + { + "auxiliary_loss_clip": 0.01084188, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.03472936, + "balance_loss_mlp": 1.02276897, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 2.2856404742277476, + "language_loss": 0.80704463, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82827389, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 2.6321418285369873 + }, + { + "auxiliary_loss_clip": 0.01122208, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.03986073, + "balance_loss_mlp": 1.02230692, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 1.8851341235726047, + "language_loss": 0.78264815, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80424488, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.5670580863952637 + }, + { + "auxiliary_loss_clip": 0.01097118, + "auxiliary_loss_mlp": 0.01039335, + "balance_loss_clip": 1.03729582, + "balance_loss_mlp": 1.0233016, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 2.150816188557067, + "language_loss": 0.80083454, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82219905, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 4.235834360122681 + }, + { + "auxiliary_loss_clip": 0.01110153, + "auxiliary_loss_mlp": 0.007506, + "balance_loss_clip": 1.03888106, + "balance_loss_mlp": 1.00051117, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 2.2088233169851073, + "language_loss": 0.68520957, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70381707, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.6331493854522705 + }, + { + "auxiliary_loss_clip": 0.01076496, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_clip": 1.03343844, + "balance_loss_mlp": 1.02688885, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 1.971049918590065, + "language_loss": 0.90710545, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92830503, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.660236120223999 + }, + { + "auxiliary_loss_clip": 0.01095728, + "auxiliary_loss_mlp": 0.01051101, + "balance_loss_clip": 1.03994489, + "balance_loss_mlp": 1.03442407, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.132569112189012, + "language_loss": 0.88810802, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.9095763, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 2.518357515335083 + }, + { + "auxiliary_loss_clip": 0.01124149, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.04286504, + "balance_loss_mlp": 1.03270674, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 3.080146657383427, + "language_loss": 0.7486434, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77037501, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 2.52535080909729 + }, + { + "auxiliary_loss_clip": 0.01077299, + "auxiliary_loss_mlp": 0.01037642, + "balance_loss_clip": 1.04324269, + "balance_loss_mlp": 1.02190673, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 2.1882084770479544, + "language_loss": 0.62431353, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64546287, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 4.308011054992676 + }, + { + "auxiliary_loss_clip": 0.01091121, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.03864408, + "balance_loss_mlp": 1.02024031, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.7758425451864155, + "language_loss": 0.79061013, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81187749, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 2.6492884159088135 + }, + { + "auxiliary_loss_clip": 0.01089433, + "auxiliary_loss_mlp": 0.01046326, + "balance_loss_clip": 1.04541278, + "balance_loss_mlp": 1.03006589, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.7714503401295518, + "language_loss": 0.76421702, + "learning_rate": 3.620228790579645e-06, + "loss": 0.78557467, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.6067726612091064 + }, + { + "auxiliary_loss_clip": 0.01095186, + "auxiliary_loss_mlp": 0.01044878, + "balance_loss_clip": 1.0363642, + "balance_loss_mlp": 1.02935719, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.4832016535910872, + "language_loss": 0.78802663, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.80942732, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 4.066227912902832 + }, + { + "auxiliary_loss_clip": 0.0105077, + "auxiliary_loss_mlp": 0.01040742, + "balance_loss_clip": 1.03437507, + "balance_loss_mlp": 1.02392173, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 1.7385932375689976, + "language_loss": 0.68029118, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70120627, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.7219676971435547 + }, + { + "auxiliary_loss_clip": 0.01096269, + "auxiliary_loss_mlp": 0.01039618, + "balance_loss_clip": 1.03583562, + "balance_loss_mlp": 1.02205801, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.5584621542577002, + "language_loss": 0.80492675, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82628554, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 4.093183755874634 + }, + { + "auxiliary_loss_clip": 0.01106352, + "auxiliary_loss_mlp": 0.01052502, + "balance_loss_clip": 1.04242337, + "balance_loss_mlp": 1.03470421, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 1.915932078456617, + "language_loss": 0.86827201, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88986051, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 2.5669264793395996 + }, + { + "auxiliary_loss_clip": 0.01091989, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.03968799, + "balance_loss_mlp": 1.018188, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.9644277312697491, + "language_loss": 0.74227983, + "learning_rate": 3.619086370692945e-06, + "loss": 0.7635411, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 2.6104938983917236 + }, + { + "auxiliary_loss_clip": 0.01127498, + "auxiliary_loss_mlp": 0.01040795, + "balance_loss_clip": 1.04205477, + "balance_loss_mlp": 1.02465463, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.1031105894784887, + "language_loss": 0.79067004, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81235296, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 2.479945659637451 + }, + { + "auxiliary_loss_clip": 0.01086705, + "auxiliary_loss_mlp": 0.01033227, + "balance_loss_clip": 1.03920054, + "balance_loss_mlp": 1.01917219, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.6175835088758914, + "language_loss": 0.82355475, + "learning_rate": 3.618628972906178e-06, + "loss": 0.8447541, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.671201467514038 + }, + { + "auxiliary_loss_clip": 0.01127352, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.04280281, + "balance_loss_mlp": 1.02712107, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 1.8751921733711392, + "language_loss": 0.84948003, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.87118852, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 2.5496909618377686 + }, + { + "auxiliary_loss_clip": 0.01089312, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.03569508, + "balance_loss_mlp": 1.02067447, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 1.784344641129975, + "language_loss": 0.78870845, + "learning_rate": 3.618171329605121e-06, + "loss": 0.80996585, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 2.635376453399658 + }, + { + "auxiliary_loss_clip": 0.01067468, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.03854167, + "balance_loss_mlp": 1.02034473, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.6318283480723468, + "language_loss": 0.77148199, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79251587, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 2.6896793842315674 + }, + { + "auxiliary_loss_clip": 0.01122286, + "auxiliary_loss_mlp": 0.01040029, + "balance_loss_clip": 1.04151225, + "balance_loss_mlp": 1.02199268, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.7234353789420336, + "language_loss": 0.72191113, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74353427, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 2.5234627723693848 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.04037428, + "balance_loss_mlp": 1.02268624, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.1027574032666263, + "language_loss": 0.86730802, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.888991, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.543599843978882 + }, + { + "auxiliary_loss_clip": 0.01092132, + "auxiliary_loss_mlp": 0.01050521, + "balance_loss_clip": 1.03742814, + "balance_loss_mlp": 1.03140032, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.2141993250685847, + "language_loss": 0.80303776, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82446426, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.6428985595703125 + }, + { + "auxiliary_loss_clip": 0.01096229, + "auxiliary_loss_mlp": 0.01044592, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.02966726, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.5648611685246832, + "language_loss": 0.86628556, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88769376, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.01094226, + "auxiliary_loss_mlp": 0.00750384, + "balance_loss_clip": 1.03712952, + "balance_loss_mlp": 1.00051773, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.831742142120035, + "language_loss": 0.73100251, + "learning_rate": 3.616796927310559e-06, + "loss": 0.7494486, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.5930287837982178 + }, + { + "auxiliary_loss_clip": 0.01094171, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.03994584, + "balance_loss_mlp": 1.02027893, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 1.8059604772006237, + "language_loss": 0.75141311, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77271819, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 2.648198127746582 + }, + { + "auxiliary_loss_clip": 0.01124152, + "auxiliary_loss_mlp": 0.01048935, + "balance_loss_clip": 1.04133487, + "balance_loss_mlp": 1.03354502, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.6208121008869376, + "language_loss": 0.8792302, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90096104, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.578766345977783 + }, + { + "auxiliary_loss_clip": 0.01077897, + "auxiliary_loss_mlp": 0.01039337, + "balance_loss_clip": 1.03613997, + "balance_loss_mlp": 1.02261198, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.5483982281659878, + "language_loss": 0.8476063, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86877871, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.693925142288208 + }, + { + "auxiliary_loss_clip": 0.01098089, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_clip": 1.0382278, + "balance_loss_mlp": 1.02824914, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.5645930001225228, + "language_loss": 0.76440406, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.78582406, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.673417806625366 + }, + { + "auxiliary_loss_clip": 0.01105134, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.04104555, + "balance_loss_mlp": 1.02416635, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 1.7092968035511085, + "language_loss": 0.84762388, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86906111, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 2.602064371109009 + }, + { + "auxiliary_loss_clip": 0.01085636, + "auxiliary_loss_mlp": 0.01037675, + "balance_loss_clip": 1.04003227, + "balance_loss_mlp": 1.02191591, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 1.644902096871743, + "language_loss": 0.8600421, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88127524, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.6501264572143555 + }, + { + "auxiliary_loss_clip": 0.01125361, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.04079556, + "balance_loss_mlp": 1.02462578, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 1.7530922165615102, + "language_loss": 0.78805804, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.80972713, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.552950620651245 + }, + { + "auxiliary_loss_clip": 0.01089182, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.03680992, + "balance_loss_mlp": 1.02033806, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.558463920274425, + "language_loss": 0.76406038, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78530777, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.01081435, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_clip": 1.03568447, + "balance_loss_mlp": 1.027004, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.7208066753763158, + "language_loss": 0.74074674, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76201761, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.6464881896972656 + }, + { + "auxiliary_loss_clip": 0.01119663, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.0391469, + "balance_loss_mlp": 1.01792049, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.3392844395958443, + "language_loss": 0.75739151, + "learning_rate": 3.614501353019939e-06, + "loss": 0.77892208, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 2.541450023651123 + }, + { + "auxiliary_loss_clip": 0.01101673, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.04179764, + "balance_loss_mlp": 1.01767302, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.6786319892873898, + "language_loss": 0.87961268, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.90095353, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.562298536300659 + }, + { + "auxiliary_loss_clip": 0.01062288, + "auxiliary_loss_mlp": 0.01051027, + "balance_loss_clip": 1.03488588, + "balance_loss_mlp": 1.03418231, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 1.8417226684765473, + "language_loss": 0.81786335, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83899653, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.7337446212768555 + }, + { + "auxiliary_loss_clip": 0.01110381, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.03837407, + "balance_loss_mlp": 1.01792634, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 2.1748271585121906, + "language_loss": 0.62987667, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65131021, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 2.6633474826812744 + }, + { + "auxiliary_loss_clip": 0.01107113, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.03841972, + "balance_loss_mlp": 1.02078927, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 4.457570589987184, + "language_loss": 0.75948238, + "learning_rate": 3.613581408598489e-06, + "loss": 0.7809208, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 2.6546292304992676 + }, + { + "auxiliary_loss_clip": 0.01086988, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.03836989, + "balance_loss_mlp": 1.02024519, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.7790672632321516, + "language_loss": 0.80859423, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.82982737, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 2.5882420539855957 + }, + { + "auxiliary_loss_clip": 0.0110845, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.03648305, + "balance_loss_mlp": 1.0230577, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.0225960464176795, + "language_loss": 0.85835326, + "learning_rate": 3.613121069229862e-06, + "loss": 0.87982208, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.5728487968444824 + }, + { + "auxiliary_loss_clip": 0.01108088, + "auxiliary_loss_mlp": 0.00750433, + "balance_loss_clip": 1.03654909, + "balance_loss_mlp": 1.00053716, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.6772302217074746, + "language_loss": 0.76658374, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78516901, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 2.6280646324157715 + }, + { + "auxiliary_loss_clip": 0.01124458, + "auxiliary_loss_mlp": 0.01040711, + "balance_loss_clip": 1.0409925, + "balance_loss_mlp": 1.02479684, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.687290678961746, + "language_loss": 0.79934448, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.82099617, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.583411693572998 + }, + { + "auxiliary_loss_clip": 0.01097072, + "auxiliary_loss_mlp": 0.01036632, + "balance_loss_clip": 1.03801441, + "balance_loss_mlp": 1.021945, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.794081701793909, + "language_loss": 0.79307032, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81440729, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 3.9855172634124756 + }, + { + "auxiliary_loss_clip": 0.01065474, + "auxiliary_loss_mlp": 0.0104333, + "balance_loss_clip": 1.0374012, + "balance_loss_mlp": 1.02684379, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 1.7139905460972646, + "language_loss": 0.81675762, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83784568, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 2.7106246948242188 + }, + { + "auxiliary_loss_clip": 0.0108768, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.03795111, + "balance_loss_mlp": 1.02194941, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 2.16147157459892, + "language_loss": 0.83892739, + "learning_rate": 3.611969150491165e-06, + "loss": 0.8601855, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 2.5556440353393555 + }, + { + "auxiliary_loss_clip": 0.01118406, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.03811049, + "balance_loss_mlp": 1.01892638, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.6764309046461863, + "language_loss": 0.78189862, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80341524, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.547297716140747 + }, + { + "auxiliary_loss_clip": 0.01096251, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.03708124, + "balance_loss_mlp": 1.01741219, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.7427899002155978, + "language_loss": 0.78450251, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80579257, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 4.125721216201782 + }, + { + "auxiliary_loss_clip": 0.01097665, + "auxiliary_loss_mlp": 0.01036699, + "balance_loss_clip": 1.03901815, + "balance_loss_mlp": 1.02058792, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.7583776083363458, + "language_loss": 0.70448738, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72583103, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 2.605994701385498 + }, + { + "auxiliary_loss_clip": 0.01095076, + "auxiliary_loss_mlp": 0.01045368, + "balance_loss_clip": 1.03975546, + "balance_loss_mlp": 1.0297519, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.078431736626573, + "language_loss": 0.77504075, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79644525, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 2.650681972503662 + }, + { + "auxiliary_loss_clip": 0.01099603, + "auxiliary_loss_mlp": 0.01046095, + "balance_loss_clip": 1.04122746, + "balance_loss_mlp": 1.02932215, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 16.938820432214747, + "language_loss": 0.82479775, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84625471, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 4.17397403717041 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.01037873, + "balance_loss_clip": 1.03851414, + "balance_loss_mlp": 1.02170837, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.8453690148289201, + "language_loss": 0.73060536, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75207609, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 2.603358745574951 + }, + { + "auxiliary_loss_clip": 0.01102502, + "auxiliary_loss_mlp": 0.01039871, + "balance_loss_clip": 1.03919232, + "balance_loss_mlp": 1.02347994, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.1699072615495565, + "language_loss": 0.77243209, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79385591, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 4.14855170249939 + }, + { + "auxiliary_loss_clip": 0.01074919, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.03373265, + "balance_loss_mlp": 1.02085757, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.85012397311998, + "language_loss": 0.78876603, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80988759, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 3.0352816581726074 + }, + { + "auxiliary_loss_clip": 0.01012965, + "auxiliary_loss_mlp": 0.00999281, + "balance_loss_clip": 1.01527286, + "balance_loss_mlp": 0.99739784, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.948692057039302, + "language_loss": 0.60050648, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62062895, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 3.21108078956604 + }, + { + "auxiliary_loss_clip": 0.01088432, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.0375607, + "balance_loss_mlp": 1.02283573, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.653139537582533, + "language_loss": 0.77403378, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79530919, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.780787944793701 + }, + { + "auxiliary_loss_clip": 0.01096536, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.03805876, + "balance_loss_mlp": 1.01904678, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 1.7729641492126151, + "language_loss": 0.78882462, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.81015962, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 2.7082977294921875 + }, + { + "auxiliary_loss_clip": 0.01113276, + "auxiliary_loss_mlp": 0.01046338, + "balance_loss_clip": 1.04260731, + "balance_loss_mlp": 1.02895713, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.6791817736084635, + "language_loss": 0.91586614, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93746227, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 2.6267740726470947 + }, + { + "auxiliary_loss_clip": 0.01098695, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_clip": 1.036273, + "balance_loss_mlp": 1.02743363, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.8429031141151886, + "language_loss": 0.75258738, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77401638, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 2.7821874618530273 + }, + { + "auxiliary_loss_clip": 0.01109088, + "auxiliary_loss_mlp": 0.01038987, + "balance_loss_clip": 1.03814125, + "balance_loss_mlp": 1.02360952, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 1.933026019852925, + "language_loss": 0.89278209, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91426283, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": 2.6418261528015137 + }, + { + "auxiliary_loss_clip": 0.01092746, + "auxiliary_loss_mlp": 0.01037018, + "balance_loss_clip": 1.03741121, + "balance_loss_mlp": 1.02098441, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.7084793590684508, + "language_loss": 0.74580598, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76710355, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 2.733693838119507 + }, + { + "auxiliary_loss_clip": 0.01107929, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.03539097, + "balance_loss_mlp": 1.02159166, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4024823958268167, + "language_loss": 0.71805191, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73951125, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 2.6009154319763184 + }, + { + "auxiliary_loss_clip": 0.01112948, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.04258692, + "balance_loss_mlp": 1.03432417, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.6496634222560111, + "language_loss": 0.78456974, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.8062073, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 2.630810260772705 + }, + { + "auxiliary_loss_clip": 0.01090193, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.03283882, + "balance_loss_mlp": 1.02290368, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 2.2717548598423747, + "language_loss": 0.68625808, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70756406, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.6530070304870605 + }, + { + "auxiliary_loss_clip": 0.01121669, + "auxiliary_loss_mlp": 0.01039286, + "balance_loss_clip": 1.03816926, + "balance_loss_mlp": 1.02370548, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.556715747182349, + "language_loss": 0.80316436, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82477391, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 2.5560529232025146 + }, + { + "auxiliary_loss_clip": 0.01083244, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.03647447, + "balance_loss_mlp": 1.02700257, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.7253591523708798, + "language_loss": 0.7878477, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.80910277, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 2.639258623123169 + }, + { + "auxiliary_loss_clip": 0.00997437, + "auxiliary_loss_mlp": 0.01006646, + "balance_loss_clip": 1.01687479, + "balance_loss_mlp": 1.00472701, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.652016172124595, + "language_loss": 0.54354513, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56358588, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 3.395545482635498 + }, + { + "auxiliary_loss_clip": 0.01088805, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.03824294, + "balance_loss_mlp": 1.0187943, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 2.1757874786901277, + "language_loss": 0.70034766, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72158241, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.917316198348999 + }, + { + "auxiliary_loss_clip": 0.01094933, + "auxiliary_loss_mlp": 0.01039437, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.02335572, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.127883223345619, + "language_loss": 0.74286491, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76420861, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.6888766288757324 + }, + { + "auxiliary_loss_clip": 0.01119421, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.03678286, + "balance_loss_mlp": 1.02176452, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 1.9766739189169924, + "language_loss": 0.81876427, + "learning_rate": 3.606418687985928e-06, + "loss": 0.84032989, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.572228193283081 + }, + { + "auxiliary_loss_clip": 0.01101189, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.03688979, + "balance_loss_mlp": 1.01992965, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 1.8843893792421027, + "language_loss": 0.82412624, + "learning_rate": 3.606186656428641e-06, + "loss": 0.84549797, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.6361703872680664 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.04105806, + "balance_loss_mlp": 1.01934171, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 1.89401297978192, + "language_loss": 0.7228924, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74423254, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 2.613983154296875 + }, + { + "auxiliary_loss_clip": 0.01078584, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.03706837, + "balance_loss_mlp": 1.01724887, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.1742816697340364, + "language_loss": 0.638327, + "learning_rate": 3.605722410602591e-06, + "loss": 0.65944856, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 2.691032648086548 + }, + { + "auxiliary_loss_clip": 0.01098265, + "auxiliary_loss_mlp": 0.01042234, + "balance_loss_clip": 1.03704047, + "balance_loss_mlp": 1.02673697, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.6874869449919572, + "language_loss": 0.70578974, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72719479, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 2.5716092586517334 + }, + { + "auxiliary_loss_clip": 0.01110494, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.03955531, + "balance_loss_mlp": 1.02104247, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 3.059912083821366, + "language_loss": 0.89530951, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91679513, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.5289666652679443 + }, + { + "auxiliary_loss_clip": 0.01123122, + "auxiliary_loss_mlp": 0.01040476, + "balance_loss_clip": 1.0390929, + "balance_loss_mlp": 1.02367973, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.119897150899117, + "language_loss": 0.74318099, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.764817, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 2.4793410301208496 + }, + { + "auxiliary_loss_clip": 0.01096828, + "auxiliary_loss_mlp": 0.01040169, + "balance_loss_clip": 1.0355953, + "balance_loss_mlp": 1.02536273, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.4167696303841903, + "language_loss": 0.82411629, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84548616, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.642336845397949 + }, + { + "auxiliary_loss_clip": 0.01097998, + "auxiliary_loss_mlp": 0.01041095, + "balance_loss_clip": 1.03779483, + "balance_loss_mlp": 1.02456045, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 1.7833605360436124, + "language_loss": 0.75961077, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78100169, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 2.6309263706207275 + }, + { + "auxiliary_loss_clip": 0.01117428, + "auxiliary_loss_mlp": 0.01042516, + "balance_loss_clip": 1.03658724, + "balance_loss_mlp": 1.02629173, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.7358861754565802, + "language_loss": 0.70944071, + "learning_rate": 3.604328212066594e-06, + "loss": 0.73104018, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 2.5027902126312256 + }, + { + "auxiliary_loss_clip": 0.01010628, + "auxiliary_loss_mlp": 0.01005442, + "balance_loss_clip": 1.01215529, + "balance_loss_mlp": 1.00358284, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8918267600398788, + "language_loss": 0.61906874, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63922942, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.163512706756592 + }, + { + "auxiliary_loss_clip": 0.01102708, + "auxiliary_loss_mlp": 0.01039528, + "balance_loss_clip": 1.03879619, + "balance_loss_mlp": 1.02317262, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.7306487556467998, + "language_loss": 0.86853284, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88995522, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.5649166107177734 + }, + { + "auxiliary_loss_clip": 0.0109765, + "auxiliary_loss_mlp": 0.01035725, + "balance_loss_clip": 1.03763509, + "balance_loss_mlp": 1.02078843, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.2883484128051288, + "language_loss": 0.72878832, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.75012207, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 2.6437249183654785 + }, + { + "auxiliary_loss_clip": 0.01094936, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.03655231, + "balance_loss_mlp": 1.01672912, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.31285273897269, + "language_loss": 0.66835475, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.68962562, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 2.544213056564331 + }, + { + "auxiliary_loss_clip": 0.01081028, + "auxiliary_loss_mlp": 0.0104124, + "balance_loss_clip": 1.03243804, + "balance_loss_mlp": 1.0243485, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.807717738449369, + "language_loss": 0.7612524, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78247511, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 4.076700925827026 + }, + { + "auxiliary_loss_clip": 0.01061568, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.03244591, + "balance_loss_mlp": 1.02218997, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 3.541974728523072, + "language_loss": 0.91100293, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93201095, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 2.657268762588501 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01033383, + "balance_loss_clip": 1.03530598, + "balance_loss_mlp": 1.01792121, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.8207440822199972, + "language_loss": 0.82194626, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84339345, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.6152594089508057 + }, + { + "auxiliary_loss_clip": 0.01039405, + "auxiliary_loss_mlp": 0.01004391, + "balance_loss_clip": 1.00997329, + "balance_loss_mlp": 1.00269842, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1525320253983922, + "language_loss": 0.6566714, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67710936, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 4.289496421813965 + }, + { + "auxiliary_loss_clip": 0.01125646, + "auxiliary_loss_mlp": 0.01049522, + "balance_loss_clip": 1.03943205, + "balance_loss_mlp": 1.0328691, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 1.8872487487385536, + "language_loss": 0.76800072, + "learning_rate": 3.602232808409293e-06, + "loss": 0.78975236, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.537083148956299 + }, + { + "auxiliary_loss_clip": 0.01074245, + "auxiliary_loss_mlp": 0.01045938, + "balance_loss_clip": 1.03273118, + "balance_loss_mlp": 1.02787757, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 1.8797865291974303, + "language_loss": 0.80823451, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82943636, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.6256890296936035 + }, + { + "auxiliary_loss_clip": 0.01106088, + "auxiliary_loss_mlp": 0.01043165, + "balance_loss_clip": 1.03594172, + "balance_loss_mlp": 1.02742982, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.7427940522553385, + "language_loss": 0.77196956, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79346204, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 2.5339601039886475 + }, + { + "auxiliary_loss_clip": 0.01082614, + "auxiliary_loss_mlp": 0.00750544, + "balance_loss_clip": 1.0363512, + "balance_loss_mlp": 1.00048661, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.238903905327653, + "language_loss": 0.95663375, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.97496533, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 4.155503034591675 + }, + { + "auxiliary_loss_clip": 0.01108265, + "auxiliary_loss_mlp": 0.00750275, + "balance_loss_clip": 1.03647494, + "balance_loss_mlp": 1.00055385, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.6111598607723674, + "language_loss": 0.81008017, + "learning_rate": 3.601299937834666e-06, + "loss": 0.82866555, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 2.581814765930176 + }, + { + "auxiliary_loss_clip": 0.01085679, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.03559399, + "balance_loss_mlp": 1.02072263, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 1.9655675710617655, + "language_loss": 0.79268992, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.8139236, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 2.6200003623962402 + }, + { + "auxiliary_loss_clip": 0.01091446, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_clip": 1.03559351, + "balance_loss_mlp": 1.03298962, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.6773822515042334, + "language_loss": 0.75523603, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77666152, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 4.088385820388794 + }, + { + "auxiliary_loss_clip": 0.01093883, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.03619671, + "balance_loss_mlp": 1.02098286, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.7609011116099687, + "language_loss": 0.63719046, + "learning_rate": 3.600599647297484e-06, + "loss": 0.65848815, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 2.632035970687866 + }, + { + "auxiliary_loss_clip": 0.01100519, + "auxiliary_loss_mlp": 0.01034935, + "balance_loss_clip": 1.03969073, + "balance_loss_mlp": 1.0201354, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.7093141190047443, + "language_loss": 0.81544924, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83680379, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.617473602294922 + }, + { + "auxiliary_loss_clip": 0.01095307, + "auxiliary_loss_mlp": 0.01047918, + "balance_loss_clip": 1.03836048, + "balance_loss_mlp": 1.03181267, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7494749407506154, + "language_loss": 0.7904923, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81192452, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.6840481758117676 + }, + { + "auxiliary_loss_clip": 0.01082345, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.0323205, + "balance_loss_mlp": 1.02140236, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.690805774653886, + "language_loss": 0.84822959, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.86943144, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 2.634965181350708 + }, + { + "auxiliary_loss_clip": 0.01113078, + "auxiliary_loss_mlp": 0.01036725, + "balance_loss_clip": 1.03910851, + "balance_loss_mlp": 1.02148998, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 1.9576302354174566, + "language_loss": 0.76016492, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78166294, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.576291561126709 + }, + { + "auxiliary_loss_clip": 0.01102527, + "auxiliary_loss_mlp": 0.00750458, + "balance_loss_clip": 1.03801918, + "balance_loss_mlp": 1.00045538, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.372580033286236, + "language_loss": 0.78458464, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.80311453, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 2.605633020401001 + }, + { + "auxiliary_loss_clip": 0.01092202, + "auxiliary_loss_mlp": 0.01044323, + "balance_loss_clip": 1.03694558, + "balance_loss_mlp": 1.02712083, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.021447855352334, + "language_loss": 0.69604802, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71741331, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 2.733339786529541 + }, + { + "auxiliary_loss_clip": 0.01118302, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.04242051, + "balance_loss_mlp": 1.03053689, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 8.338168534020888, + "language_loss": 0.65222788, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67388022, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.5780649185180664 + }, + { + "auxiliary_loss_clip": 0.01071232, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.03624153, + "balance_loss_mlp": 1.03026485, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 2.052634833584158, + "language_loss": 0.74676365, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76795363, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 2.6518585681915283 + }, + { + "auxiliary_loss_clip": 0.01098808, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.03844666, + "balance_loss_mlp": 1.02188039, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.5101792556161975, + "language_loss": 0.81848335, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83984542, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.6410043239593506 + }, + { + "auxiliary_loss_clip": 0.011079, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.04217243, + "balance_loss_mlp": 1.01918244, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.2326946621139183, + "language_loss": 0.78344476, + "learning_rate": 3.598261401682441e-06, + "loss": 0.80486357, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 2.5818710327148438 + }, + { + "auxiliary_loss_clip": 0.01100354, + "auxiliary_loss_mlp": 0.00750303, + "balance_loss_clip": 1.03913164, + "balance_loss_mlp": 1.00042558, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 2.052289619815891, + "language_loss": 0.82919729, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84770387, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.593278646469116 + }, + { + "auxiliary_loss_clip": 0.01059089, + "auxiliary_loss_mlp": 0.01052931, + "balance_loss_clip": 1.03670073, + "balance_loss_mlp": 1.03358364, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 2.6706563655640987, + "language_loss": 0.82635367, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84747386, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.6954100131988525 + }, + { + "auxiliary_loss_clip": 0.0110373, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.03693628, + "balance_loss_mlp": 1.02381778, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 1.9129249409712494, + "language_loss": 0.70487559, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72630984, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.674989700317383 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01041374, + "balance_loss_clip": 1.03633678, + "balance_loss_mlp": 1.02516174, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.4185723552000757, + "language_loss": 0.66702688, + "learning_rate": 3.597324405965139e-06, + "loss": 0.68854284, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 2.615644693374634 + }, + { + "auxiliary_loss_clip": 0.01109981, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_clip": 1.03855383, + "balance_loss_mlp": 1.02661586, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.8180575514882864, + "language_loss": 0.83237916, + "learning_rate": 3.597090005586848e-06, + "loss": 0.8538965, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.6212100982666016 + }, + { + "auxiliary_loss_clip": 0.0110954, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.03905606, + "balance_loss_mlp": 1.01772237, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.1241008215402095, + "language_loss": 0.87116754, + "learning_rate": 3.596855544646742e-06, + "loss": 0.8925969, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 2.558903694152832 + }, + { + "auxiliary_loss_clip": 0.01096859, + "auxiliary_loss_mlp": 0.0103778, + "balance_loss_clip": 1.03824019, + "balance_loss_mlp": 1.02204406, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 1.7380495964944034, + "language_loss": 0.7480076, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.76935393, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 2.674952268600464 + }, + { + "auxiliary_loss_clip": 0.0111368, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.04083395, + "balance_loss_mlp": 1.02267671, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.7570276822664828, + "language_loss": 0.74614549, + "learning_rate": 3.596386441116659e-06, + "loss": 0.7676717, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.5870840549468994 + }, + { + "auxiliary_loss_clip": 0.01112197, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.0402019, + "balance_loss_mlp": 1.0214262, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.8860458972329817, + "language_loss": 0.80407155, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.82556605, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 2.7011704444885254 + }, + { + "auxiliary_loss_clip": 0.01103639, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.04003906, + "balance_loss_mlp": 1.02172494, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.0930940672310694, + "language_loss": 0.69249976, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71391726, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 2.6054065227508545 + }, + { + "auxiliary_loss_clip": 0.01073954, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.03927255, + "balance_loss_mlp": 1.01503146, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.8997943936012671, + "language_loss": 0.82946718, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85051513, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.687666893005371 + }, + { + "auxiliary_loss_clip": 0.01122477, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.03926539, + "balance_loss_mlp": 1.02207899, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.5891880336403217, + "language_loss": 0.66292536, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68453217, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 2.5655367374420166 + }, + { + "auxiliary_loss_clip": 0.01033429, + "auxiliary_loss_mlp": 0.01003467, + "balance_loss_clip": 1.0199008, + "balance_loss_mlp": 1.0016669, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8120560562423476, + "language_loss": 0.56804502, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58841395, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.198718309402466 + }, + { + "auxiliary_loss_clip": 0.01097215, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.03849053, + "balance_loss_mlp": 1.02036214, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.371615895113937, + "language_loss": 0.73582053, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75714719, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.613481044769287 + }, + { + "auxiliary_loss_clip": 0.01117572, + "auxiliary_loss_mlp": 0.01043665, + "balance_loss_clip": 1.04374909, + "balance_loss_mlp": 1.02719057, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.9325848732817816, + "language_loss": 0.87380654, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89541888, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 2.6247193813323975 + }, + { + "auxiliary_loss_clip": 0.01104085, + "auxiliary_loss_mlp": 0.01043987, + "balance_loss_clip": 1.04065752, + "balance_loss_mlp": 1.02722669, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.459697990659982, + "language_loss": 0.81585836, + "learning_rate": 3.594507606303083e-06, + "loss": 0.8373391, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 2.7018535137176514 + }, + { + "auxiliary_loss_clip": 0.01055281, + "auxiliary_loss_mlp": 0.01042726, + "balance_loss_clip": 1.03608167, + "balance_loss_mlp": 1.02603686, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.7546784085575216, + "language_loss": 0.86625093, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88723099, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 2.6189255714416504 + }, + { + "auxiliary_loss_clip": 0.01099246, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.04011095, + "balance_loss_mlp": 1.02670646, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 1.7651122578196903, + "language_loss": 0.70907009, + "learning_rate": 3.594037292782607e-06, + "loss": 0.73049927, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 2.6023945808410645 + }, + { + "auxiliary_loss_clip": 0.01058918, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.03566813, + "balance_loss_mlp": 1.0213896, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.7249911774947795, + "language_loss": 0.84338546, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86433852, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 4.25813364982605 + }, + { + "auxiliary_loss_clip": 0.01111036, + "auxiliary_loss_mlp": 0.01044998, + "balance_loss_clip": 1.04331684, + "balance_loss_mlp": 1.02926254, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.6436447609308396, + "language_loss": 0.67206609, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69362646, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 2.7463786602020264 + }, + { + "auxiliary_loss_clip": 0.01081929, + "auxiliary_loss_mlp": 0.01046483, + "balance_loss_clip": 1.03864527, + "balance_loss_mlp": 1.02970457, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 3.1711881159181536, + "language_loss": 0.75536603, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77665013, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.7518463134765625 + }, + { + "auxiliary_loss_clip": 0.01076183, + "auxiliary_loss_mlp": 0.01039413, + "balance_loss_clip": 1.03830338, + "balance_loss_mlp": 1.02323639, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 2.778158399194301, + "language_loss": 0.87570047, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89685643, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 4.143010854721069 + }, + { + "auxiliary_loss_clip": 0.01087285, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.0373044, + "balance_loss_mlp": 1.02383411, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 1.6287738292601224, + "language_loss": 0.74972004, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77099204, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 2.6678106784820557 + }, + { + "auxiliary_loss_clip": 0.01069865, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.03330255, + "balance_loss_mlp": 1.03591132, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.8377284944782544, + "language_loss": 0.86376774, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88501501, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 2.6751370429992676 + }, + { + "auxiliary_loss_clip": 0.01083856, + "auxiliary_loss_mlp": 0.01055787, + "balance_loss_clip": 1.03633034, + "balance_loss_mlp": 1.03754807, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.1793640928396347, + "language_loss": 0.82065189, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84204835, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 2.6418159008026123 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.01040871, + "balance_loss_clip": 1.0433054, + "balance_loss_mlp": 1.02480197, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.9563430757525186, + "language_loss": 0.79707634, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81862307, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 4.046863794326782 + }, + { + "auxiliary_loss_clip": 0.01018159, + "auxiliary_loss_mlp": 0.01020257, + "balance_loss_clip": 1.01760578, + "balance_loss_mlp": 1.01855266, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.8920715560625507, + "language_loss": 0.65433174, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67471594, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 3.1134374141693115 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.01044042, + "balance_loss_clip": 1.04082227, + "balance_loss_mlp": 1.02908683, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 1.9559562264382835, + "language_loss": 0.75623786, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77778196, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.555237293243408 + }, + { + "auxiliary_loss_clip": 0.01092528, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.03858435, + "balance_loss_mlp": 1.02134371, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 2.7579703221225236, + "language_loss": 0.68623239, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70753717, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 4.086850166320801 + }, + { + "auxiliary_loss_clip": 0.01128492, + "auxiliary_loss_mlp": 0.0103707, + "balance_loss_clip": 1.04412031, + "balance_loss_mlp": 1.01984406, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 1.9762134662401492, + "language_loss": 0.79375458, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81541014, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.535000801086426 + }, + { + "auxiliary_loss_clip": 0.0111506, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.04221964, + "balance_loss_mlp": 1.02423286, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 1.8070664491459518, + "language_loss": 0.82889867, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85043609, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.6080806255340576 + }, + { + "auxiliary_loss_clip": 0.01113952, + "auxiliary_loss_mlp": 0.01042141, + "balance_loss_clip": 1.03959107, + "balance_loss_mlp": 1.02576137, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.8350558963665469, + "language_loss": 0.66513687, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68669778, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 2.756148338317871 + }, + { + "auxiliary_loss_clip": 0.0110947, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_clip": 1.03939641, + "balance_loss_mlp": 1.02722502, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.6684329205626451, + "language_loss": 0.76705605, + "learning_rate": 3.590502239439987e-06, + "loss": 0.78858358, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.6256966590881348 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.0390079, + "balance_loss_mlp": 1.0215373, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 2.1708975200850213, + "language_loss": 0.78029376, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80176824, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 2.5806596279144287 + }, + { + "auxiliary_loss_clip": 0.01085908, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.01731479, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.121782927173339, + "language_loss": 0.7619468, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78311986, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.6435022354125977 + }, + { + "auxiliary_loss_clip": 0.01103152, + "auxiliary_loss_mlp": 0.01040244, + "balance_loss_clip": 1.03992796, + "balance_loss_mlp": 1.02437687, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 1.8413699421490912, + "language_loss": 0.69565034, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71708435, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.6272385120391846 + }, + { + "auxiliary_loss_clip": 0.01036387, + "auxiliary_loss_mlp": 0.01005142, + "balance_loss_clip": 1.02411628, + "balance_loss_mlp": 1.0030793, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.8022558124626393, + "language_loss": 0.6106559, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63107121, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 3.0432190895080566 + }, + { + "auxiliary_loss_clip": 0.01111057, + "auxiliary_loss_mlp": 0.01040169, + "balance_loss_clip": 1.03930402, + "balance_loss_mlp": 1.02394438, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 1.931095072309743, + "language_loss": 0.78304124, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80455351, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.5664920806884766 + }, + { + "auxiliary_loss_clip": 0.01110906, + "auxiliary_loss_mlp": 0.01038342, + "balance_loss_clip": 1.0390054, + "balance_loss_mlp": 1.02201056, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 2.4615039791340707, + "language_loss": 0.71026504, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73175752, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 2.727118492126465 + }, + { + "auxiliary_loss_clip": 0.01090038, + "auxiliary_loss_mlp": 0.00750355, + "balance_loss_clip": 1.03549314, + "balance_loss_mlp": 1.00046659, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 1.8299072654163142, + "language_loss": 0.76875865, + "learning_rate": 3.588847902019718e-06, + "loss": 0.7871626, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 2.649251937866211 + }, + { + "auxiliary_loss_clip": 0.01122122, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.0401144, + "balance_loss_mlp": 1.02265429, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.950151155423219, + "language_loss": 0.69372207, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71532309, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.5565598011016846 + }, + { + "auxiliary_loss_clip": 0.01078156, + "auxiliary_loss_mlp": 0.01045269, + "balance_loss_clip": 1.03639984, + "balance_loss_mlp": 1.02828145, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 3.069366430712927, + "language_loss": 0.66580915, + "learning_rate": 3.588374691807428e-06, + "loss": 0.68704331, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.5826849937438965 + }, + { + "auxiliary_loss_clip": 0.01115696, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.04114151, + "balance_loss_mlp": 1.01758194, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.815880970336074, + "language_loss": 0.80001485, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82150948, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.6939172744750977 + }, + { + "auxiliary_loss_clip": 0.01086425, + "auxiliary_loss_mlp": 0.01047605, + "balance_loss_clip": 1.03642869, + "balance_loss_mlp": 1.02841187, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 2.7712628127971652, + "language_loss": 0.65412092, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67546123, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 2.6570053100585938 + }, + { + "auxiliary_loss_clip": 0.01124488, + "auxiliary_loss_mlp": 0.01045724, + "balance_loss_clip": 1.03971481, + "balance_loss_mlp": 1.03033435, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.0837730727430137, + "language_loss": 0.71057856, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.73228061, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.6332497596740723 + }, + { + "auxiliary_loss_clip": 0.01079036, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.03959346, + "balance_loss_mlp": 1.01950002, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 2.0106759311201516, + "language_loss": 0.77562857, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79676032, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 2.782879590988159 + }, + { + "auxiliary_loss_clip": 0.01106675, + "auxiliary_loss_mlp": 0.00750538, + "balance_loss_clip": 1.0389874, + "balance_loss_mlp": 1.00040221, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.450912016101704, + "language_loss": 0.91301996, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93159211, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 2.575690507888794 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.01036329, + "balance_loss_clip": 1.04033113, + "balance_loss_mlp": 1.02130818, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.7705477991609477, + "language_loss": 0.76151216, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78261793, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.6911282539367676 + }, + { + "auxiliary_loss_clip": 0.01110705, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.03899503, + "balance_loss_mlp": 1.0173316, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.7123629767021202, + "language_loss": 0.84063268, + "learning_rate": 3.58671655924898e-06, + "loss": 0.8620671, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.6212546825408936 + }, + { + "auxiliary_loss_clip": 0.01065031, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.03457844, + "balance_loss_mlp": 1.02289915, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 1.9869657469456827, + "language_loss": 0.82699066, + "learning_rate": 3.586479442423508e-06, + "loss": 0.84803796, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.666893482208252 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.00750423, + "balance_loss_clip": 1.03849506, + "balance_loss_mlp": 1.00033951, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 2.0044420755561054, + "language_loss": 0.85790849, + "learning_rate": 3.586242265438576e-06, + "loss": 0.87643403, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 2.582317352294922 + }, + { + "auxiliary_loss_clip": 0.01087154, + "auxiliary_loss_mlp": 0.01040696, + "balance_loss_clip": 1.03879392, + "balance_loss_mlp": 1.02662325, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.3950332202871678, + "language_loss": 0.74779767, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.76907611, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.6297988891601562 + }, + { + "auxiliary_loss_clip": 0.01087314, + "auxiliary_loss_mlp": 0.01041802, + "balance_loss_clip": 1.04249263, + "balance_loss_mlp": 1.02747965, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.8148449378718245, + "language_loss": 0.74408555, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76537669, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 2.6766843795776367 + }, + { + "auxiliary_loss_clip": 0.01123992, + "auxiliary_loss_mlp": 0.0103562, + "balance_loss_clip": 1.0411104, + "balance_loss_mlp": 1.02017021, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 1.7092336607024206, + "language_loss": 0.705755, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72735113, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.636322259902954 + }, + { + "auxiliary_loss_clip": 0.01133791, + "auxiliary_loss_mlp": 0.01049989, + "balance_loss_clip": 1.04385591, + "balance_loss_mlp": 1.03217959, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 2.118732266626748, + "language_loss": 0.9439342, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.96577203, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 2.606306314468384 + }, + { + "auxiliary_loss_clip": 0.01106942, + "auxiliary_loss_mlp": 0.01040891, + "balance_loss_clip": 1.03949928, + "balance_loss_mlp": 1.02521539, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.469505330140019, + "language_loss": 0.73203713, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75351542, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 2.541581392288208 + }, + { + "auxiliary_loss_clip": 0.01106754, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.04124236, + "balance_loss_mlp": 1.02079368, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.8596250320426897, + "language_loss": 0.82367891, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84511602, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 2.5852153301239014 + }, + { + "auxiliary_loss_clip": 0.0109885, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.03701437, + "balance_loss_mlp": 1.02501583, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.7453545554417114, + "language_loss": 0.73082519, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75221741, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 2.6222612857818604 + }, + { + "auxiliary_loss_clip": 0.01110564, + "auxiliary_loss_mlp": 0.01047623, + "balance_loss_clip": 1.04114592, + "balance_loss_mlp": 1.03232217, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.7593418956108908, + "language_loss": 0.79257858, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81416035, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 4.1349427700042725 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.04298162, + "balance_loss_mlp": 1.02233529, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 1.7736553167893203, + "language_loss": 0.70722604, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72889805, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 2.5120861530303955 + }, + { + "auxiliary_loss_clip": 0.0111614, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_clip": 1.04381895, + "balance_loss_mlp": 1.03278875, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.152316228820492, + "language_loss": 0.69019139, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71186113, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 2.5677099227905273 + }, + { + "auxiliary_loss_clip": 0.01120287, + "auxiliary_loss_mlp": 0.01042396, + "balance_loss_clip": 1.04094088, + "balance_loss_mlp": 1.02518225, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.654051129066767, + "language_loss": 0.77906388, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80069065, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 4.222899436950684 + }, + { + "auxiliary_loss_clip": 0.01021902, + "auxiliary_loss_mlp": 0.01015842, + "balance_loss_clip": 1.01357603, + "balance_loss_mlp": 1.01397002, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8570481106208652, + "language_loss": 0.60545772, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62583512, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.0332140922546387 + }, + { + "auxiliary_loss_clip": 0.01101579, + "auxiliary_loss_mlp": 0.01040545, + "balance_loss_clip": 1.04000163, + "balance_loss_mlp": 1.02415359, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.29660330272354, + "language_loss": 0.80390304, + "learning_rate": 3.583153494218927e-06, + "loss": 0.8253243, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 2.6247196197509766 + }, + { + "auxiliary_loss_clip": 0.01124537, + "auxiliary_loss_mlp": 0.00750142, + "balance_loss_clip": 1.04338861, + "balance_loss_mlp": 1.00036204, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.727036993787822, + "language_loss": 0.61405861, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63280535, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.5841169357299805 + }, + { + "auxiliary_loss_clip": 0.01093205, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.04070807, + "balance_loss_mlp": 1.02563334, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.7295113590112778, + "language_loss": 0.70831668, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72967827, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 4.216597557067871 + }, + { + "auxiliary_loss_clip": 0.01115706, + "auxiliary_loss_mlp": 0.01044623, + "balance_loss_clip": 1.04191864, + "balance_loss_mlp": 1.02785039, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 11.243762921049004, + "language_loss": 0.81175077, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83335412, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.5915684700012207 + }, + { + "auxiliary_loss_clip": 0.01064811, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.03560519, + "balance_loss_mlp": 1.02047372, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.8500565614679958, + "language_loss": 0.75204062, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.77307492, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 2.8129663467407227 + }, + { + "auxiliary_loss_clip": 0.01070593, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_clip": 1.03732741, + "balance_loss_mlp": 1.02793694, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.43960266752066, + "language_loss": 0.8905319, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.91169012, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 4.171555280685425 + }, + { + "auxiliary_loss_clip": 0.01107026, + "auxiliary_loss_mlp": 0.01042898, + "balance_loss_clip": 1.0428623, + "balance_loss_mlp": 1.02665019, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.7308046157250256, + "language_loss": 0.71872842, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74022758, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.6136021614074707 + }, + { + "auxiliary_loss_clip": 0.01125061, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.04171383, + "balance_loss_mlp": 1.02339268, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.6599415884255047, + "language_loss": 0.68143231, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70307863, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.590576410293579 + }, + { + "auxiliary_loss_clip": 0.01087074, + "auxiliary_loss_mlp": 0.0105187, + "balance_loss_clip": 1.0389936, + "balance_loss_mlp": 1.03452504, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 1.9295567149185573, + "language_loss": 0.76823461, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78962409, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 2.7182352542877197 + }, + { + "auxiliary_loss_clip": 0.01021282, + "auxiliary_loss_mlp": 0.01001222, + "balance_loss_clip": 1.01240945, + "balance_loss_mlp": 0.99925536, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 2.13371462353639, + "language_loss": 0.59130096, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61152601, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.287127733230591 + }, + { + "auxiliary_loss_clip": 0.01096307, + "auxiliary_loss_mlp": 0.01036223, + "balance_loss_clip": 1.04195619, + "balance_loss_mlp": 1.0203563, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 1.6319992009224331, + "language_loss": 0.80303633, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82436168, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 2.7361514568328857 + }, + { + "auxiliary_loss_clip": 0.0111253, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.04306769, + "balance_loss_mlp": 1.01899099, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.3314000251410723, + "language_loss": 0.88152528, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90300584, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.5629823207855225 + }, + { + "auxiliary_loss_clip": 0.01129724, + "auxiliary_loss_mlp": 0.01037702, + "balance_loss_clip": 1.04482472, + "balance_loss_mlp": 1.02126288, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 1.956118504329654, + "language_loss": 0.72904193, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75071615, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.6398580074310303 + }, + { + "auxiliary_loss_clip": 0.01116612, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.04169512, + "balance_loss_mlp": 1.01939213, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8256008921014353, + "language_loss": 0.8423081, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86382711, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 2.647108793258667 + }, + { + "auxiliary_loss_clip": 0.01098288, + "auxiliary_loss_mlp": 0.01052766, + "balance_loss_clip": 1.04136825, + "balance_loss_mlp": 1.03420532, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 2.2491245818391308, + "language_loss": 0.87258625, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89409679, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 2.5892434120178223 + }, + { + "auxiliary_loss_clip": 0.01117906, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.04344356, + "balance_loss_mlp": 1.02061176, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 2.510141244485217, + "language_loss": 0.76951468, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79105842, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.534158229827881 + }, + { + "auxiliary_loss_clip": 0.01080396, + "auxiliary_loss_mlp": 0.0075036, + "balance_loss_clip": 1.03846502, + "balance_loss_mlp": 1.00037634, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.7617598892769262, + "language_loss": 0.7268132, + "learning_rate": 3.579338004009412e-06, + "loss": 0.74512076, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 2.8874361515045166 + }, + { + "auxiliary_loss_clip": 0.01121636, + "auxiliary_loss_mlp": 0.01038231, + "balance_loss_clip": 1.04119492, + "balance_loss_mlp": 1.02290058, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.9960939081339386, + "language_loss": 0.82928693, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.85088551, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.586729049682617 + }, + { + "auxiliary_loss_clip": 0.01070412, + "auxiliary_loss_mlp": 0.01043245, + "balance_loss_clip": 1.03448713, + "balance_loss_mlp": 1.02482724, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 1.8109413418004978, + "language_loss": 0.64912462, + "learning_rate": 3.578859988977082e-06, + "loss": 0.6702612, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 2.822795867919922 + }, + { + "auxiliary_loss_clip": 0.01081014, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.03923249, + "balance_loss_mlp": 1.01861477, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.197179652970162, + "language_loss": 0.79226238, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81341982, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 2.687807559967041 + }, + { + "auxiliary_loss_clip": 0.01111175, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.03930604, + "balance_loss_mlp": 1.02499509, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.401349985722607, + "language_loss": 0.81765121, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.8391636, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.645843982696533 + }, + { + "auxiliary_loss_clip": 0.01115256, + "auxiliary_loss_mlp": 0.0104462, + "balance_loss_clip": 1.04188311, + "balance_loss_mlp": 1.02853847, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.4703325816915163, + "language_loss": 0.80731308, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82891184, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 2.582324266433716 + }, + { + "auxiliary_loss_clip": 0.01100872, + "auxiliary_loss_mlp": 0.01038993, + "balance_loss_clip": 1.03776991, + "balance_loss_mlp": 1.02182651, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 2.0128845096655854, + "language_loss": 0.8322506, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85364926, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 2.6132442951202393 + }, + { + "auxiliary_loss_clip": 0.01115438, + "auxiliary_loss_mlp": 0.01048394, + "balance_loss_clip": 1.04425395, + "balance_loss_mlp": 1.03143024, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.625167470491007, + "language_loss": 0.79153442, + "learning_rate": 3.577663903820705e-06, + "loss": 0.8131727, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.5329790115356445 + }, + { + "auxiliary_loss_clip": 0.01084342, + "auxiliary_loss_mlp": 0.01046557, + "balance_loss_clip": 1.03792715, + "balance_loss_mlp": 1.03079772, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 1.872685483427136, + "language_loss": 0.74084437, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76215333, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.6788854598999023 + }, + { + "auxiliary_loss_clip": 0.01088696, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.0380888, + "balance_loss_mlp": 1.02587104, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 2.0576721219793526, + "language_loss": 0.7521444, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77345479, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.6983132362365723 + }, + { + "auxiliary_loss_clip": 0.01077427, + "auxiliary_loss_mlp": 0.01046247, + "balance_loss_clip": 1.04016197, + "balance_loss_mlp": 1.02954578, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 2.514030890252376, + "language_loss": 0.66862798, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.68986475, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 2.7009730339050293 + }, + { + "auxiliary_loss_clip": 0.01001456, + "auxiliary_loss_mlp": 0.01008617, + "balance_loss_clip": 1.01317286, + "balance_loss_mlp": 1.00614929, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.8078748296690694, + "language_loss": 0.58246744, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60256815, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 3.182525634765625 + }, + { + "auxiliary_loss_clip": 0.01105879, + "auxiliary_loss_mlp": 0.01039346, + "balance_loss_clip": 1.04280066, + "balance_loss_mlp": 1.02294278, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 2.3771956036396737, + "language_loss": 0.79964811, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82110035, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.5977389812469482 + }, + { + "auxiliary_loss_clip": 0.01065493, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.03331518, + "balance_loss_mlp": 1.01939678, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 1.895532309501659, + "language_loss": 0.81812835, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.83914685, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.7197353839874268 + }, + { + "auxiliary_loss_clip": 0.01124524, + "auxiliary_loss_mlp": 0.01043299, + "balance_loss_clip": 1.0421834, + "balance_loss_mlp": 1.02721727, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9540687916977053, + "language_loss": 0.71270752, + "learning_rate": 3.57598687219895e-06, + "loss": 0.73438579, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 2.572594404220581 + }, + { + "auxiliary_loss_clip": 0.01123367, + "auxiliary_loss_mlp": 0.01033787, + "balance_loss_clip": 1.04196429, + "balance_loss_mlp": 1.01836181, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 2.0649051382082675, + "language_loss": 0.71318209, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73475367, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 2.571054220199585 + }, + { + "auxiliary_loss_clip": 0.01119088, + "auxiliary_loss_mlp": 0.01039152, + "balance_loss_clip": 1.03950608, + "balance_loss_mlp": 1.0210917, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 1.9036837218981235, + "language_loss": 0.73043507, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75201744, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 2.6461341381073 + }, + { + "auxiliary_loss_clip": 0.0111242, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.03921425, + "balance_loss_mlp": 1.0289197, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.8355750742087646, + "language_loss": 0.72870511, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75028044, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 2.5364582538604736 + }, + { + "auxiliary_loss_clip": 0.0104051, + "auxiliary_loss_mlp": 0.01003289, + "balance_loss_clip": 1.03130078, + "balance_loss_mlp": 1.00150132, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0291476904569812, + "language_loss": 0.73402739, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75446546, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 2.9352643489837646 + }, + { + "auxiliary_loss_clip": 0.011182, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.0426712, + "balance_loss_mlp": 1.02017105, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.732741570202818, + "language_loss": 0.87989497, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.901443, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 4.11045503616333 + }, + { + "auxiliary_loss_clip": 0.01113605, + "auxiliary_loss_mlp": 0.01038406, + "balance_loss_clip": 1.04076767, + "balance_loss_mlp": 1.02290869, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 3.1658465414075416, + "language_loss": 0.75952166, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78104174, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 2.612741231918335 + }, + { + "auxiliary_loss_clip": 0.01109626, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.03981316, + "balance_loss_mlp": 1.02798152, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 2.7015899223184445, + "language_loss": 0.81714028, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83866024, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 2.585737943649292 + }, + { + "auxiliary_loss_clip": 0.01100665, + "auxiliary_loss_mlp": 0.01048705, + "balance_loss_clip": 1.03972423, + "balance_loss_mlp": 1.03249884, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 2.5335763648288534, + "language_loss": 0.71600425, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73749799, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 4.164759159088135 + }, + { + "auxiliary_loss_clip": 0.01119764, + "auxiliary_loss_mlp": 0.00750657, + "balance_loss_clip": 1.04207873, + "balance_loss_mlp": 1.00037408, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7351228816297701, + "language_loss": 0.76122737, + "learning_rate": 3.57382638628884e-06, + "loss": 0.77993155, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.6298084259033203 + }, + { + "auxiliary_loss_clip": 0.0106545, + "auxiliary_loss_mlp": 0.01037835, + "balance_loss_clip": 1.04003167, + "balance_loss_mlp": 1.02023947, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 9.344832492452975, + "language_loss": 0.89651358, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.91754645, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.8322699069976807 + }, + { + "auxiliary_loss_clip": 0.01023503, + "auxiliary_loss_mlp": 0.01001856, + "balance_loss_clip": 1.01563334, + "balance_loss_mlp": 0.99990129, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8140704909680357, + "language_loss": 0.59443176, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6146853, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 4.553088665008545 + }, + { + "auxiliary_loss_clip": 0.01005495, + "auxiliary_loss_mlp": 0.01006865, + "balance_loss_clip": 1.0211134, + "balance_loss_mlp": 1.005041, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7570338276048169, + "language_loss": 0.49415854, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51428211, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.1943130493164062 + }, + { + "auxiliary_loss_clip": 0.01091703, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.03965068, + "balance_loss_mlp": 1.03734708, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 1.9134483068078014, + "language_loss": 0.76875627, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.79021513, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.716508626937866 + }, + { + "auxiliary_loss_clip": 0.01073847, + "auxiliary_loss_mlp": 0.0104829, + "balance_loss_clip": 1.03737938, + "balance_loss_mlp": 1.03166044, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 1.8037574704059385, + "language_loss": 0.69343525, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71465659, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.6613709926605225 + }, + { + "auxiliary_loss_clip": 0.01088166, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.04201782, + "balance_loss_mlp": 1.02051866, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.723658101331602, + "language_loss": 0.70636988, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72762132, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 4.2590014934539795 + }, + { + "auxiliary_loss_clip": 0.01102047, + "auxiliary_loss_mlp": 0.01045973, + "balance_loss_clip": 1.04061174, + "balance_loss_mlp": 1.03054714, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.6706536842662887, + "language_loss": 0.76977348, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79125369, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.683279037475586 + }, + { + "auxiliary_loss_clip": 0.01091528, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.03997135, + "balance_loss_mlp": 1.02407074, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.1925886888401367, + "language_loss": 0.74730819, + "learning_rate": 3.571901895946612e-06, + "loss": 0.76862979, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 2.6042561531066895 + }, + { + "auxiliary_loss_clip": 0.0109744, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.04087234, + "balance_loss_mlp": 1.02044976, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.34948033343224, + "language_loss": 0.80392337, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82525337, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.6455490589141846 + }, + { + "auxiliary_loss_clip": 0.01066171, + "auxiliary_loss_mlp": 0.01044999, + "balance_loss_clip": 1.03779376, + "balance_loss_mlp": 1.02788055, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.8424680712718293, + "language_loss": 0.74428618, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76539785, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 2.6495516300201416 + }, + { + "auxiliary_loss_clip": 0.01130041, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.04610956, + "balance_loss_mlp": 1.02436256, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 1.8842457950573746, + "language_loss": 0.83118665, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.8528803, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.549055576324463 + }, + { + "auxiliary_loss_clip": 0.01103267, + "auxiliary_loss_mlp": 0.01044122, + "balance_loss_clip": 1.04027855, + "balance_loss_mlp": 1.0273968, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.9105951509236603, + "language_loss": 0.59233427, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61380816, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.6817362308502197 + }, + { + "auxiliary_loss_clip": 0.01105729, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.04258001, + "balance_loss_mlp": 1.02405, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.7621005299071506, + "language_loss": 0.71763295, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73908132, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.6430630683898926 + }, + { + "auxiliary_loss_clip": 0.01097047, + "auxiliary_loss_mlp": 0.01040441, + "balance_loss_clip": 1.03823984, + "balance_loss_mlp": 1.02539659, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 2.5906434751536547, + "language_loss": 0.74975979, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77113461, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 2.601858615875244 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.0104378, + "balance_loss_clip": 1.03926635, + "balance_loss_mlp": 1.02685261, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.7806846111791446, + "language_loss": 0.8191359, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.84056216, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.5586438179016113 + }, + { + "auxiliary_loss_clip": 0.01134889, + "auxiliary_loss_mlp": 0.01042044, + "balance_loss_clip": 1.04642463, + "balance_loss_mlp": 1.02417493, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 1.7982348162544006, + "language_loss": 0.71718872, + "learning_rate": 3.569973590777789e-06, + "loss": 0.73895806, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 2.5325305461883545 + }, + { + "auxiliary_loss_clip": 0.01124606, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.0414027, + "balance_loss_mlp": 1.02336645, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 2.491172347367528, + "language_loss": 0.7404741, + "learning_rate": 3.569732284634665e-06, + "loss": 0.76212019, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 2.6808435916900635 + }, + { + "auxiliary_loss_clip": 0.01115914, + "auxiliary_loss_mlp": 0.01039605, + "balance_loss_clip": 1.04338181, + "balance_loss_mlp": 1.02242708, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 1.997809316723934, + "language_loss": 0.80534416, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82689935, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.5829813480377197 + }, + { + "auxiliary_loss_clip": 0.01085797, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.04025936, + "balance_loss_mlp": 1.02322006, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.6110784335954824, + "language_loss": 0.85726649, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87849647, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 2.6958744525909424 + }, + { + "auxiliary_loss_clip": 0.01075408, + "auxiliary_loss_mlp": 0.01040791, + "balance_loss_clip": 1.03878152, + "balance_loss_mlp": 1.0223372, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.2391327458908195, + "language_loss": 0.82315612, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84431815, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.630887031555176 + }, + { + "auxiliary_loss_clip": 0.01127572, + "auxiliary_loss_mlp": 0.01036947, + "balance_loss_clip": 1.04331422, + "balance_loss_mlp": 1.02100873, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.9882021172819173, + "language_loss": 0.78553951, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80718476, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 2.5432586669921875 + }, + { + "auxiliary_loss_clip": 0.0111208, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.04177618, + "balance_loss_mlp": 1.01901484, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.6023651801807102, + "language_loss": 0.79318511, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81464809, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 2.5898659229278564 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01036052, + "balance_loss_clip": 1.03969848, + "balance_loss_mlp": 1.02016187, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.4469278440453852, + "language_loss": 0.78876382, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81016147, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.56416392326355 + }, + { + "auxiliary_loss_clip": 0.01110651, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.04235435, + "balance_loss_mlp": 1.02004755, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 1.8745984468624308, + "language_loss": 0.85587835, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87732697, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.5740206241607666 + }, + { + "auxiliary_loss_clip": 0.0111991, + "auxiliary_loss_mlp": 0.01044336, + "balance_loss_clip": 1.04019833, + "balance_loss_mlp": 1.0291605, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.636524298985105, + "language_loss": 0.94167143, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96331394, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 2.5104820728302 + }, + { + "auxiliary_loss_clip": 0.01124914, + "auxiliary_loss_mlp": 0.01041157, + "balance_loss_clip": 1.03991914, + "balance_loss_mlp": 1.02452731, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6078302192398455, + "language_loss": 0.82131207, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84297276, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 2.5453104972839355 + }, + { + "auxiliary_loss_clip": 0.01103564, + "auxiliary_loss_mlp": 0.00750538, + "balance_loss_clip": 1.04001427, + "balance_loss_mlp": 1.00038755, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.5996998710667794, + "language_loss": 0.89302927, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91157031, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.5763158798217773 + }, + { + "auxiliary_loss_clip": 0.01120455, + "auxiliary_loss_mlp": 0.01037691, + "balance_loss_clip": 1.03667045, + "balance_loss_mlp": 1.02085876, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 3.853027546350548, + "language_loss": 0.84556109, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86714256, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.5020434856414795 + }, + { + "auxiliary_loss_clip": 0.01084134, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_clip": 1.03874862, + "balance_loss_mlp": 1.02576065, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 3.1536739996031153, + "language_loss": 0.81043792, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83170551, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 2.6581714153289795 + }, + { + "auxiliary_loss_clip": 0.01089577, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_clip": 1.0379616, + "balance_loss_mlp": 1.02419126, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.2744048145611178, + "language_loss": 0.67163444, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69295353, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 2.6002979278564453 + }, + { + "auxiliary_loss_clip": 0.01092898, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.03776145, + "balance_loss_mlp": 1.01925468, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 2.0047568147090264, + "language_loss": 0.75781667, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77910721, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 2.616302967071533 + }, + { + "auxiliary_loss_clip": 0.01100788, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.0376451, + "balance_loss_mlp": 1.02630675, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.4983765192316105, + "language_loss": 0.63447726, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65590692, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 2.5854735374450684 + }, + { + "auxiliary_loss_clip": 0.01104669, + "auxiliary_loss_mlp": 0.01038389, + "balance_loss_clip": 1.03622103, + "balance_loss_mlp": 1.02178276, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.1358221548506426, + "language_loss": 0.77091128, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79234189, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 2.558912754058838 + }, + { + "auxiliary_loss_clip": 0.01122603, + "auxiliary_loss_mlp": 0.01038399, + "balance_loss_clip": 1.04632652, + "balance_loss_mlp": 1.02136409, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.5907674127291451, + "language_loss": 0.80693913, + "learning_rate": 3.565620980442944e-06, + "loss": 0.82854915, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 2.6621766090393066 + }, + { + "auxiliary_loss_clip": 0.01105803, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.04143608, + "balance_loss_mlp": 1.02663147, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 1.7837490517569239, + "language_loss": 0.80353439, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82501942, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 2.61460018157959 + }, + { + "auxiliary_loss_clip": 0.0109992, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.03963506, + "balance_loss_mlp": 1.02155328, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.735839321766097, + "language_loss": 0.73131752, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75269979, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 4.0355212688446045 + }, + { + "auxiliary_loss_clip": 0.01120958, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.04046643, + "balance_loss_mlp": 1.01953316, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 1.957484793662746, + "language_loss": 0.72885048, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75040054, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 2.5316219329833984 + }, + { + "auxiliary_loss_clip": 0.01102179, + "auxiliary_loss_mlp": 0.010402, + "balance_loss_clip": 1.04286849, + "balance_loss_mlp": 1.0235827, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.7713505844878312, + "language_loss": 0.73643827, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75786209, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 4.009505748748779 + }, + { + "auxiliary_loss_clip": 0.01071906, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.03414822, + "balance_loss_mlp": 1.02393746, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.9233984224298497, + "language_loss": 0.7091006, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73020917, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.6600940227508545 + }, + { + "auxiliary_loss_clip": 0.01126427, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.04186881, + "balance_loss_mlp": 1.02651262, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.8021042750026561, + "language_loss": 0.81212258, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83382565, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.5657846927642822 + }, + { + "auxiliary_loss_clip": 0.01103778, + "auxiliary_loss_mlp": 0.01039174, + "balance_loss_clip": 1.04088736, + "balance_loss_mlp": 1.02219903, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.4682893940954855, + "language_loss": 0.66095287, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.6823824, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.570241928100586 + }, + { + "auxiliary_loss_clip": 0.01124875, + "auxiliary_loss_mlp": 0.01049371, + "balance_loss_clip": 1.04133058, + "balance_loss_mlp": 1.03357613, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.5116454938127992, + "language_loss": 0.84064555, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86238807, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 4.084832429885864 + }, + { + "auxiliary_loss_clip": 0.01078272, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.03882742, + "balance_loss_mlp": 1.02463806, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.0263096329673576, + "language_loss": 0.84664208, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.86782479, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.6677045822143555 + }, + { + "auxiliary_loss_clip": 0.01064807, + "auxiliary_loss_mlp": 0.01043292, + "balance_loss_clip": 1.03724337, + "balance_loss_mlp": 1.02899849, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.088817999573284, + "language_loss": 0.7010783, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72215933, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.7240865230560303 + }, + { + "auxiliary_loss_clip": 0.01076458, + "auxiliary_loss_mlp": 0.01042373, + "balance_loss_clip": 1.03700209, + "balance_loss_mlp": 1.02390778, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 2.361066478020214, + "language_loss": 0.66415626, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68534458, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.628317356109619 + }, + { + "auxiliary_loss_clip": 0.01079691, + "auxiliary_loss_mlp": 0.01041098, + "balance_loss_clip": 1.03851366, + "balance_loss_mlp": 1.02641129, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.6438647025518203, + "language_loss": 0.72245002, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74365795, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 4.180355548858643 + }, + { + "auxiliary_loss_clip": 0.01053522, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.04739022, + "balance_loss_mlp": 1.02040291, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.5882247292737748, + "language_loss": 0.74134135, + "learning_rate": 3.562465462704307e-06, + "loss": 0.76224804, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 2.859961748123169 + }, + { + "auxiliary_loss_clip": 0.0112664, + "auxiliary_loss_mlp": 0.0104287, + "balance_loss_clip": 1.04069531, + "balance_loss_mlp": 1.02601409, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 1.8159376970174388, + "language_loss": 0.65695167, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.67864674, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 2.6207005977630615 + }, + { + "auxiliary_loss_clip": 0.01095551, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.03682911, + "balance_loss_mlp": 1.02332914, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.8407304679181307, + "language_loss": 0.74642861, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76777601, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 2.630251169204712 + }, + { + "auxiliary_loss_clip": 0.01096462, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.04300153, + "balance_loss_mlp": 1.02356803, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.0618152559051217, + "language_loss": 0.77287102, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79423726, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 2.6279730796813965 + }, + { + "auxiliary_loss_clip": 0.01082477, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.03883886, + "balance_loss_mlp": 1.02522564, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 1.9466188305131438, + "language_loss": 0.71293998, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73416948, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.6691365242004395 + }, + { + "auxiliary_loss_clip": 0.01089015, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.03861988, + "balance_loss_mlp": 1.02115238, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.7422920666468993, + "language_loss": 0.78350121, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80475295, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.6298112869262695 + }, + { + "auxiliary_loss_clip": 0.01099809, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.04049802, + "balance_loss_mlp": 1.02325296, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 1.836172955730194, + "language_loss": 0.6880551, + "learning_rate": 3.561005691492797e-06, + "loss": 0.70943856, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 2.6117756366729736 + }, + { + "auxiliary_loss_clip": 0.01093441, + "auxiliary_loss_mlp": 0.01047622, + "balance_loss_clip": 1.04063404, + "balance_loss_mlp": 1.03198123, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 1.8675214983996595, + "language_loss": 0.67545313, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.69686377, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 2.5869154930114746 + }, + { + "auxiliary_loss_clip": 0.0107679, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_clip": 1.04115272, + "balance_loss_mlp": 1.02755761, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 2.0146278188901148, + "language_loss": 0.77369249, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.79489136, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 2.7199690341949463 + }, + { + "auxiliary_loss_clip": 0.01097232, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.03973079, + "balance_loss_mlp": 1.01949883, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.1678899880465816, + "language_loss": 0.77031797, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.79163635, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.6366782188415527 + }, + { + "auxiliary_loss_clip": 0.01085673, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.03657579, + "balance_loss_mlp": 1.02788556, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 1.8744977562340739, + "language_loss": 0.8496145, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87091374, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.64095139503479 + }, + { + "auxiliary_loss_clip": 0.01036291, + "auxiliary_loss_mlp": 0.01001733, + "balance_loss_clip": 1.01794076, + "balance_loss_mlp": 0.99999231, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 4.230778702357852, + "language_loss": 0.62821984, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.6486001, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 3.177868127822876 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.04084146, + "balance_loss_mlp": 1.01973414, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 2.8042137347493505, + "language_loss": 0.81655312, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.83789712, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 2.5738883018493652 + }, + { + "auxiliary_loss_clip": 0.01087723, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.03680849, + "balance_loss_mlp": 1.02646506, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.607733200880675, + "language_loss": 0.79203773, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81333816, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.617008924484253 + }, + { + "auxiliary_loss_clip": 0.01105656, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.0394901, + "balance_loss_mlp": 1.0242033, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 2.103135748029752, + "language_loss": 0.84680003, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86826581, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.4947404861450195 + }, + { + "auxiliary_loss_clip": 0.01098922, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.03728056, + "balance_loss_mlp": 1.02291584, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.202046272329623, + "language_loss": 0.84299445, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.86436886, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 2.5811004638671875 + }, + { + "auxiliary_loss_clip": 0.01053163, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.0365932, + "balance_loss_mlp": 1.0197165, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.7695801528701747, + "language_loss": 0.74407816, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76494962, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 2.718573570251465 + }, + { + "auxiliary_loss_clip": 0.01125174, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.04265583, + "balance_loss_mlp": 1.0222075, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6057694401797868, + "language_loss": 0.71735585, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.73898733, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 2.56447172164917 + }, + { + "auxiliary_loss_clip": 0.01096361, + "auxiliary_loss_mlp": 0.0104159, + "balance_loss_clip": 1.0384022, + "balance_loss_mlp": 1.0259614, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.2592697675448674, + "language_loss": 0.78365493, + "learning_rate": 3.558079758168997e-06, + "loss": 0.8050344, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.6303551197052 + }, + { + "auxiliary_loss_clip": 0.01097647, + "auxiliary_loss_mlp": 0.01046101, + "balance_loss_clip": 1.037467, + "balance_loss_mlp": 1.03010345, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 2.2011721617002724, + "language_loss": 0.8165803, + "learning_rate": 3.557835546134977e-06, + "loss": 0.83801776, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.647202491760254 + }, + { + "auxiliary_loss_clip": 0.0107159, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.03838611, + "balance_loss_mlp": 1.01946962, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.821413830417799, + "language_loss": 0.83914208, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86021185, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.6214189529418945 + }, + { + "auxiliary_loss_clip": 0.01100662, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.04094589, + "balance_loss_mlp": 1.02186847, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 1.9635431906529692, + "language_loss": 0.76888198, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79026854, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 2.704566240310669 + }, + { + "auxiliary_loss_clip": 0.01087202, + "auxiliary_loss_mlp": 0.01039054, + "balance_loss_clip": 1.04014671, + "balance_loss_mlp": 1.02399158, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 1.9322859085006978, + "language_loss": 0.78574353, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.807006, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 2.610915184020996 + }, + { + "auxiliary_loss_clip": 0.01111434, + "auxiliary_loss_mlp": 0.00750376, + "balance_loss_clip": 1.04113448, + "balance_loss_mlp": 1.00047672, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.673297608166178, + "language_loss": 0.72957599, + "learning_rate": 3.556858107358737e-06, + "loss": 0.74819404, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.6200475692749023 + }, + { + "auxiliary_loss_clip": 0.01078252, + "auxiliary_loss_mlp": 0.01051243, + "balance_loss_clip": 1.03861713, + "balance_loss_mlp": 1.03386199, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.424957112795332, + "language_loss": 0.78646648, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.80776143, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 2.6482419967651367 + }, + { + "auxiliary_loss_clip": 0.01074298, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.03674555, + "balance_loss_mlp": 1.02686524, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 1.8244089891174144, + "language_loss": 0.73465705, + "learning_rate": 3.556369033716254e-06, + "loss": 0.75582755, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.7322049140930176 + }, + { + "auxiliary_loss_clip": 0.01116767, + "auxiliary_loss_mlp": 0.01048193, + "balance_loss_clip": 1.04054904, + "balance_loss_mlp": 1.0325762, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 1.853225962996581, + "language_loss": 0.88114291, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90279245, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 2.62689208984375 + }, + { + "auxiliary_loss_clip": 0.01106893, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.04022622, + "balance_loss_mlp": 1.02174866, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 16.51650876744448, + "language_loss": 0.82933044, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85075194, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 2.5320563316345215 + }, + { + "auxiliary_loss_clip": 0.01109311, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.03826404, + "balance_loss_mlp": 1.01949108, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.6506599033233489, + "language_loss": 0.85287404, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87432313, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.536830186843872 + }, + { + "auxiliary_loss_clip": 0.01121431, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.03915346, + "balance_loss_mlp": 1.02311742, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.1113355616360954, + "language_loss": 0.84304214, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86463779, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 3.9610633850097656 + }, + { + "auxiliary_loss_clip": 0.01105685, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.03768313, + "balance_loss_mlp": 1.02158117, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.6060645330241758, + "language_loss": 0.7558524, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77727532, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.5765697956085205 + }, + { + "auxiliary_loss_clip": 0.01021372, + "auxiliary_loss_mlp": 0.01006034, + "balance_loss_clip": 1.01776671, + "balance_loss_mlp": 1.00437737, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.9010864547867934, + "language_loss": 0.63821971, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65849376, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 4.479059457778931 + }, + { + "auxiliary_loss_clip": 0.01033338, + "auxiliary_loss_mlp": 0.01000661, + "balance_loss_clip": 1.0153259, + "balance_loss_mlp": 0.99908745, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.761260757909747, + "language_loss": 0.63024431, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65058428, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.1963753700256348 + }, + { + "auxiliary_loss_clip": 0.01083701, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.03947914, + "balance_loss_mlp": 1.02397776, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.858056536375575, + "language_loss": 0.76916945, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79040956, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.644681692123413 + }, + { + "auxiliary_loss_clip": 0.01097433, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_clip": 1.03765249, + "balance_loss_mlp": 1.03101861, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.4798608905676944, + "language_loss": 0.78160489, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.8030653, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.653444766998291 + }, + { + "auxiliary_loss_clip": 0.01012572, + "auxiliary_loss_mlp": 0.01000011, + "balance_loss_clip": 1.01417542, + "balance_loss_mlp": 0.99819893, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.9056797512333364, + "language_loss": 0.63484502, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65497088, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 4.717845678329468 + }, + { + "auxiliary_loss_clip": 0.01103197, + "auxiliary_loss_mlp": 0.01039707, + "balance_loss_clip": 1.03923011, + "balance_loss_mlp": 1.02368522, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.3548070334683624, + "language_loss": 0.69489825, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.71632725, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.558380365371704 + }, + { + "auxiliary_loss_clip": 0.01111262, + "auxiliary_loss_mlp": 0.01039517, + "balance_loss_clip": 1.0390172, + "balance_loss_mlp": 1.02371001, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.832190846814002, + "language_loss": 0.87215495, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89366281, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.4976444244384766 + }, + { + "auxiliary_loss_clip": 0.01101924, + "auxiliary_loss_mlp": 0.01034359, + "balance_loss_clip": 1.03517401, + "balance_loss_mlp": 1.01843238, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.7685538474403992, + "language_loss": 0.75937301, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78073585, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.524397373199463 + }, + { + "auxiliary_loss_clip": 0.01090926, + "auxiliary_loss_mlp": 0.01044818, + "balance_loss_clip": 1.03740346, + "balance_loss_mlp": 1.02868867, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.827793837384316, + "language_loss": 0.72450423, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74586165, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 4.048647880554199 + }, + { + "auxiliary_loss_clip": 0.01117139, + "auxiliary_loss_mlp": 0.01037121, + "balance_loss_clip": 1.04295826, + "balance_loss_mlp": 1.02077699, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 1.6938313413592843, + "language_loss": 0.66485071, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68639326, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 2.554851770401001 + }, + { + "auxiliary_loss_clip": 0.01123448, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.04014111, + "balance_loss_mlp": 1.02158785, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 1.6761068828128542, + "language_loss": 0.82491833, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.8465367, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.5360288619995117 + }, + { + "auxiliary_loss_clip": 0.0108423, + "auxiliary_loss_mlp": 0.01038989, + "balance_loss_clip": 1.03916132, + "balance_loss_mlp": 1.02365863, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.831493409697511, + "language_loss": 0.82685328, + "learning_rate": 3.552202383898897e-06, + "loss": 0.84808546, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 2.6492815017700195 + }, + { + "auxiliary_loss_clip": 0.0109202, + "auxiliary_loss_mlp": 0.01038145, + "balance_loss_clip": 1.0399456, + "balance_loss_mlp": 1.02187312, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.0116570297616523, + "language_loss": 0.87490678, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89620841, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 2.563776731491089 + }, + { + "auxiliary_loss_clip": 0.01090995, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.03600812, + "balance_loss_mlp": 1.02999806, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 4.511698783720269, + "language_loss": 0.77751648, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79888672, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.6346163749694824 + }, + { + "auxiliary_loss_clip": 0.01063863, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.03428185, + "balance_loss_mlp": 1.01937985, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.8062215156353034, + "language_loss": 0.79589427, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81688911, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 2.6279983520507812 + }, + { + "auxiliary_loss_clip": 0.01104945, + "auxiliary_loss_mlp": 0.00750673, + "balance_loss_clip": 1.03944731, + "balance_loss_mlp": 1.00046897, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 1.8591917875644597, + "language_loss": 0.71304071, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73159683, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 2.639894485473633 + }, + { + "auxiliary_loss_clip": 0.01076059, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_clip": 1.03627396, + "balance_loss_mlp": 1.02968621, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.8693266294555504, + "language_loss": 0.7602483, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78145945, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.6113035678863525 + }, + { + "auxiliary_loss_clip": 0.01112908, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.04028845, + "balance_loss_mlp": 1.01896143, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 3.9691821192894143, + "language_loss": 0.74708688, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76856399, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 2.5917835235595703 + }, + { + "auxiliary_loss_clip": 0.01114514, + "auxiliary_loss_mlp": 0.01041746, + "balance_loss_clip": 1.04298604, + "balance_loss_mlp": 1.02704167, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.856664447509852, + "language_loss": 0.80157554, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82313812, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 2.5964605808258057 + }, + { + "auxiliary_loss_clip": 0.01097413, + "auxiliary_loss_mlp": 0.01043573, + "balance_loss_clip": 1.03924763, + "balance_loss_mlp": 1.02649057, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 3.6173427138779393, + "language_loss": 0.71232867, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.73373854, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 2.6439015865325928 + }, + { + "auxiliary_loss_clip": 0.0104235, + "auxiliary_loss_mlp": 0.01046641, + "balance_loss_clip": 1.03547215, + "balance_loss_mlp": 1.02897453, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6339403265512327, + "language_loss": 0.6933167, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71420658, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 2.8584752082824707 + }, + { + "auxiliary_loss_clip": 0.01117585, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.04253125, + "balance_loss_mlp": 1.02094865, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 1.9797684897553622, + "language_loss": 0.73497963, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.7565366, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.969705820083618 + }, + { + "auxiliary_loss_clip": 0.01126883, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.04286492, + "balance_loss_mlp": 1.01943898, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.965754695990839, + "language_loss": 0.88065934, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90227401, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.488842248916626 + }, + { + "auxiliary_loss_clip": 0.01088116, + "auxiliary_loss_mlp": 0.01043723, + "balance_loss_clip": 1.03475082, + "balance_loss_mlp": 1.0266999, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.161575123665949, + "language_loss": 0.94508398, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96640241, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 2.616631031036377 + }, + { + "auxiliary_loss_clip": 0.0109583, + "auxiliary_loss_mlp": 0.01036379, + "balance_loss_clip": 1.03715408, + "balance_loss_mlp": 1.02042913, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 2.0350534816606496, + "language_loss": 0.82684934, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84817141, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 2.62434458732605 + }, + { + "auxiliary_loss_clip": 0.01077181, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.03601372, + "balance_loss_mlp": 1.02330399, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.7568530915079985, + "language_loss": 0.69017386, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71132767, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 2.7918567657470703 + }, + { + "auxiliary_loss_clip": 0.01117849, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_clip": 1.04183853, + "balance_loss_mlp": 1.02600968, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.832070459887086, + "language_loss": 0.84654558, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.86814868, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.553392171859741 + }, + { + "auxiliary_loss_clip": 0.01027867, + "auxiliary_loss_mlp": 0.01008015, + "balance_loss_clip": 1.01012015, + "balance_loss_mlp": 1.00633395, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8185254959005238, + "language_loss": 0.60640043, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62675923, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.1765666007995605 + }, + { + "auxiliary_loss_clip": 0.01089926, + "auxiliary_loss_mlp": 0.01041724, + "balance_loss_clip": 1.03559029, + "balance_loss_mlp": 1.02476048, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.7636674968113935, + "language_loss": 0.73370421, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75502074, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 2.5904531478881836 + }, + { + "auxiliary_loss_clip": 0.01091605, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.02411866, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.8672365463737528, + "language_loss": 0.81785202, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83916444, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 2.603684902191162 + }, + { + "auxiliary_loss_clip": 0.0112885, + "auxiliary_loss_mlp": 0.01044287, + "balance_loss_clip": 1.04296219, + "balance_loss_mlp": 1.02692974, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 1.840404810769955, + "language_loss": 0.76467925, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78641057, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 2.5232555866241455 + }, + { + "auxiliary_loss_clip": 0.01078238, + "auxiliary_loss_mlp": 0.01045053, + "balance_loss_clip": 1.0348618, + "balance_loss_mlp": 1.02565789, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.8290432966732462, + "language_loss": 0.74950707, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77073997, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.591336965560913 + }, + { + "auxiliary_loss_clip": 0.01099174, + "auxiliary_loss_mlp": 0.01043056, + "balance_loss_clip": 1.03865147, + "balance_loss_mlp": 1.02759457, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 2.046863719323369, + "language_loss": 0.82110894, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.8425312, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 2.5395548343658447 + }, + { + "auxiliary_loss_clip": 0.01110671, + "auxiliary_loss_mlp": 0.01047523, + "balance_loss_clip": 1.03987265, + "balance_loss_mlp": 1.03171587, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.802281228440512, + "language_loss": 0.85921466, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88079661, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 2.4886233806610107 + }, + { + "auxiliary_loss_clip": 0.01067804, + "auxiliary_loss_mlp": 0.01053283, + "balance_loss_clip": 1.03538322, + "balance_loss_mlp": 1.03376842, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 2.424228085024228, + "language_loss": 0.72176206, + "learning_rate": 3.546538084949365e-06, + "loss": 0.74297285, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.6097512245178223 + }, + { + "auxiliary_loss_clip": 0.01112836, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.04115725, + "balance_loss_mlp": 1.02539253, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.9655032170547644, + "language_loss": 0.64220721, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66373748, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.551851987838745 + }, + { + "auxiliary_loss_clip": 0.01114829, + "auxiliary_loss_mlp": 0.00750487, + "balance_loss_clip": 1.04314148, + "balance_loss_mlp": 1.00049865, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.822773725452681, + "language_loss": 0.7113899, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.73004311, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.546456813812256 + }, + { + "auxiliary_loss_clip": 0.01025502, + "auxiliary_loss_mlp": 0.01006923, + "balance_loss_clip": 1.00756645, + "balance_loss_mlp": 1.00524223, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8510340662685071, + "language_loss": 0.55414546, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57446975, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.0832691192626953 + }, + { + "auxiliary_loss_clip": 0.01108445, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.03773916, + "balance_loss_mlp": 1.02130961, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 1.4965042917838303, + "language_loss": 0.73931915, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76079404, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 4.259646654129028 + }, + { + "auxiliary_loss_clip": 0.01122224, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.03915167, + "balance_loss_mlp": 1.03321028, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 1.935869620732778, + "language_loss": 0.76724666, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78896284, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.587414026260376 + }, + { + "auxiliary_loss_clip": 0.01106597, + "auxiliary_loss_mlp": 0.00750617, + "balance_loss_clip": 1.03888476, + "balance_loss_mlp": 1.00054169, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 2.600278376462676, + "language_loss": 0.65072322, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.66929531, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 4.0804126262664795 + }, + { + "auxiliary_loss_clip": 0.01107649, + "auxiliary_loss_mlp": 0.0103815, + "balance_loss_clip": 1.03740954, + "balance_loss_mlp": 1.02193713, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 2.1796834221372663, + "language_loss": 0.81400764, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83546561, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.516310930252075 + }, + { + "auxiliary_loss_clip": 0.01071217, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.03149688, + "balance_loss_mlp": 1.02139068, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 1.9368714907006894, + "language_loss": 0.68926072, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71035326, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.6540162563323975 + }, + { + "auxiliary_loss_clip": 0.01105225, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.04094291, + "balance_loss_mlp": 1.01705742, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 1.9931180822640533, + "language_loss": 0.9624911, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98387891, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.5542449951171875 + }, + { + "auxiliary_loss_clip": 0.01092046, + "auxiliary_loss_mlp": 0.01043391, + "balance_loss_clip": 1.03657794, + "balance_loss_mlp": 1.02860928, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.0298831776586077, + "language_loss": 0.7795161, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80087048, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 4.108372449874878 + }, + { + "auxiliary_loss_clip": 0.01115511, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.04058766, + "balance_loss_mlp": 1.02592695, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 2.127164196316768, + "language_loss": 0.74471873, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76630002, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.5796823501586914 + }, + { + "auxiliary_loss_clip": 0.0107152, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.03144109, + "balance_loss_mlp": 1.02373219, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.269499837584154, + "language_loss": 0.76474059, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78586423, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.627201557159424 + }, + { + "auxiliary_loss_clip": 0.01108934, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.04063809, + "balance_loss_mlp": 1.02193761, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 2.0744952333968527, + "language_loss": 0.7219342, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74341315, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 2.499732494354248 + }, + { + "auxiliary_loss_clip": 0.0110163, + "auxiliary_loss_mlp": 0.01044091, + "balance_loss_clip": 1.03588796, + "balance_loss_mlp": 1.02681744, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 2.0969058481811587, + "language_loss": 0.78348744, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80494463, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 3.982335090637207 + }, + { + "auxiliary_loss_clip": 0.01068953, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.03500438, + "balance_loss_mlp": 1.02250028, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8378721573175665, + "language_loss": 0.80461955, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82567424, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.6517839431762695 + }, + { + "auxiliary_loss_clip": 0.01082744, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.03551459, + "balance_loss_mlp": 1.02304745, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 1.9441323130862427, + "language_loss": 0.76588905, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78710413, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 2.622684955596924 + }, + { + "auxiliary_loss_clip": 0.01029373, + "auxiliary_loss_mlp": 0.01028262, + "balance_loss_clip": 1.03277564, + "balance_loss_mlp": 1.01407635, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.792395494926728, + "language_loss": 0.81274164, + "learning_rate": 3.542331483604246e-06, + "loss": 0.833318, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 2.8010592460632324 + }, + { + "auxiliary_loss_clip": 0.01101239, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.03592443, + "balance_loss_mlp": 1.01998782, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 3.054551889327013, + "language_loss": 0.72716355, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.74853814, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.531181573867798 + }, + { + "auxiliary_loss_clip": 0.01113516, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.04142344, + "balance_loss_mlp": 1.02521467, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.7525595632077535, + "language_loss": 0.83196282, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85351008, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.597466468811035 + }, + { + "auxiliary_loss_clip": 0.01051652, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.03609109, + "balance_loss_mlp": 1.02469373, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.6930611127889992, + "language_loss": 0.86675239, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88767076, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 2.699384927749634 + }, + { + "auxiliary_loss_clip": 0.01094722, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.03457308, + "balance_loss_mlp": 1.02448261, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.6848462895057656, + "language_loss": 0.72494513, + "learning_rate": 3.5413392369578e-06, + "loss": 0.74628997, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 2.603320837020874 + }, + { + "auxiliary_loss_clip": 0.01096667, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.03347886, + "balance_loss_mlp": 1.0217371, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 3.1752681590003977, + "language_loss": 0.73328525, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75463885, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 2.5785200595855713 + }, + { + "auxiliary_loss_clip": 0.01089326, + "auxiliary_loss_mlp": 0.01038666, + "balance_loss_clip": 1.03885365, + "balance_loss_mlp": 1.02408671, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 1.8749019422027753, + "language_loss": 0.73186398, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75314391, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.578335762023926 + }, + { + "auxiliary_loss_clip": 0.01069047, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.03327656, + "balance_loss_mlp": 1.02269053, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 1.6935145683727895, + "language_loss": 0.73773748, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75880373, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 2.569242477416992 + }, + { + "auxiliary_loss_clip": 0.01093262, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.03544819, + "balance_loss_mlp": 1.02547789, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.390422355684752, + "language_loss": 0.74676251, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.76809037, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 2.52778697013855 + }, + { + "auxiliary_loss_clip": 0.01065949, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_clip": 1.03203011, + "balance_loss_mlp": 1.02598917, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.835156761633249, + "language_loss": 0.70957482, + "learning_rate": 3.540097613646296e-06, + "loss": 0.73065001, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 2.6476728916168213 + }, + { + "auxiliary_loss_clip": 0.01094119, + "auxiliary_loss_mlp": 0.01047967, + "balance_loss_clip": 1.03943539, + "balance_loss_mlp": 1.03271985, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.556062269908521, + "language_loss": 0.80890095, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83032179, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.61576247215271 + }, + { + "auxiliary_loss_clip": 0.01122537, + "auxiliary_loss_mlp": 0.01035864, + "balance_loss_clip": 1.03900278, + "balance_loss_mlp": 1.02043796, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.6226873863501659, + "language_loss": 0.77851284, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80009687, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 2.475137710571289 + }, + { + "auxiliary_loss_clip": 0.01067582, + "auxiliary_loss_mlp": 0.01047534, + "balance_loss_clip": 1.03049421, + "balance_loss_mlp": 1.03188205, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.715573203438112, + "language_loss": 0.84219033, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86334145, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 2.557713270187378 + }, + { + "auxiliary_loss_clip": 0.01086207, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.03426373, + "balance_loss_mlp": 1.02513742, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 3.3087408412455144, + "language_loss": 0.54773039, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.56900352, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 2.8634274005889893 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.01041412, + "balance_loss_clip": 1.03824997, + "balance_loss_mlp": 1.02533102, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.063118746478973, + "language_loss": 0.80171245, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82326376, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 2.6088595390319824 + }, + { + "auxiliary_loss_clip": 0.01108988, + "auxiliary_loss_mlp": 0.01038111, + "balance_loss_clip": 1.038311, + "balance_loss_mlp": 1.02313805, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.733655071835932, + "language_loss": 0.79517114, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81664217, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 2.514374256134033 + }, + { + "auxiliary_loss_clip": 0.01122859, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.03840458, + "balance_loss_mlp": 1.02131259, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.7051502288528635, + "language_loss": 0.85350698, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87509173, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 2.5599441528320312 + }, + { + "auxiliary_loss_clip": 0.01103776, + "auxiliary_loss_mlp": 0.01032645, + "balance_loss_clip": 1.0401969, + "balance_loss_mlp": 1.01849532, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.8490060365262009, + "language_loss": 0.74273169, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.7640959, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 2.612173080444336 + }, + { + "auxiliary_loss_clip": 0.01091958, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.03850818, + "balance_loss_mlp": 1.02997935, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 2.353431947404375, + "language_loss": 0.73608446, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75747752, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.6347098350524902 + }, + { + "auxiliary_loss_clip": 0.01117499, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.03862309, + "balance_loss_mlp": 1.02277279, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.7100940287298805, + "language_loss": 0.75965858, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78119594, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 2.4875097274780273 + }, + { + "auxiliary_loss_clip": 0.01078928, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.03532231, + "balance_loss_mlp": 1.02056098, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 2.18781154815509, + "language_loss": 0.85447717, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87561882, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 2.6145381927490234 + }, + { + "auxiliary_loss_clip": 0.01091029, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.03517723, + "balance_loss_mlp": 1.01946366, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 2.6768905386119366, + "language_loss": 0.68148017, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70275915, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.559724807739258 + }, + { + "auxiliary_loss_clip": 0.01112145, + "auxiliary_loss_mlp": 0.01037719, + "balance_loss_clip": 1.03716135, + "balance_loss_mlp": 1.02229345, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.5826004401171971, + "language_loss": 0.69934332, + "learning_rate": 3.536862563102088e-06, + "loss": 0.720842, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.575807571411133 + }, + { + "auxiliary_loss_clip": 0.01123944, + "auxiliary_loss_mlp": 0.01042572, + "balance_loss_clip": 1.03846085, + "balance_loss_mlp": 1.02497697, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 1.7641063466832358, + "language_loss": 0.83986932, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86153448, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.531545639038086 + }, + { + "auxiliary_loss_clip": 0.01036705, + "auxiliary_loss_mlp": 0.01006882, + "balance_loss_clip": 1.00991881, + "balance_loss_mlp": 1.00536823, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.8771033474646536, + "language_loss": 0.52318186, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54361773, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 2.940546751022339 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01037351, + "balance_loss_clip": 1.03785825, + "balance_loss_mlp": 1.02151978, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 2.7174372718145334, + "language_loss": 0.72412741, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74547404, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.5109901428222656 + }, + { + "auxiliary_loss_clip": 0.01059496, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.03382957, + "balance_loss_mlp": 1.02588356, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.4202224583543377, + "language_loss": 0.77519035, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79620683, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.6776373386383057 + }, + { + "auxiliary_loss_clip": 0.01095795, + "auxiliary_loss_mlp": 0.01039884, + "balance_loss_clip": 1.04071069, + "balance_loss_mlp": 1.02399397, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 2.9539680493449656, + "language_loss": 0.8052367, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82659352, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 4.096009969711304 + }, + { + "auxiliary_loss_clip": 0.01096687, + "auxiliary_loss_mlp": 0.01040359, + "balance_loss_clip": 1.03478479, + "balance_loss_mlp": 1.02490926, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.6630825362701236, + "language_loss": 0.8435483, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86491871, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.592625856399536 + }, + { + "auxiliary_loss_clip": 0.01090925, + "auxiliary_loss_mlp": 0.01052334, + "balance_loss_clip": 1.03515375, + "balance_loss_mlp": 1.03302193, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.9089602710124658, + "language_loss": 0.79820013, + "learning_rate": 3.535116532028798e-06, + "loss": 0.81963277, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 4.0809266567230225 + }, + { + "auxiliary_loss_clip": 0.01108708, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.03953719, + "balance_loss_mlp": 1.0246222, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.597734676227812, + "language_loss": 0.70091987, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72239232, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.532661199569702 + }, + { + "auxiliary_loss_clip": 0.01083683, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.03515005, + "balance_loss_mlp": 1.02293015, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 3.2667026508251134, + "language_loss": 0.67918932, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.70040095, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.6136209964752197 + }, + { + "auxiliary_loss_clip": 0.01035111, + "auxiliary_loss_mlp": 0.01000134, + "balance_loss_clip": 1.00856733, + "balance_loss_mlp": 0.99844092, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.8861139517268966, + "language_loss": 0.68689579, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70724827, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.1521332263946533 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01037662, + "balance_loss_clip": 1.03867912, + "balance_loss_mlp": 1.0227375, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.156914013600331, + "language_loss": 0.79511386, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81666946, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 4.071050643920898 + }, + { + "auxiliary_loss_clip": 0.0110067, + "auxiliary_loss_mlp": 0.00750457, + "balance_loss_clip": 1.03725386, + "balance_loss_mlp": 1.00048077, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 1.9055267080584106, + "language_loss": 0.82181847, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84032977, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.6495296955108643 + }, + { + "auxiliary_loss_clip": 0.0112178, + "auxiliary_loss_mlp": 0.010378, + "balance_loss_clip": 1.03919101, + "balance_loss_mlp": 1.02108705, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 2.1819900306458626, + "language_loss": 0.62352318, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64511895, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.574218988418579 + }, + { + "auxiliary_loss_clip": 0.01093285, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.04130745, + "balance_loss_mlp": 1.01799345, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.5557797704463474, + "language_loss": 0.75617301, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77743453, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 4.069910287857056 + }, + { + "auxiliary_loss_clip": 0.01117, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.03748834, + "balance_loss_mlp": 1.02192366, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.9881507634882751, + "language_loss": 0.75189012, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77344471, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 2.4858784675598145 + }, + { + "auxiliary_loss_clip": 0.01091741, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.03596973, + "balance_loss_mlp": 1.01993489, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 1.8450691380414777, + "language_loss": 0.82702821, + "learning_rate": 3.532867444142186e-06, + "loss": 0.84828836, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.570783853530884 + }, + { + "auxiliary_loss_clip": 0.01092954, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.0377512, + "balance_loss_mlp": 1.02217054, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 1.8489914064743378, + "language_loss": 0.73214781, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75344038, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 2.6608619689941406 + }, + { + "auxiliary_loss_clip": 0.01076133, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.03256702, + "balance_loss_mlp": 1.02640057, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.7125327838469129, + "language_loss": 0.71958601, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74074847, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.6139278411865234 + }, + { + "auxiliary_loss_clip": 0.01089276, + "auxiliary_loss_mlp": 0.01038369, + "balance_loss_clip": 1.03402424, + "balance_loss_mlp": 1.0216682, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.10513509663914, + "language_loss": 0.74466431, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76594079, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.5589756965637207 + }, + { + "auxiliary_loss_clip": 0.01098995, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.03425765, + "balance_loss_mlp": 1.02016425, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.164496486896547, + "language_loss": 0.85257709, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87391835, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 2.561960458755493 + }, + { + "auxiliary_loss_clip": 0.010905, + "auxiliary_loss_mlp": 0.01046129, + "balance_loss_clip": 1.03846049, + "balance_loss_mlp": 1.0306201, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.7980322927790564, + "language_loss": 0.78686911, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.80823541, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 2.5979697704315186 + }, + { + "auxiliary_loss_clip": 0.01061937, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.03701866, + "balance_loss_mlp": 1.02508807, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.575948069996954, + "language_loss": 0.74953258, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77055728, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.673710823059082 + }, + { + "auxiliary_loss_clip": 0.01066991, + "auxiliary_loss_mlp": 0.01043999, + "balance_loss_clip": 1.04079247, + "balance_loss_mlp": 1.02692783, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.7072919549660757, + "language_loss": 0.79130089, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81241077, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 2.6276981830596924 + }, + { + "auxiliary_loss_clip": 0.01074472, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.03325427, + "balance_loss_mlp": 1.01910377, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.6218510024517183, + "language_loss": 0.77226764, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79334062, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 2.6385960578918457 + }, + { + "auxiliary_loss_clip": 0.01102026, + "auxiliary_loss_mlp": 0.01042698, + "balance_loss_clip": 1.03731632, + "balance_loss_mlp": 1.02715302, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 1.913662770429269, + "language_loss": 0.81470007, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83614731, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 2.667701005935669 + }, + { + "auxiliary_loss_clip": 0.0109413, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_clip": 1.03723764, + "balance_loss_mlp": 1.02753961, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.718277644148213, + "language_loss": 0.73133832, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75271773, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.558314561843872 + }, + { + "auxiliary_loss_clip": 0.01082785, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.0396688, + "balance_loss_mlp": 1.02592313, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.732965192085151, + "language_loss": 0.7694357, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79068595, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.632659435272217 + }, + { + "auxiliary_loss_clip": 0.01086324, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.03338289, + "balance_loss_mlp": 1.02187097, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.6627951235156453, + "language_loss": 0.81613874, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83737946, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.570612668991089 + }, + { + "auxiliary_loss_clip": 0.01110026, + "auxiliary_loss_mlp": 0.0104358, + "balance_loss_clip": 1.03692484, + "balance_loss_mlp": 1.02742672, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 2.197018747044406, + "language_loss": 0.8707667, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89230275, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 2.505378007888794 + }, + { + "auxiliary_loss_clip": 0.00982762, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.01638627, + "balance_loss_mlp": 1.02633321, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7739662660828571, + "language_loss": 0.57525527, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59536767, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 3.397423267364502 + }, + { + "auxiliary_loss_clip": 0.01027056, + "auxiliary_loss_mlp": 0.01010905, + "balance_loss_clip": 1.01975155, + "balance_loss_mlp": 1.00933194, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.6446844530135147, + "language_loss": 0.56257093, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58295053, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 3.4595096111297607 + }, + { + "auxiliary_loss_clip": 0.01098514, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.03920341, + "balance_loss_mlp": 1.02426124, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 3.354238916046482, + "language_loss": 0.77366209, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79503465, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 2.628084421157837 + }, + { + "auxiliary_loss_clip": 0.0108707, + "auxiliary_loss_mlp": 0.01043522, + "balance_loss_clip": 1.03657722, + "balance_loss_mlp": 1.02548528, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 1.7588639635074845, + "language_loss": 0.75844234, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.7797482, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 2.6501307487487793 + }, + { + "auxiliary_loss_clip": 0.01102242, + "auxiliary_loss_mlp": 0.01044134, + "balance_loss_clip": 1.03931642, + "balance_loss_mlp": 1.02897048, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 1.822442669734498, + "language_loss": 0.68246531, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70392907, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 2.6161413192749023 + }, + { + "auxiliary_loss_clip": 0.01106447, + "auxiliary_loss_mlp": 0.01040544, + "balance_loss_clip": 1.03836465, + "balance_loss_mlp": 1.02493966, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 1.9309169876505012, + "language_loss": 0.65983516, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68130505, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 2.590822696685791 + }, + { + "auxiliary_loss_clip": 0.0101611, + "auxiliary_loss_mlp": 0.01005192, + "balance_loss_clip": 1.00943542, + "balance_loss_mlp": 1.00334418, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7114141887593027, + "language_loss": 0.61527419, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.6354872, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.2030351161956787 + }, + { + "auxiliary_loss_clip": 0.01120552, + "auxiliary_loss_mlp": 0.01041114, + "balance_loss_clip": 1.04093349, + "balance_loss_mlp": 1.02493739, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 3.1526927358217987, + "language_loss": 0.7308377, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75245446, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.5409693717956543 + }, + { + "auxiliary_loss_clip": 0.01102896, + "auxiliary_loss_mlp": 0.01038354, + "balance_loss_clip": 1.04125404, + "balance_loss_mlp": 1.02313077, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 1.9272831117703628, + "language_loss": 0.7554822, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.77689475, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 2.7704920768737793 + }, + { + "auxiliary_loss_clip": 0.01110231, + "auxiliary_loss_mlp": 0.01041676, + "balance_loss_clip": 1.04099596, + "balance_loss_mlp": 1.02529109, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 3.6284689792875007, + "language_loss": 0.78192979, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80344892, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 2.5660228729248047 + }, + { + "auxiliary_loss_clip": 0.01109877, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.04102921, + "balance_loss_mlp": 1.020895, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.9733736860445976, + "language_loss": 0.83325064, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85472643, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 2.518911600112915 + }, + { + "auxiliary_loss_clip": 0.01125721, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.04457188, + "balance_loss_mlp": 1.02319241, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.9460699575341558, + "language_loss": 0.76457655, + "learning_rate": 3.52659529557275e-06, + "loss": 0.7862193, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.4986960887908936 + }, + { + "auxiliary_loss_clip": 0.01077506, + "auxiliary_loss_mlp": 0.01050452, + "balance_loss_clip": 1.03337538, + "balance_loss_mlp": 1.03098464, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.5104827782939734, + "language_loss": 0.72839439, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74967396, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.5912578105926514 + }, + { + "auxiliary_loss_clip": 0.01124063, + "auxiliary_loss_mlp": 0.01048027, + "balance_loss_clip": 1.04252923, + "balance_loss_mlp": 1.03169513, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 1.837866930355153, + "language_loss": 0.65273327, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67445415, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.582413911819458 + }, + { + "auxiliary_loss_clip": 0.01071257, + "auxiliary_loss_mlp": 0.01041799, + "balance_loss_clip": 1.03585458, + "balance_loss_mlp": 1.02512789, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 1.5888735077253042, + "language_loss": 0.73121983, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.75235033, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 2.820543050765991 + }, + { + "auxiliary_loss_clip": 0.01086701, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.03704739, + "balance_loss_mlp": 1.02711546, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.78909517821063, + "language_loss": 0.79092485, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81222451, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.6003496646881104 + }, + { + "auxiliary_loss_clip": 0.01089596, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.03833687, + "balance_loss_mlp": 1.0212642, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.645753199184132, + "language_loss": 0.8092581, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.83052993, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 4.0591442584991455 + }, + { + "auxiliary_loss_clip": 0.01120041, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.03911209, + "balance_loss_mlp": 1.02488542, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 1.9698105687113323, + "language_loss": 0.75139737, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77299601, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 4.20776629447937 + }, + { + "auxiliary_loss_clip": 0.01087363, + "auxiliary_loss_mlp": 0.00750547, + "balance_loss_clip": 1.03595257, + "balance_loss_mlp": 1.00052571, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 2.5848362601788564, + "language_loss": 0.82964408, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84802318, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.687425374984741 + }, + { + "auxiliary_loss_clip": 0.01119469, + "auxiliary_loss_mlp": 0.01037782, + "balance_loss_clip": 1.03787208, + "balance_loss_mlp": 1.02147412, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.1640014779146877, + "language_loss": 0.87165189, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89322436, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.4878652095794678 + }, + { + "auxiliary_loss_clip": 0.01074036, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.03639054, + "balance_loss_mlp": 1.01878285, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 1.9585099076910308, + "language_loss": 0.75120926, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77229106, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.700364112854004 + }, + { + "auxiliary_loss_clip": 0.00982449, + "auxiliary_loss_mlp": 0.01000647, + "balance_loss_clip": 1.0062356, + "balance_loss_mlp": 0.99856073, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6555926956381164, + "language_loss": 0.58254486, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60237586, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 5.023577928543091 + }, + { + "auxiliary_loss_clip": 0.01096421, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.03650916, + "balance_loss_mlp": 1.01999652, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.398184386088357, + "language_loss": 0.83324271, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85456079, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.920560359954834 + }, + { + "auxiliary_loss_clip": 0.01019287, + "auxiliary_loss_mlp": 0.00747762, + "balance_loss_clip": 1.01208591, + "balance_loss_mlp": 1.00010669, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9022444686395543, + "language_loss": 0.63529533, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65296584, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 3.0040650367736816 + }, + { + "auxiliary_loss_clip": 0.0110309, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.03685308, + "balance_loss_mlp": 1.02588034, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5983614930113355, + "language_loss": 0.79304755, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81449461, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 2.534886598587036 + }, + { + "auxiliary_loss_clip": 0.01107931, + "auxiliary_loss_mlp": 0.01041689, + "balance_loss_clip": 1.03889966, + "balance_loss_mlp": 1.02641869, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.9492258453582922, + "language_loss": 0.74130535, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76280153, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 3.9619157314300537 + }, + { + "auxiliary_loss_clip": 0.01110291, + "auxiliary_loss_mlp": 0.01045263, + "balance_loss_clip": 1.03855491, + "balance_loss_mlp": 1.02864504, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 2.0805558226503726, + "language_loss": 0.88094318, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90249872, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 2.497304677963257 + }, + { + "auxiliary_loss_clip": 0.0112231, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.03874874, + "balance_loss_mlp": 1.02400851, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 4.798821752827433, + "language_loss": 0.69622469, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71785498, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.5219602584838867 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.0104038, + "balance_loss_clip": 1.03873277, + "balance_loss_mlp": 1.02279663, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.545017293162665, + "language_loss": 0.8017832, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82341319, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.46480131149292 + }, + { + "auxiliary_loss_clip": 0.01059692, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_clip": 1.03573656, + "balance_loss_mlp": 1.02980804, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 1.7722111992390617, + "language_loss": 0.75024199, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77129054, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.607539415359497 + }, + { + "auxiliary_loss_clip": 0.01107274, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.03837311, + "balance_loss_mlp": 1.02028561, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.4762585859456736, + "language_loss": 0.73801863, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75943756, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 2.7008650302886963 + }, + { + "auxiliary_loss_clip": 0.01086637, + "auxiliary_loss_mlp": 0.0075058, + "balance_loss_clip": 1.03575492, + "balance_loss_mlp": 1.00060058, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 1.7004311857937477, + "language_loss": 0.69720304, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71557516, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.6252901554107666 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.01047285, + "balance_loss_clip": 1.03640711, + "balance_loss_mlp": 1.03193676, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.9292529992119376, + "language_loss": 0.81133682, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83288002, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 2.514503002166748 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.00750433, + "balance_loss_clip": 1.03845572, + "balance_loss_mlp": 1.00047755, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 2.617704902861959, + "language_loss": 0.84282523, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86142826, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 2.5377352237701416 + }, + { + "auxiliary_loss_clip": 0.01090516, + "auxiliary_loss_mlp": 0.01054559, + "balance_loss_clip": 1.03660059, + "balance_loss_mlp": 1.03714252, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 2.068674333161561, + "language_loss": 0.65439212, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67584288, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.5939319133758545 + }, + { + "auxiliary_loss_clip": 0.01073892, + "auxiliary_loss_mlp": 0.01039706, + "balance_loss_clip": 1.03584719, + "balance_loss_mlp": 1.02298045, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 1.6473289696047324, + "language_loss": 0.75229102, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77342701, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.6662650108337402 + }, + { + "auxiliary_loss_clip": 0.01053942, + "auxiliary_loss_mlp": 0.01044937, + "balance_loss_clip": 1.03400981, + "balance_loss_mlp": 1.02828336, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.2816765802431664, + "language_loss": 0.7688756, + "learning_rate": 3.520286966670535e-06, + "loss": 0.78986442, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.669558048248291 + }, + { + "auxiliary_loss_clip": 0.01105372, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.03689146, + "balance_loss_mlp": 1.02072227, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5271061001855946, + "language_loss": 0.83774388, + "learning_rate": 3.520033883075255e-06, + "loss": 0.85914475, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 2.5995607376098633 + }, + { + "auxiliary_loss_clip": 0.01094935, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.0356245, + "balance_loss_mlp": 1.02111936, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 1.7302321476270104, + "language_loss": 0.71122551, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73254883, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 2.528225898742676 + }, + { + "auxiliary_loss_clip": 0.01129146, + "auxiliary_loss_mlp": 0.0104068, + "balance_loss_clip": 1.04173636, + "balance_loss_mlp": 1.02157032, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.4435449788107997, + "language_loss": 0.62129986, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.6429981, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 2.502561569213867 + }, + { + "auxiliary_loss_clip": 0.01110139, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.03771794, + "balance_loss_mlp": 1.01888728, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.120490035419588, + "language_loss": 0.78475273, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80619675, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 2.534212350845337 + }, + { + "auxiliary_loss_clip": 0.01101066, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.04248941, + "balance_loss_mlp": 1.01586866, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.7572748483541343, + "language_loss": 0.8329441, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.85426223, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 2.5248944759368896 + }, + { + "auxiliary_loss_clip": 0.01095985, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.04030418, + "balance_loss_mlp": 1.02406907, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.8166038676145149, + "language_loss": 0.71022815, + "learning_rate": 3.518767600693314e-06, + "loss": 0.73158497, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 2.745584726333618 + }, + { + "auxiliary_loss_clip": 0.01108482, + "auxiliary_loss_mlp": 0.00750464, + "balance_loss_clip": 1.03529119, + "balance_loss_mlp": 1.00046921, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 1.9537182805985076, + "language_loss": 0.67121696, + "learning_rate": 3.518514171403042e-06, + "loss": 0.68980646, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 2.5366594791412354 + }, + { + "auxiliary_loss_clip": 0.01084059, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.03828871, + "balance_loss_mlp": 1.01842189, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.0221861136092874, + "language_loss": 0.83927417, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86044425, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.622396945953369 + }, + { + "auxiliary_loss_clip": 0.01085982, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.03628445, + "balance_loss_mlp": 1.02326894, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.6675822162647886, + "language_loss": 0.78448939, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80574226, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.6172707080841064 + }, + { + "auxiliary_loss_clip": 0.01030773, + "auxiliary_loss_mlp": 0.01015036, + "balance_loss_clip": 1.01876163, + "balance_loss_mlp": 1.01328337, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8310671957047187, + "language_loss": 0.61028004, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63073814, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.1772704124450684 + }, + { + "auxiliary_loss_clip": 0.01123147, + "auxiliary_loss_mlp": 0.01047353, + "balance_loss_clip": 1.04033983, + "balance_loss_mlp": 1.03210592, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.0668905465356198, + "language_loss": 0.73167741, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75338244, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 2.6273210048675537 + }, + { + "auxiliary_loss_clip": 0.01106708, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.03720677, + "balance_loss_mlp": 1.02488017, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 1.7815678301266635, + "language_loss": 0.8113879, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83286107, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.5387532711029053 + }, + { + "auxiliary_loss_clip": 0.01093371, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.03539157, + "balance_loss_mlp": 1.01884449, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.7824403224799037, + "language_loss": 0.58511215, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.60637707, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.5977790355682373 + }, + { + "auxiliary_loss_clip": 0.01104722, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.03529453, + "balance_loss_mlp": 1.02212167, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 1.9530816964001585, + "language_loss": 0.78282362, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80424356, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 2.605088233947754 + }, + { + "auxiliary_loss_clip": 0.01117911, + "auxiliary_loss_mlp": 0.00750761, + "balance_loss_clip": 1.04026282, + "balance_loss_mlp": 1.00053036, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 1.8441949358055778, + "language_loss": 0.65694886, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67563558, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.5232346057891846 + }, + { + "auxiliary_loss_clip": 0.01023087, + "auxiliary_loss_mlp": 0.01001802, + "balance_loss_clip": 1.01870871, + "balance_loss_mlp": 0.99996608, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9469623248669319, + "language_loss": 0.67330855, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69355744, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.2325143814086914 + }, + { + "auxiliary_loss_clip": 0.01099619, + "auxiliary_loss_mlp": 0.01043611, + "balance_loss_clip": 1.03932357, + "balance_loss_mlp": 1.02854848, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 1.866650278008188, + "language_loss": 0.88893384, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91036618, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.6672914028167725 + }, + { + "auxiliary_loss_clip": 0.01073546, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.03763127, + "balance_loss_mlp": 1.02611041, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8226913616211446, + "language_loss": 0.6826483, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.7038337, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.7049427032470703 + }, + { + "auxiliary_loss_clip": 0.01110465, + "auxiliary_loss_mlp": 0.01038096, + "balance_loss_clip": 1.04020619, + "balance_loss_mlp": 1.02282476, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 1.6033085116346528, + "language_loss": 0.71090317, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73238873, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.6924591064453125 + }, + { + "auxiliary_loss_clip": 0.01075022, + "auxiliary_loss_mlp": 0.01042159, + "balance_loss_clip": 1.03757441, + "balance_loss_mlp": 1.02568436, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 2.2399291691820413, + "language_loss": 0.73217481, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75334662, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 4.085167646408081 + }, + { + "auxiliary_loss_clip": 0.01116491, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_clip": 1.03868532, + "balance_loss_mlp": 1.02876616, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 3.0122359503332614, + "language_loss": 0.62815356, + "learning_rate": 3.514960119583781e-06, + "loss": 0.64976895, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 4.036021709442139 + }, + { + "auxiliary_loss_clip": 0.01103655, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.03895783, + "balance_loss_mlp": 1.02116394, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 1.8271280279051485, + "language_loss": 0.77419496, + "learning_rate": 3.514705827570645e-06, + "loss": 0.7955941, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.6309564113616943 + }, + { + "auxiliary_loss_clip": 0.01109506, + "auxiliary_loss_mlp": 0.01031376, + "balance_loss_clip": 1.03916061, + "balance_loss_mlp": 1.01627278, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 1.9576452746505075, + "language_loss": 0.76913279, + "learning_rate": 3.514451478119711e-06, + "loss": 0.79054165, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.571502208709717 + }, + { + "auxiliary_loss_clip": 0.01112659, + "auxiliary_loss_mlp": 0.01040849, + "balance_loss_clip": 1.03925824, + "balance_loss_mlp": 1.02328324, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.2995529284452076, + "language_loss": 0.70469183, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72622693, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.7037389278411865 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_clip": 1.04058695, + "balance_loss_mlp": 1.02892661, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.6445859354596073, + "language_loss": 0.74785042, + "learning_rate": 3.513942606943036e-06, + "loss": 0.76933885, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 4.091508865356445 + }, + { + "auxiliary_loss_clip": 0.01105614, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.03938675, + "balance_loss_mlp": 1.02120733, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.050331255087181, + "language_loss": 0.77097934, + "learning_rate": 3.513688085236591e-06, + "loss": 0.79238951, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 2.561014413833618 + }, + { + "auxiliary_loss_clip": 0.01059984, + "auxiliary_loss_mlp": 0.01042247, + "balance_loss_clip": 1.03483891, + "balance_loss_mlp": 1.02644002, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6034135922580688, + "language_loss": 0.81143129, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83245355, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 2.814072370529175 + }, + { + "auxiliary_loss_clip": 0.01088544, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.03628016, + "balance_loss_mlp": 1.01683676, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 3.809080901892515, + "language_loss": 0.75500262, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77620161, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.767155885696411 + }, + { + "auxiliary_loss_clip": 0.0111257, + "auxiliary_loss_mlp": 0.01035777, + "balance_loss_clip": 1.03960216, + "balance_loss_mlp": 1.01916504, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 1.759859866799428, + "language_loss": 0.7161594, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73764288, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 2.5864615440368652 + }, + { + "auxiliary_loss_clip": 0.01039133, + "auxiliary_loss_mlp": 0.01005161, + "balance_loss_clip": 1.01232743, + "balance_loss_mlp": 1.00351548, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7432806241258457, + "language_loss": 0.56798661, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58842957, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 4.578832149505615 + }, + { + "auxiliary_loss_clip": 0.01116545, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.04116476, + "balance_loss_mlp": 1.0250237, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 2.597487516741221, + "language_loss": 0.80583572, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82740939, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.5613880157470703 + }, + { + "auxiliary_loss_clip": 0.01099867, + "auxiliary_loss_mlp": 0.00750424, + "balance_loss_clip": 1.03500843, + "balance_loss_mlp": 1.00059319, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.5660729556855766, + "language_loss": 0.87215924, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89066213, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.62610125541687 + }, + { + "auxiliary_loss_clip": 0.01109296, + "auxiliary_loss_mlp": 0.01037012, + "balance_loss_clip": 1.04227042, + "balance_loss_mlp": 1.02172375, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.8538742912871093, + "language_loss": 0.83486688, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85633004, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 2.6108860969543457 + }, + { + "auxiliary_loss_clip": 0.01106686, + "auxiliary_loss_mlp": 0.01040494, + "balance_loss_clip": 1.04421699, + "balance_loss_mlp": 1.02685034, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.7234803418276718, + "language_loss": 0.73993814, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76141, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 2.7067019939422607 + }, + { + "auxiliary_loss_clip": 0.01087676, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.0376364, + "balance_loss_mlp": 1.02534986, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 1.8551483290093655, + "language_loss": 0.74389708, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76518983, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 2.6206865310668945 + }, + { + "auxiliary_loss_clip": 0.01085875, + "auxiliary_loss_mlp": 0.01039327, + "balance_loss_clip": 1.03781736, + "balance_loss_mlp": 1.02449727, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.783702736887235, + "language_loss": 0.81938803, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84063995, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 2.655628204345703 + }, + { + "auxiliary_loss_clip": 0.01106193, + "auxiliary_loss_mlp": 0.01037984, + "balance_loss_clip": 1.04001915, + "balance_loss_mlp": 1.0233804, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.281201402032497, + "language_loss": 0.79525775, + "learning_rate": 3.51088456024312e-06, + "loss": 0.8166995, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.500720500946045 + }, + { + "auxiliary_loss_clip": 0.01112025, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.03859031, + "balance_loss_mlp": 1.0193969, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.6806582453684076, + "language_loss": 0.69977736, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72126436, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.7260148525238037 + }, + { + "auxiliary_loss_clip": 0.01084608, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.03682733, + "balance_loss_mlp": 1.02349067, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8511895867962718, + "language_loss": 0.77615899, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79738986, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.6102495193481445 + }, + { + "auxiliary_loss_clip": 0.01101577, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.03972077, + "balance_loss_mlp": 1.0216459, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.424598141300598, + "language_loss": 0.7641086, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78549159, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 2.521550178527832 + }, + { + "auxiliary_loss_clip": 0.01036005, + "auxiliary_loss_mlp": 0.01000296, + "balance_loss_clip": 1.00917375, + "balance_loss_mlp": 0.99861526, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8200219839522862, + "language_loss": 0.6005668, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62092984, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 3.0236294269561768 + }, + { + "auxiliary_loss_clip": 0.01094087, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.03486991, + "balance_loss_mlp": 1.02195477, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 2.083290132213876, + "language_loss": 0.7896772, + "learning_rate": 3.509607938211409e-06, + "loss": 0.81099886, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 2.5589611530303955 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.03925312, + "balance_loss_mlp": 1.02527988, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.2087847924543906, + "language_loss": 0.83998966, + "learning_rate": 3.509352442032875e-06, + "loss": 0.86159474, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 2.407177448272705 + }, + { + "auxiliary_loss_clip": 0.01064343, + "auxiliary_loss_mlp": 0.01041666, + "balance_loss_clip": 1.03463197, + "balance_loss_mlp": 1.02534676, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.167201984254963, + "language_loss": 0.71041417, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73147428, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 2.7271535396575928 + }, + { + "auxiliary_loss_clip": 0.01087335, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.0340606, + "balance_loss_mlp": 1.01621401, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 1.9876526715962657, + "language_loss": 0.81317317, + "learning_rate": 3.50884127798111e-06, + "loss": 0.83437479, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 2.7915146350860596 + }, + { + "auxiliary_loss_clip": 0.01101163, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.03915775, + "balance_loss_mlp": 1.02073836, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.001732619089598, + "language_loss": 0.82997942, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.85137165, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.6232457160949707 + }, + { + "auxiliary_loss_clip": 0.01080753, + "auxiliary_loss_mlp": 0.01039438, + "balance_loss_clip": 1.0374167, + "balance_loss_mlp": 1.02349997, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.352333524862482, + "language_loss": 0.8255344, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84673631, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.6232614517211914 + }, + { + "auxiliary_loss_clip": 0.01113869, + "auxiliary_loss_mlp": 0.0075015, + "balance_loss_clip": 1.0356524, + "balance_loss_mlp": 1.00058603, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 1.9914926280503265, + "language_loss": 0.76097322, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77961349, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.488478183746338 + }, + { + "auxiliary_loss_clip": 0.01078024, + "auxiliary_loss_mlp": 0.01047302, + "balance_loss_clip": 1.03478587, + "balance_loss_mlp": 1.03122056, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 2.9286046155927603, + "language_loss": 0.70140922, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72266251, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.561326742172241 + }, + { + "auxiliary_loss_clip": 0.01118183, + "auxiliary_loss_mlp": 0.01043991, + "balance_loss_clip": 1.03838229, + "balance_loss_mlp": 1.02867293, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 1.9596937283398552, + "language_loss": 0.86429816, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88591993, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.474421977996826 + }, + { + "auxiliary_loss_clip": 0.0112045, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.04016149, + "balance_loss_mlp": 1.02509046, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 2.3228726895391083, + "language_loss": 0.68018234, + "learning_rate": 3.507306412966238e-06, + "loss": 0.70178777, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 2.5717129707336426 + }, + { + "auxiliary_loss_clip": 0.01021869, + "auxiliary_loss_mlp": 0.01005579, + "balance_loss_clip": 1.01424742, + "balance_loss_mlp": 1.00382698, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8408457145512575, + "language_loss": 0.7011953, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72146976, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.1117165088653564 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.01036671, + "balance_loss_clip": 1.03939605, + "balance_loss_mlp": 1.02051806, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.9560394853675291, + "language_loss": 0.73844779, + "learning_rate": 3.506794333933431e-06, + "loss": 0.75984228, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.592900276184082 + }, + { + "auxiliary_loss_clip": 0.0111124, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_clip": 1.04023039, + "balance_loss_mlp": 1.02955127, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.6232943974944651, + "language_loss": 0.83034259, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85190892, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.574507236480713 + }, + { + "auxiliary_loss_clip": 0.01002111, + "auxiliary_loss_mlp": 0.010046, + "balance_loss_clip": 1.0218811, + "balance_loss_mlp": 1.00288343, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.794678399462079, + "language_loss": 0.6156441, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63571119, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.191946268081665 + }, + { + "auxiliary_loss_clip": 0.01082731, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.03667951, + "balance_loss_mlp": 1.0182364, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 1.7070867583551248, + "language_loss": 0.78786075, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.80903041, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.6128013134002686 + }, + { + "auxiliary_loss_clip": 0.01062695, + "auxiliary_loss_mlp": 0.01043933, + "balance_loss_clip": 1.03535426, + "balance_loss_mlp": 1.02826941, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5269057975669222, + "language_loss": 0.79841179, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.81947803, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.7267768383026123 + }, + { + "auxiliary_loss_clip": 0.01107752, + "auxiliary_loss_mlp": 0.01040052, + "balance_loss_clip": 1.0388577, + "balance_loss_mlp": 1.02470946, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 2.0597757958478016, + "language_loss": 0.74566829, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76714635, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.611632823944092 + }, + { + "auxiliary_loss_clip": 0.0109082, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.03639388, + "balance_loss_mlp": 1.01910555, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.9706371922786412, + "language_loss": 0.84500659, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86624748, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.5871264934539795 + }, + { + "auxiliary_loss_clip": 0.01089643, + "auxiliary_loss_mlp": 0.01039812, + "balance_loss_clip": 1.03424239, + "balance_loss_mlp": 1.02317011, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 2.519382389168746, + "language_loss": 0.75179756, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77309215, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.6140549182891846 + }, + { + "auxiliary_loss_clip": 0.01027507, + "auxiliary_loss_mlp": 0.0099985, + "balance_loss_clip": 1.01072824, + "balance_loss_mlp": 0.99785912, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7150090152226971, + "language_loss": 0.57158935, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59186292, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 4.579816818237305 + }, + { + "auxiliary_loss_clip": 0.01093603, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.04012799, + "balance_loss_mlp": 1.01721251, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 1.8622788703384896, + "language_loss": 0.76039732, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78165865, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 4.38679838180542 + }, + { + "auxiliary_loss_clip": 0.01110578, + "auxiliary_loss_mlp": 0.01042821, + "balance_loss_clip": 1.03947735, + "balance_loss_mlp": 1.02730012, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.8867605034704766, + "language_loss": 0.84066379, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86219776, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.546609878540039 + }, + { + "auxiliary_loss_clip": 0.01121029, + "auxiliary_loss_mlp": 0.01045136, + "balance_loss_clip": 1.03934216, + "balance_loss_mlp": 1.03059244, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.6483048638480982, + "language_loss": 0.88299692, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90465862, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.5654585361480713 + }, + { + "auxiliary_loss_clip": 0.01121471, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.03966022, + "balance_loss_mlp": 1.02197814, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 1.988698280633227, + "language_loss": 0.85975111, + "learning_rate": 3.503717062883053e-06, + "loss": 0.88136733, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 4.057985067367554 + }, + { + "auxiliary_loss_clip": 0.01110732, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.0383358, + "balance_loss_mlp": 1.02218866, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.8263548563926466, + "language_loss": 0.83187079, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85335004, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 2.697209358215332 + }, + { + "auxiliary_loss_clip": 0.01111876, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.04002666, + "balance_loss_mlp": 1.02447534, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.5226607844772087, + "language_loss": 0.72925162, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75078905, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 2.686903953552246 + }, + { + "auxiliary_loss_clip": 0.01122684, + "auxiliary_loss_mlp": 0.01045155, + "balance_loss_clip": 1.03882909, + "balance_loss_mlp": 1.02908528, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.9243705061231133, + "language_loss": 0.76925194, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79093027, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.55501127243042 + }, + { + "auxiliary_loss_clip": 0.01099931, + "auxiliary_loss_mlp": 0.00750368, + "balance_loss_clip": 1.03999889, + "balance_loss_mlp": 1.00052536, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.6924866116765556, + "language_loss": 0.73090959, + "learning_rate": 3.502689480360739e-06, + "loss": 0.7494126, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 4.156240701675415 + }, + { + "auxiliary_loss_clip": 0.01107199, + "auxiliary_loss_mlp": 0.01040031, + "balance_loss_clip": 1.03647304, + "balance_loss_mlp": 1.02583337, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 2.0068459688161036, + "language_loss": 0.82124084, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84271312, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 2.7738053798675537 + }, + { + "auxiliary_loss_clip": 0.01071807, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.0393163, + "balance_loss_mlp": 1.02614009, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.7655599546888705, + "language_loss": 0.74997365, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77110982, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.683974027633667 + }, + { + "auxiliary_loss_clip": 0.01105468, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.03702116, + "balance_loss_mlp": 1.01871061, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 2.2822046504347577, + "language_loss": 0.73046863, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75186336, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 2.747968912124634 + }, + { + "auxiliary_loss_clip": 0.01098475, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.03627658, + "balance_loss_mlp": 1.02108359, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.5103632857511404, + "language_loss": 0.7723434, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79369241, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 2.733548879623413 + }, + { + "auxiliary_loss_clip": 0.0108953, + "auxiliary_loss_mlp": 0.01047905, + "balance_loss_clip": 1.03866744, + "balance_loss_mlp": 1.03237224, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 2.47082932428091, + "language_loss": 0.72249544, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74386984, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 2.617079973220825 + }, + { + "auxiliary_loss_clip": 0.01093966, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.03651452, + "balance_loss_mlp": 1.02055597, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.370679027674051, + "language_loss": 0.75665367, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77794015, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 2.7537150382995605 + }, + { + "auxiliary_loss_clip": 0.01075933, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.0329721, + "balance_loss_mlp": 1.0233562, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.5744370908984642, + "language_loss": 0.78952545, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81067079, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.6404383182525635 + }, + { + "auxiliary_loss_clip": 0.01103314, + "auxiliary_loss_mlp": 0.01040947, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.02641547, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.486498873593334, + "language_loss": 0.7639634, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78540605, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.5718045234680176 + }, + { + "auxiliary_loss_clip": 0.01104795, + "auxiliary_loss_mlp": 0.01036777, + "balance_loss_clip": 1.03787184, + "balance_loss_mlp": 1.02203083, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.831023309561765, + "language_loss": 0.70328593, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72470164, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 2.634798765182495 + }, + { + "auxiliary_loss_clip": 0.01023782, + "auxiliary_loss_mlp": 0.01010802, + "balance_loss_clip": 1.00641608, + "balance_loss_mlp": 1.00901365, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7806934940306165, + "language_loss": 0.55185556, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57220137, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 3.154491662979126 + }, + { + "auxiliary_loss_clip": 0.01088133, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.03701448, + "balance_loss_mlp": 1.01937163, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 3.5666796165274666, + "language_loss": 0.80268174, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82390773, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 2.5968382358551025 + }, + { + "auxiliary_loss_clip": 0.01061853, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.0312202, + "balance_loss_mlp": 1.02072954, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 2.083527155737033, + "language_loss": 0.78420347, + "learning_rate": 3.499601265005622e-06, + "loss": 0.8051694, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 2.670297861099243 + }, + { + "auxiliary_loss_clip": 0.01106849, + "auxiliary_loss_mlp": 0.01035895, + "balance_loss_clip": 1.03567839, + "balance_loss_mlp": 1.01977801, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 1.8867117821966097, + "language_loss": 0.5361315, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55755889, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 2.628139019012451 + }, + { + "auxiliary_loss_clip": 0.01092318, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.03588843, + "balance_loss_mlp": 1.02349997, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.5421752292710473, + "language_loss": 0.65155941, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67288339, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 2.626065254211426 + }, + { + "auxiliary_loss_clip": 0.01022783, + "auxiliary_loss_mlp": 0.01001795, + "balance_loss_clip": 1.00731707, + "balance_loss_mlp": 1.00000668, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.853518923795385, + "language_loss": 0.58014536, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60039115, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 2.869955062866211 + }, + { + "auxiliary_loss_clip": 0.01097977, + "auxiliary_loss_mlp": 0.01040302, + "balance_loss_clip": 1.03917861, + "balance_loss_mlp": 1.02457869, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.6361556162333903, + "language_loss": 0.83082128, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85220408, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.726151466369629 + }, + { + "auxiliary_loss_clip": 0.01106739, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.04023314, + "balance_loss_mlp": 1.01838088, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.8335874428433205, + "language_loss": 0.80046201, + "learning_rate": 3.498312090875666e-06, + "loss": 0.8218739, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.5844390392303467 + }, + { + "auxiliary_loss_clip": 0.01088195, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.03287899, + "balance_loss_mlp": 1.01853776, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 7.72129312411229, + "language_loss": 0.75153756, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77275306, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.573033094406128 + }, + { + "auxiliary_loss_clip": 0.01111305, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.03843582, + "balance_loss_mlp": 1.02145362, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.7156908228313876, + "language_loss": 0.74750644, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76899481, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.5983352661132812 + }, + { + "auxiliary_loss_clip": 0.01112737, + "auxiliary_loss_mlp": 0.01041491, + "balance_loss_clip": 1.04067957, + "balance_loss_mlp": 1.02524281, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 1.8537728080416227, + "language_loss": 0.81456614, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83610845, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.5623064041137695 + }, + { + "auxiliary_loss_clip": 0.01072616, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.03734648, + "balance_loss_mlp": 1.03023243, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.0590086101891085, + "language_loss": 0.70592844, + "learning_rate": 3.497279728822468e-06, + "loss": 0.72714037, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 2.657015085220337 + }, + { + "auxiliary_loss_clip": 0.01120486, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.03855717, + "balance_loss_mlp": 1.02177358, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 1.8620605163705004, + "language_loss": 0.61939031, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64096469, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.5252609252929688 + }, + { + "auxiliary_loss_clip": 0.01112019, + "auxiliary_loss_mlp": 0.01047599, + "balance_loss_clip": 1.03945136, + "balance_loss_mlp": 1.031708, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 2.4125839355993945, + "language_loss": 0.74629617, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76789236, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.5331945419311523 + }, + { + "auxiliary_loss_clip": 0.01063727, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.03491998, + "balance_loss_mlp": 1.01714444, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.684540953024581, + "language_loss": 0.79622495, + "learning_rate": 3.49650486108985e-06, + "loss": 0.81717873, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.65990948677063 + }, + { + "auxiliary_loss_clip": 0.01104594, + "auxiliary_loss_mlp": 0.00750283, + "balance_loss_clip": 1.03672719, + "balance_loss_mlp": 1.00044262, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.5822006433375366, + "language_loss": 0.77381915, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79236794, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.562333345413208 + }, + { + "auxiliary_loss_clip": 0.01107024, + "auxiliary_loss_mlp": 0.01048574, + "balance_loss_clip": 1.03828597, + "balance_loss_mlp": 1.03287411, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6976031963993339, + "language_loss": 0.84349012, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86504608, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.561999559402466 + }, + { + "auxiliary_loss_clip": 0.01116822, + "auxiliary_loss_mlp": 0.01041097, + "balance_loss_clip": 1.03780437, + "balance_loss_mlp": 1.02489626, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 1.4209974577861975, + "language_loss": 0.70703399, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.72861314, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.55146861076355 + }, + { + "auxiliary_loss_clip": 0.01032413, + "auxiliary_loss_mlp": 0.0100773, + "balance_loss_clip": 1.00629056, + "balance_loss_mlp": 1.00585866, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 1.0175078429318518, + "language_loss": 0.61838359, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.638785, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 2.904221296310425 + }, + { + "auxiliary_loss_clip": 0.01098271, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.03570354, + "balance_loss_mlp": 1.02004051, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 2.1117930480989675, + "language_loss": 0.86302757, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88438296, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.536909341812134 + }, + { + "auxiliary_loss_clip": 0.01077722, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.03787792, + "balance_loss_mlp": 1.02342057, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.7853472409005096, + "language_loss": 0.77173495, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79290724, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.6230857372283936 + }, + { + "auxiliary_loss_clip": 0.01106566, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.0377183, + "balance_loss_mlp": 1.02543354, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 2.1188866701218316, + "language_loss": 0.75216258, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.7736426, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.5072827339172363 + }, + { + "auxiliary_loss_clip": 0.01106125, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.03788126, + "balance_loss_mlp": 1.01831055, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 1.7076123958802563, + "language_loss": 0.73769826, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.75910211, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 4.0151519775390625 + }, + { + "auxiliary_loss_clip": 0.01122322, + "auxiliary_loss_mlp": 0.01041032, + "balance_loss_clip": 1.04102492, + "balance_loss_mlp": 1.02441478, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 2.503326847450414, + "language_loss": 0.86314023, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88477373, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.6375575065612793 + }, + { + "auxiliary_loss_clip": 0.01051581, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.03608847, + "balance_loss_mlp": 1.02115345, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.572022356396922, + "language_loss": 0.74870688, + "learning_rate": 3.493918281539737e-06, + "loss": 0.76957345, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 4.352061748504639 + }, + { + "auxiliary_loss_clip": 0.01092318, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.03810835, + "balance_loss_mlp": 1.02489471, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.469512699113888, + "language_loss": 0.74695623, + "learning_rate": 3.493659311850379e-06, + "loss": 0.76827639, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.7861363887786865 + }, + { + "auxiliary_loss_clip": 0.01099058, + "auxiliary_loss_mlp": 0.00750727, + "balance_loss_clip": 1.04341519, + "balance_loss_mlp": 1.00058365, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 1.9189555891418417, + "language_loss": 0.64761937, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.66611719, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 4.307919979095459 + }, + { + "auxiliary_loss_clip": 0.01117045, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.03723955, + "balance_loss_mlp": 1.0192101, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.8139950216411063, + "language_loss": 0.6722157, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69371855, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 2.597785711288452 + }, + { + "auxiliary_loss_clip": 0.01121275, + "auxiliary_loss_mlp": 0.01045548, + "balance_loss_clip": 1.0393647, + "balance_loss_mlp": 1.0300622, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 2.328015446308633, + "language_loss": 0.75071406, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77238226, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.525428533554077 + }, + { + "auxiliary_loss_clip": 0.01110859, + "auxiliary_loss_mlp": 0.01043817, + "balance_loss_clip": 1.04024601, + "balance_loss_mlp": 1.02723503, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 5.640749737107475, + "language_loss": 0.80866295, + "learning_rate": 3.492622866794074e-06, + "loss": 0.83020973, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 2.611921548843384 + }, + { + "auxiliary_loss_clip": 0.01104402, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.0410862, + "balance_loss_mlp": 1.02328587, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.8136317660174812, + "language_loss": 0.77363658, + "learning_rate": 3.492363614004407e-06, + "loss": 0.79507148, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.530618906021118 + }, + { + "auxiliary_loss_clip": 0.01124765, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.04012501, + "balance_loss_mlp": 1.01774502, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 1.9015665689954158, + "language_loss": 0.83493799, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85653579, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 4.121895790100098 + }, + { + "auxiliary_loss_clip": 0.01108899, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.03854954, + "balance_loss_mlp": 1.02467799, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6117157726707594, + "language_loss": 0.7343452, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.7558375, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 2.7583649158477783 + }, + { + "auxiliary_loss_clip": 0.01120464, + "auxiliary_loss_mlp": 0.00750428, + "balance_loss_clip": 1.03903866, + "balance_loss_mlp": 1.00059056, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.4458527851788507, + "language_loss": 0.72237706, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74108595, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 2.733853578567505 + }, + { + "auxiliary_loss_clip": 0.01106751, + "auxiliary_loss_mlp": 0.01039054, + "balance_loss_clip": 1.03738189, + "balance_loss_mlp": 1.02284169, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 3.3532348595948718, + "language_loss": 0.81969011, + "learning_rate": 3.491326037038301e-06, + "loss": 0.8411482, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.6285762786865234 + }, + { + "auxiliary_loss_clip": 0.01025413, + "auxiliary_loss_mlp": 0.01006244, + "balance_loss_clip": 1.01010942, + "balance_loss_mlp": 1.00438488, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.7042621004137257, + "language_loss": 0.57727188, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59758848, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 3.2067911624908447 + }, + { + "auxiliary_loss_clip": 0.01121331, + "auxiliary_loss_mlp": 0.01047589, + "balance_loss_clip": 1.03894496, + "balance_loss_mlp": 1.03206801, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 2.1308057357637393, + "language_loss": 0.6521244, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67381358, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 2.5841472148895264 + }, + { + "auxiliary_loss_clip": 0.01100665, + "auxiliary_loss_mlp": 0.01039633, + "balance_loss_clip": 1.03585553, + "balance_loss_mlp": 1.02614474, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.9034499572372263, + "language_loss": 0.81742299, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83882594, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 2.592027425765991 + }, + { + "auxiliary_loss_clip": 0.01112819, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.03814495, + "balance_loss_mlp": 1.02232957, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.2799819498467317, + "language_loss": 0.83494776, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85647213, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 2.562542200088501 + }, + { + "auxiliary_loss_clip": 0.01084902, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.034989, + "balance_loss_mlp": 1.02247405, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 1.8715510470160994, + "language_loss": 0.83987153, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86110407, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 2.5466084480285645 + }, + { + "auxiliary_loss_clip": 0.00984601, + "auxiliary_loss_mlp": 0.0102197, + "balance_loss_clip": 1.01382089, + "balance_loss_mlp": 1.01990759, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7545211811536083, + "language_loss": 0.56278539, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58285111, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 3.241708993911743 + }, + { + "auxiliary_loss_clip": 0.0109175, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.0348165, + "balance_loss_mlp": 1.01622653, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 6.695800116057339, + "language_loss": 0.80866688, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82990414, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 2.90364146232605 + }, + { + "auxiliary_loss_clip": 0.01009291, + "auxiliary_loss_mlp": 0.01002522, + "balance_loss_clip": 1.01277471, + "balance_loss_mlp": 1.00072217, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7957613626251387, + "language_loss": 0.66129661, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68141472, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 3.2051525115966797 + }, + { + "auxiliary_loss_clip": 0.0110489, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.03768229, + "balance_loss_mlp": 1.01933873, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.1980522967502516, + "language_loss": 0.73127943, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75265902, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.5639071464538574 + }, + { + "auxiliary_loss_clip": 0.0106723, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.03310466, + "balance_loss_mlp": 1.02255654, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 2.2680980482750748, + "language_loss": 0.72933447, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75038046, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.639538288116455 + }, + { + "auxiliary_loss_clip": 0.01069119, + "auxiliary_loss_mlp": 0.00750318, + "balance_loss_clip": 1.03374982, + "balance_loss_mlp": 1.00059354, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.798515248073073, + "language_loss": 0.81256545, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.83075982, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.69126296043396 + }, + { + "auxiliary_loss_clip": 0.0109519, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.03725028, + "balance_loss_mlp": 1.0215888, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 2.4379170844016955, + "language_loss": 0.85377514, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87509418, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.611398220062256 + }, + { + "auxiliary_loss_clip": 0.01075982, + "auxiliary_loss_mlp": 0.01044278, + "balance_loss_clip": 1.03696144, + "balance_loss_mlp": 1.02807784, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 1.8160319394081514, + "language_loss": 0.74523747, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.76644009, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.7089831829071045 + }, + { + "auxiliary_loss_clip": 0.00995757, + "auxiliary_loss_mlp": 0.01001976, + "balance_loss_clip": 1.01554132, + "balance_loss_mlp": 1.00037825, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.806386809721618, + "language_loss": 0.65233308, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67231041, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.179140567779541 + }, + { + "auxiliary_loss_clip": 0.01068039, + "auxiliary_loss_mlp": 0.00749985, + "balance_loss_clip": 1.03687286, + "balance_loss_mlp": 1.00060081, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.6973903127657999, + "language_loss": 0.76878035, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.7869606, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.846844434738159 + }, + { + "auxiliary_loss_clip": 0.01003588, + "auxiliary_loss_mlp": 0.01006368, + "balance_loss_clip": 1.00822473, + "balance_loss_mlp": 1.00448442, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7852849884366745, + "language_loss": 0.58459735, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60469687, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.2606053352355957 + }, + { + "auxiliary_loss_clip": 0.01110525, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_clip": 1.03956056, + "balance_loss_mlp": 1.02790499, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.7775290671738444, + "language_loss": 0.76423293, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.78578126, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 2.645934820175171 + }, + { + "auxiliary_loss_clip": 0.01116813, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.03928471, + "balance_loss_mlp": 1.0229888, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.5578824573724541, + "language_loss": 0.82994384, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85147709, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.633574962615967 + }, + { + "auxiliary_loss_clip": 0.01109064, + "auxiliary_loss_mlp": 0.01041839, + "balance_loss_clip": 1.04034603, + "balance_loss_mlp": 1.02516127, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.52543970745844, + "language_loss": 0.74237311, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76388216, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.584885358810425 + }, + { + "auxiliary_loss_clip": 0.01093613, + "auxiliary_loss_mlp": 0.0075009, + "balance_loss_clip": 1.04566026, + "balance_loss_mlp": 1.00062656, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.700473315456462, + "language_loss": 0.82925093, + "learning_rate": 3.486124592522163e-06, + "loss": 0.8476879, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.660261392593384 + }, + { + "auxiliary_loss_clip": 0.01106995, + "auxiliary_loss_mlp": 0.010413, + "balance_loss_clip": 1.04099, + "balance_loss_mlp": 1.02578533, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.6416519835611052, + "language_loss": 0.74493337, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76641637, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.6421523094177246 + }, + { + "auxiliary_loss_clip": 0.01080805, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.03283954, + "balance_loss_mlp": 1.02280569, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 2.068552059693898, + "language_loss": 0.81635594, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83753604, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.64970064163208 + }, + { + "auxiliary_loss_clip": 0.01053162, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_clip": 1.0327481, + "balance_loss_mlp": 1.02743912, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6675612519096046, + "language_loss": 0.79008055, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81104481, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.7276358604431152 + }, + { + "auxiliary_loss_clip": 0.01068305, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.03437042, + "balance_loss_mlp": 1.02160728, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.6829179986390974, + "language_loss": 0.78719521, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.80824792, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.6992406845092773 + }, + { + "auxiliary_loss_clip": 0.01076613, + "auxiliary_loss_mlp": 0.00750716, + "balance_loss_clip": 1.0361973, + "balance_loss_mlp": 1.0005635, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 1.5225974815891063, + "language_loss": 0.67829365, + "learning_rate": 3.484820706183595e-06, + "loss": 0.69656694, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.6820573806762695 + }, + { + "auxiliary_loss_clip": 0.01096831, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.03907847, + "balance_loss_mlp": 1.02456713, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 4.453803365195682, + "language_loss": 0.78713787, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80850261, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.584442377090454 + }, + { + "auxiliary_loss_clip": 0.01065546, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.03227854, + "balance_loss_mlp": 1.02091813, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.0049562444778393, + "language_loss": 0.67603481, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.69707668, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.7664520740509033 + }, + { + "auxiliary_loss_clip": 0.0110688, + "auxiliary_loss_mlp": 0.0075021, + "balance_loss_clip": 1.03742027, + "balance_loss_mlp": 1.00060058, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 1.4010755582988659, + "language_loss": 0.87575179, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.89432269, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.617971897125244 + }, + { + "auxiliary_loss_clip": 0.01100269, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_clip": 1.04399991, + "balance_loss_mlp": 1.02423334, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 3.3497390816872854, + "language_loss": 0.81783414, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83924103, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 4.08509373664856 + }, + { + "auxiliary_loss_clip": 0.01068449, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.03160322, + "balance_loss_mlp": 1.02229249, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.6981463318672874, + "language_loss": 0.77058995, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79164869, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.6819238662719727 + }, + { + "auxiliary_loss_clip": 0.01085033, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.03320158, + "balance_loss_mlp": 1.01744616, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.553740417141596, + "language_loss": 0.83792722, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.85909879, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 4.385515928268433 + }, + { + "auxiliary_loss_clip": 0.01095424, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.04050648, + "balance_loss_mlp": 1.01859641, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 2.227896671893764, + "language_loss": 0.78564048, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80693537, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 4.178746461868286 + }, + { + "auxiliary_loss_clip": 0.01109102, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.04029632, + "balance_loss_mlp": 1.02385724, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 2.394061666373625, + "language_loss": 0.7931298, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81461108, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.715804100036621 + }, + { + "auxiliary_loss_clip": 0.01116035, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.03732729, + "balance_loss_mlp": 1.02001297, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 1.965269840881921, + "language_loss": 0.78338975, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80489463, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 2.5169386863708496 + }, + { + "auxiliary_loss_clip": 0.01098539, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.03836823, + "balance_loss_mlp": 1.0205456, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.051656883091833, + "language_loss": 0.75001752, + "learning_rate": 3.482208711902952e-06, + "loss": 0.77135658, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 2.6172821521759033 + }, + { + "auxiliary_loss_clip": 0.0110648, + "auxiliary_loss_mlp": 0.01042723, + "balance_loss_clip": 1.03730822, + "balance_loss_mlp": 1.02809, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 1.9472668983626105, + "language_loss": 0.85524577, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87673783, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.539915084838867 + }, + { + "auxiliary_loss_clip": 0.01106712, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.0376296, + "balance_loss_mlp": 1.01883435, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.0039417701151443, + "language_loss": 0.7859363, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80734253, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 4.049840211868286 + }, + { + "auxiliary_loss_clip": 0.01081285, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.03585649, + "balance_loss_mlp": 1.01866186, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 3.4675461091996294, + "language_loss": 0.87404883, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.8951968, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 2.6861186027526855 + }, + { + "auxiliary_loss_clip": 0.01118859, + "auxiliary_loss_mlp": 0.01039435, + "balance_loss_clip": 1.03872466, + "balance_loss_mlp": 1.0250107, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.438425358587016, + "language_loss": 0.70078218, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72236514, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 2.6171045303344727 + }, + { + "auxiliary_loss_clip": 0.01114641, + "auxiliary_loss_mlp": 0.00749864, + "balance_loss_clip": 1.04003501, + "balance_loss_mlp": 1.00048018, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 1.9244448223311226, + "language_loss": 0.80527568, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82392073, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 2.5600664615631104 + }, + { + "auxiliary_loss_clip": 0.010795, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.04812312, + "balance_loss_mlp": 1.02007437, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.1432045874898056, + "language_loss": 0.70627779, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.7274099, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 2.8215885162353516 + }, + { + "auxiliary_loss_clip": 0.01098284, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.04061127, + "balance_loss_mlp": 1.02217054, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 1.9087421644147355, + "language_loss": 0.58456254, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60590416, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 2.6159298419952393 + }, + { + "auxiliary_loss_clip": 0.01110944, + "auxiliary_loss_mlp": 0.01043431, + "balance_loss_clip": 1.04075599, + "balance_loss_mlp": 1.0285058, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.5661790248138883, + "language_loss": 0.63925731, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66080111, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 2.6296045780181885 + }, + { + "auxiliary_loss_clip": 0.01098709, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.03879094, + "balance_loss_mlp": 1.01705194, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 1.91954935138386, + "language_loss": 0.71676624, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.73807871, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 2.7493271827697754 + }, + { + "auxiliary_loss_clip": 0.01082059, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.03725266, + "balance_loss_mlp": 1.01568079, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.4696660600272202, + "language_loss": 0.77603287, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79714596, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.816917657852173 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.00750038, + "balance_loss_clip": 1.03788209, + "balance_loss_mlp": 1.00054455, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 2.569944596617897, + "language_loss": 0.85620928, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87487113, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 2.520689010620117 + }, + { + "auxiliary_loss_clip": 0.01079047, + "auxiliary_loss_mlp": 0.01048029, + "balance_loss_clip": 1.03572607, + "balance_loss_mlp": 1.03058875, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 1.7917196531261457, + "language_loss": 0.72631454, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.7475853, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 2.6976981163024902 + }, + { + "auxiliary_loss_clip": 0.01120049, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.03930092, + "balance_loss_mlp": 1.01909173, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4978366870984194, + "language_loss": 0.81506991, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83661366, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.549090623855591 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.04253232, + "balance_loss_mlp": 1.02363348, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 2.1537157011420214, + "language_loss": 0.67310143, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69471264, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 2.6343424320220947 + }, + { + "auxiliary_loss_clip": 0.01078654, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.03643298, + "balance_loss_mlp": 1.02176452, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.8788864048079403, + "language_loss": 0.75172359, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77286506, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 2.640202283859253 + }, + { + "auxiliary_loss_clip": 0.01067905, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_clip": 1.03208303, + "balance_loss_mlp": 1.03839087, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.0682397874033516, + "language_loss": 0.80724978, + "learning_rate": 3.478017834441318e-06, + "loss": 0.82847959, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.746474504470825 + }, + { + "auxiliary_loss_clip": 0.01032429, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_clip": 1.03999996, + "balance_loss_mlp": 1.02928567, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 1.9552317379291078, + "language_loss": 0.72546422, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74624497, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 3.0786240100860596 + }, + { + "auxiliary_loss_clip": 0.01062705, + "auxiliary_loss_mlp": 0.01035211, + "balance_loss_clip": 1.03959179, + "balance_loss_mlp": 1.02013135, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 2.129296672637495, + "language_loss": 0.86725771, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88823688, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 2.942887306213379 + }, + { + "auxiliary_loss_clip": 0.01119265, + "auxiliary_loss_mlp": 0.01040347, + "balance_loss_clip": 1.0407306, + "balance_loss_mlp": 1.02676272, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 2.2568845954109666, + "language_loss": 0.84942329, + "learning_rate": 3.477230446361943e-06, + "loss": 0.87101936, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.5177180767059326 + }, + { + "auxiliary_loss_clip": 0.0110794, + "auxiliary_loss_mlp": 0.00750122, + "balance_loss_clip": 1.03944969, + "balance_loss_mlp": 1.00052166, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.462557357604908, + "language_loss": 0.83227229, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85085291, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.6148464679718018 + }, + { + "auxiliary_loss_clip": 0.01082574, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.03454792, + "balance_loss_mlp": 1.02041161, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.3231197004718296, + "language_loss": 0.82972431, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.85089397, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 2.5941078662872314 + }, + { + "auxiliary_loss_clip": 0.01109239, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.0408808, + "balance_loss_mlp": 1.02139211, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.019805492147851, + "language_loss": 0.67570466, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69716257, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.694553852081299 + }, + { + "auxiliary_loss_clip": 0.01109273, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.03813457, + "balance_loss_mlp": 1.02127802, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.594812107709945, + "language_loss": 0.80843151, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.82988787, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.5082435607910156 + }, + { + "auxiliary_loss_clip": 0.01078301, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.04065049, + "balance_loss_mlp": 1.02780735, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.7931277307928473, + "language_loss": 0.92559522, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94681013, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.6351711750030518 + }, + { + "auxiliary_loss_clip": 0.01111261, + "auxiliary_loss_mlp": 0.01036971, + "balance_loss_clip": 1.04144752, + "balance_loss_mlp": 1.02223635, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.8670600745159545, + "language_loss": 0.67752814, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69901049, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.6191444396972656 + }, + { + "auxiliary_loss_clip": 0.01087109, + "auxiliary_loss_mlp": 0.01044332, + "balance_loss_clip": 1.0376029, + "balance_loss_mlp": 1.02976418, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.7422306782048107, + "language_loss": 0.72352946, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74484384, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.6770577430725098 + }, + { + "auxiliary_loss_clip": 0.01075065, + "auxiliary_loss_mlp": 0.00750451, + "balance_loss_clip": 1.03723168, + "balance_loss_mlp": 1.0005517, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.1780061889880264, + "language_loss": 0.76345932, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.78171444, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.6498281955718994 + }, + { + "auxiliary_loss_clip": 0.0102385, + "auxiliary_loss_mlp": 0.01021154, + "balance_loss_clip": 1.0167253, + "balance_loss_mlp": 1.01936626, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8519752728596227, + "language_loss": 0.57114565, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59159571, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.104902505874634 + }, + { + "auxiliary_loss_clip": 0.01094391, + "auxiliary_loss_mlp": 0.01037806, + "balance_loss_clip": 1.0374763, + "balance_loss_mlp": 1.02338147, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.6722652478038416, + "language_loss": 0.71393055, + "learning_rate": 3.474602179854327e-06, + "loss": 0.7352525, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.590949773788452 + }, + { + "auxiliary_loss_clip": 0.01122943, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.04118896, + "balance_loss_mlp": 1.02595615, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 1.947814684801773, + "language_loss": 0.84544122, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86707777, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.5176830291748047 + }, + { + "auxiliary_loss_clip": 0.01110583, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.04365253, + "balance_loss_mlp": 1.02307725, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.5174961978574013, + "language_loss": 0.84589171, + "learning_rate": 3.474075855228966e-06, + "loss": 0.86736542, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.600125312805176 + }, + { + "auxiliary_loss_clip": 0.01112769, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.04242587, + "balance_loss_mlp": 1.02246666, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 1.9151121940228197, + "language_loss": 0.77275044, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79425347, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.6007986068725586 + }, + { + "auxiliary_loss_clip": 0.01077732, + "auxiliary_loss_mlp": 0.01036284, + "balance_loss_clip": 1.03467822, + "balance_loss_mlp": 1.02078629, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 2.186185052460068, + "language_loss": 0.72651309, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74765325, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 4.181470155715942 + }, + { + "auxiliary_loss_clip": 0.01120468, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.04128432, + "balance_loss_mlp": 1.02014852, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 1.794404015301163, + "language_loss": 0.70286274, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72441673, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.5316381454467773 + }, + { + "auxiliary_loss_clip": 0.01119807, + "auxiliary_loss_mlp": 0.01038956, + "balance_loss_clip": 1.04094172, + "balance_loss_mlp": 1.02519357, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 2.0870755827844385, + "language_loss": 0.80451101, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82609868, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.5251729488372803 + }, + { + "auxiliary_loss_clip": 0.01079067, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.03588533, + "balance_loss_mlp": 1.02402127, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 3.945001776373629, + "language_loss": 0.671996, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69318819, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 4.415801048278809 + }, + { + "auxiliary_loss_clip": 0.01066986, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.03674769, + "balance_loss_mlp": 1.02343357, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5271906976324745, + "language_loss": 0.79370713, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81475556, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 4.185474395751953 + }, + { + "auxiliary_loss_clip": 0.01070135, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.03742838, + "balance_loss_mlp": 1.0192318, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.6442486570509027, + "language_loss": 0.77914429, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80019361, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 2.7491872310638428 + }, + { + "auxiliary_loss_clip": 0.01123481, + "auxiliary_loss_mlp": 0.01042719, + "balance_loss_clip": 1.04376554, + "balance_loss_mlp": 1.02732873, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.216802081545645, + "language_loss": 0.78176087, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.80342287, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 2.5324389934539795 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.01039567, + "balance_loss_clip": 1.03871822, + "balance_loss_mlp": 1.02339017, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.5861238273004319, + "language_loss": 0.76441205, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78599173, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.5424368381500244 + }, + { + "auxiliary_loss_clip": 0.01093411, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.03744578, + "balance_loss_mlp": 1.02079201, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.8057791207132952, + "language_loss": 0.76426512, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.7855556, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 4.1274094581604 + }, + { + "auxiliary_loss_clip": 0.01087515, + "auxiliary_loss_mlp": 0.01035225, + "balance_loss_clip": 1.03943634, + "balance_loss_mlp": 1.01958489, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.4621006798759286, + "language_loss": 0.70923674, + "learning_rate": 3.471177075288801e-06, + "loss": 0.7304641, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 2.6520943641662598 + }, + { + "auxiliary_loss_clip": 0.01092565, + "auxiliary_loss_mlp": 0.01040387, + "balance_loss_clip": 1.03669739, + "balance_loss_mlp": 1.02308977, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 1.9185073881627135, + "language_loss": 0.74773699, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76906645, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 2.5772364139556885 + }, + { + "auxiliary_loss_clip": 0.01088674, + "auxiliary_loss_mlp": 0.01040569, + "balance_loss_clip": 1.03837061, + "balance_loss_mlp": 1.02482152, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 2.7327771992796452, + "language_loss": 0.73455489, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75584733, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 2.6702160835266113 + }, + { + "auxiliary_loss_clip": 0.01114134, + "auxiliary_loss_mlp": 0.00750568, + "balance_loss_clip": 1.03937149, + "balance_loss_mlp": 1.00053072, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.2257818774050864, + "language_loss": 0.67014766, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.68879473, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 2.5984771251678467 + }, + { + "auxiliary_loss_clip": 0.01070479, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.0371418, + "balance_loss_mlp": 1.02191007, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 1.8523574698243253, + "language_loss": 0.71132314, + "learning_rate": 3.470121299177082e-06, + "loss": 0.73238385, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 2.7759816646575928 + }, + { + "auxiliary_loss_clip": 0.01106819, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.03694546, + "balance_loss_mlp": 1.01852131, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 1.8206354869702905, + "language_loss": 0.73169029, + "learning_rate": 3.469857215756257e-06, + "loss": 0.75309348, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 2.638197422027588 + }, + { + "auxiliary_loss_clip": 0.01085902, + "auxiliary_loss_mlp": 0.00750224, + "balance_loss_clip": 1.03573918, + "balance_loss_mlp": 1.00042391, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.8775410167074584, + "language_loss": 0.86350358, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88186491, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 2.661569595336914 + }, + { + "auxiliary_loss_clip": 0.01120673, + "auxiliary_loss_mlp": 0.00750302, + "balance_loss_clip": 1.03924417, + "balance_loss_mlp": 1.0004344, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.4543239507527181, + "language_loss": 0.80322266, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82193244, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0109153, + "auxiliary_loss_mlp": 0.00750198, + "balance_loss_clip": 1.03524137, + "balance_loss_mlp": 1.00045061, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.4353853712899498, + "language_loss": 0.88012272, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89854002, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.6751582622528076 + }, + { + "auxiliary_loss_clip": 0.01117391, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.03957701, + "balance_loss_mlp": 1.02029037, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 3.503325060643671, + "language_loss": 0.77651882, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79803127, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 2.720370292663574 + }, + { + "auxiliary_loss_clip": 0.01120844, + "auxiliary_loss_mlp": 0.01048083, + "balance_loss_clip": 1.03968298, + "balance_loss_mlp": 1.0328896, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.8195678956757972, + "language_loss": 0.7499398, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.7716291, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.557607889175415 + }, + { + "auxiliary_loss_clip": 0.01096418, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.04019094, + "balance_loss_mlp": 1.02582645, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.5853450575093657, + "language_loss": 0.6924898, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71385396, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.624631881713867 + }, + { + "auxiliary_loss_clip": 0.01088428, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.03753757, + "balance_loss_mlp": 1.02799797, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.0952270959862958, + "language_loss": 0.79690087, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81821507, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 2.7194669246673584 + }, + { + "auxiliary_loss_clip": 0.0111242, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_clip": 1.03659844, + "balance_loss_mlp": 1.02911425, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.8181122548288577, + "language_loss": 0.80729693, + "learning_rate": 3.467742542694501e-06, + "loss": 0.8288523, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.507322311401367 + }, + { + "auxiliary_loss_clip": 0.01092235, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.03568065, + "balance_loss_mlp": 1.02596354, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.7292385598828417, + "language_loss": 0.8024689, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82379657, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.721803903579712 + }, + { + "auxiliary_loss_clip": 0.01026228, + "auxiliary_loss_mlp": 0.01014328, + "balance_loss_clip": 1.01030087, + "balance_loss_mlp": 1.01256406, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.842433479121426, + "language_loss": 0.60801053, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62841618, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.093393325805664 + }, + { + "auxiliary_loss_clip": 0.01084678, + "auxiliary_loss_mlp": 0.01042092, + "balance_loss_clip": 1.03737617, + "balance_loss_mlp": 1.02707207, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 2.3021893371841764, + "language_loss": 0.77034843, + "learning_rate": 3.46694862168102e-06, + "loss": 0.7916162, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.6196956634521484 + }, + { + "auxiliary_loss_clip": 0.01095127, + "auxiliary_loss_mlp": 0.01041129, + "balance_loss_clip": 1.03666437, + "balance_loss_mlp": 1.02561426, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 3.31243982681902, + "language_loss": 0.74053252, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76189506, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.6814355850219727 + }, + { + "auxiliary_loss_clip": 0.01110859, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.03818941, + "balance_loss_mlp": 1.02180886, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.2967291265778766, + "language_loss": 0.80677605, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82825875, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.548750877380371 + }, + { + "auxiliary_loss_clip": 0.01065009, + "auxiliary_loss_mlp": 0.01041646, + "balance_loss_clip": 1.03565347, + "balance_loss_mlp": 1.02786565, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5485434135909775, + "language_loss": 0.76516032, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78622687, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 2.723365068435669 + }, + { + "auxiliary_loss_clip": 0.0104933, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.03424478, + "balance_loss_mlp": 1.0264442, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.5997561427337665, + "language_loss": 0.82637596, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84728253, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 2.7222883701324463 + }, + { + "auxiliary_loss_clip": 0.01116822, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.03769398, + "balance_loss_mlp": 1.02269411, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.218811328184447, + "language_loss": 0.76456821, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78611606, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.607661008834839 + }, + { + "auxiliary_loss_clip": 0.01104346, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.03546691, + "balance_loss_mlp": 1.01870811, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.6659042870240777, + "language_loss": 0.66141808, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.6828078, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.6962530612945557 + }, + { + "auxiliary_loss_clip": 0.01049471, + "auxiliary_loss_mlp": 0.0103824, + "balance_loss_clip": 1.03408647, + "balance_loss_mlp": 1.02226043, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.0631732787129407, + "language_loss": 0.73184252, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75271964, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.69242787361145 + }, + { + "auxiliary_loss_clip": 0.01119752, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.03920782, + "balance_loss_mlp": 1.02089441, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 2.089722125842379, + "language_loss": 0.86875248, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.89030385, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.52994966506958 + }, + { + "auxiliary_loss_clip": 0.01091545, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.03569818, + "balance_loss_mlp": 1.02895868, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.776487726839746, + "language_loss": 0.7661562, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78750563, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.608107805252075 + }, + { + "auxiliary_loss_clip": 0.01109748, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.03870606, + "balance_loss_mlp": 1.02673113, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 2.931729834463681, + "language_loss": 0.75785208, + "learning_rate": 3.464298604081606e-06, + "loss": 0.77936542, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.604787826538086 + }, + { + "auxiliary_loss_clip": 0.01075364, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.03601146, + "balance_loss_mlp": 1.01659012, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4503202504284154, + "language_loss": 0.73283887, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75390071, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.7314748764038086 + }, + { + "auxiliary_loss_clip": 0.01079578, + "auxiliary_loss_mlp": 0.0104134, + "balance_loss_clip": 1.03511322, + "balance_loss_mlp": 1.02603972, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.7320954497564647, + "language_loss": 0.90613675, + "learning_rate": 3.463767933923799e-06, + "loss": 0.92734587, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.611793279647827 + }, + { + "auxiliary_loss_clip": 0.0110439, + "auxiliary_loss_mlp": 0.01038549, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.02491736, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 3.8826679465461136, + "language_loss": 0.7980395, + "learning_rate": 3.463502515580524e-06, + "loss": 0.81946886, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.5961592197418213 + }, + { + "auxiliary_loss_clip": 0.01100105, + "auxiliary_loss_mlp": 0.01036494, + "balance_loss_clip": 1.0350219, + "balance_loss_mlp": 1.02243936, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8428190399864786, + "language_loss": 0.62529945, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64666545, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.6198198795318604 + }, + { + "auxiliary_loss_clip": 0.01108385, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.03617203, + "balance_loss_mlp": 1.01897228, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 1.9501151760073432, + "language_loss": 0.8327328, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85415602, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.5634992122650146 + }, + { + "auxiliary_loss_clip": 0.01030427, + "auxiliary_loss_mlp": 0.01004147, + "balance_loss_clip": 1.01409185, + "balance_loss_mlp": 1.00266922, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8013960056279668, + "language_loss": 0.70585662, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72620243, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 4.510406255722046 + }, + { + "auxiliary_loss_clip": 0.0107988, + "auxiliary_loss_mlp": 0.01046565, + "balance_loss_clip": 1.03179502, + "balance_loss_mlp": 1.02993584, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 3.991557962041616, + "language_loss": 0.77759296, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79885745, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 2.6840550899505615 + }, + { + "auxiliary_loss_clip": 0.01054045, + "auxiliary_loss_mlp": 0.01043894, + "balance_loss_clip": 1.03099942, + "balance_loss_mlp": 1.02829528, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 12.740533013964885, + "language_loss": 0.68873298, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70971239, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 4.188509225845337 + }, + { + "auxiliary_loss_clip": 0.0106734, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.03630877, + "balance_loss_mlp": 1.01766062, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 2.414442890699177, + "language_loss": 0.67673337, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69774729, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 2.690075397491455 + }, + { + "auxiliary_loss_clip": 0.01025787, + "auxiliary_loss_mlp": 0.010153, + "balance_loss_clip": 1.0093081, + "balance_loss_mlp": 1.01394141, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.7792577195177004, + "language_loss": 0.53098512, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55139601, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 4.603490114212036 + }, + { + "auxiliary_loss_clip": 0.01105734, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.03718567, + "balance_loss_mlp": 1.02395618, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.7453758426326584, + "language_loss": 0.84160221, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86304724, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 2.6372861862182617 + }, + { + "auxiliary_loss_clip": 0.01095581, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.03483272, + "balance_loss_mlp": 1.01923537, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.1887623513615155, + "language_loss": 0.67307842, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69440043, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.6451096534729004 + }, + { + "auxiliary_loss_clip": 0.01087296, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.03236246, + "balance_loss_mlp": 1.01868975, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 2.63420286312765, + "language_loss": 0.78103578, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80223978, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 2.6025357246398926 + }, + { + "auxiliary_loss_clip": 0.01087448, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.03379929, + "balance_loss_mlp": 1.02225876, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.6174997909268543, + "language_loss": 0.67990601, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70114243, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 4.217071533203125 + }, + { + "auxiliary_loss_clip": 0.01109137, + "auxiliary_loss_mlp": 0.01048384, + "balance_loss_clip": 1.03719926, + "balance_loss_mlp": 1.03307796, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 2.280856874039608, + "language_loss": 0.84386134, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86543655, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 2.5565176010131836 + }, + { + "auxiliary_loss_clip": 0.01076606, + "auxiliary_loss_mlp": 0.01040898, + "balance_loss_clip": 1.03520012, + "balance_loss_mlp": 1.02482843, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 2.6020276813271246, + "language_loss": 0.64810967, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.6692847, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 2.7068190574645996 + }, + { + "auxiliary_loss_clip": 0.01024071, + "auxiliary_loss_mlp": 0.01005002, + "balance_loss_clip": 1.01809156, + "balance_loss_mlp": 1.00344014, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8889458625098678, + "language_loss": 0.61121166, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63150239, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 3.3321163654327393 + }, + { + "auxiliary_loss_clip": 0.01121856, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.04020262, + "balance_loss_mlp": 1.02805829, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.377458290085947, + "language_loss": 0.71083939, + "learning_rate": 3.459514586533184e-06, + "loss": 0.73250377, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 2.5315635204315186 + }, + { + "auxiliary_loss_clip": 0.01090659, + "auxiliary_loss_mlp": 0.0075032, + "balance_loss_clip": 1.03701687, + "balance_loss_mlp": 1.00032604, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.523786481005504, + "language_loss": 0.77265936, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79106915, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.68613862991333 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.03899932, + "balance_loss_mlp": 1.02259099, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.6489506452082434, + "language_loss": 0.75593913, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.77749735, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.5223467350006104 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01035869, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.02155209, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 1.4856297696992318, + "language_loss": 0.69388998, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71531141, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.5177319049835205 + }, + { + "auxiliary_loss_clip": 0.0109496, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.03617644, + "balance_loss_mlp": 1.02530217, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.8355446200595358, + "language_loss": 0.78522432, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80658406, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 2.6197078227996826 + }, + { + "auxiliary_loss_clip": 0.01091787, + "auxiliary_loss_mlp": 0.01038589, + "balance_loss_clip": 1.03885758, + "balance_loss_mlp": 1.0234611, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 1.921953810307718, + "language_loss": 0.83488679, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.8561905, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.6325058937072754 + }, + { + "auxiliary_loss_clip": 0.01105665, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.03650832, + "balance_loss_mlp": 1.02686894, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 1.726730691785163, + "language_loss": 0.71212131, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73361409, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.5461950302124023 + }, + { + "auxiliary_loss_clip": 0.01034447, + "auxiliary_loss_mlp": 0.01010268, + "balance_loss_clip": 1.00823331, + "balance_loss_mlp": 1.00887954, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.6903397135287963, + "language_loss": 0.56413621, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58458334, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.2229726314544678 + }, + { + "auxiliary_loss_clip": 0.01094759, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.03758121, + "balance_loss_mlp": 1.01902294, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.6359785898569965, + "language_loss": 0.77754122, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.7988233, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.6683738231658936 + }, + { + "auxiliary_loss_clip": 0.01080016, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.03819299, + "balance_loss_mlp": 1.02201176, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 2.98904292659274, + "language_loss": 0.71749836, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73866254, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.622189521789551 + }, + { + "auxiliary_loss_clip": 0.01094931, + "auxiliary_loss_mlp": 0.01037891, + "balance_loss_clip": 1.04192853, + "balance_loss_mlp": 1.02246523, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 1.6177010485683105, + "language_loss": 0.80619299, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.8275212, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.6649973392486572 + }, + { + "auxiliary_loss_clip": 0.01086988, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.03410017, + "balance_loss_mlp": 1.02476037, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 1.7025022282798419, + "language_loss": 0.65997851, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68123317, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.7061803340911865 + }, + { + "auxiliary_loss_clip": 0.01065679, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_clip": 1.03521323, + "balance_loss_mlp": 1.03250742, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.9303890338256935, + "language_loss": 0.6979484, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71908677, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.6808197498321533 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.03877616, + "balance_loss_mlp": 1.02165413, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.6775424389219962, + "language_loss": 0.78940386, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.81084168, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 2.8418586254119873 + }, + { + "auxiliary_loss_clip": 0.01093905, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_clip": 1.03689027, + "balance_loss_mlp": 1.02942085, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 1.987189444243559, + "language_loss": 0.76203686, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78340411, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 2.652278423309326 + }, + { + "auxiliary_loss_clip": 0.01087015, + "auxiliary_loss_mlp": 0.01034513, + "balance_loss_clip": 1.04036975, + "balance_loss_mlp": 1.01806188, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.3685484889999096, + "language_loss": 0.7772736, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.79848886, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 2.645732879638672 + }, + { + "auxiliary_loss_clip": 0.01091005, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.03315258, + "balance_loss_mlp": 1.02040863, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 1.8699921787859908, + "language_loss": 0.636374, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.65763527, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.816887617111206 + }, + { + "auxiliary_loss_clip": 0.0110445, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.03550696, + "balance_loss_mlp": 1.0207051, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.7272521637052594, + "language_loss": 0.826572, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84796137, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.5602777004241943 + }, + { + "auxiliary_loss_clip": 0.01074659, + "auxiliary_loss_mlp": 0.01041112, + "balance_loss_clip": 1.03405941, + "balance_loss_mlp": 1.02530479, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 1.9614775949641972, + "language_loss": 0.69908303, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.72024071, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.767507553100586 + }, + { + "auxiliary_loss_clip": 0.01102527, + "auxiliary_loss_mlp": 0.01039889, + "balance_loss_clip": 1.03538752, + "balance_loss_mlp": 1.02651393, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.7561648582596392, + "language_loss": 0.69625705, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71768123, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.588771104812622 + }, + { + "auxiliary_loss_clip": 0.01103671, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.03578007, + "balance_loss_mlp": 1.02164137, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.3339479471413456, + "language_loss": 0.70055598, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72195125, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.674095630645752 + }, + { + "auxiliary_loss_clip": 0.01079792, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.03522563, + "balance_loss_mlp": 1.0249908, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.7677091792033819, + "language_loss": 0.85322779, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87442547, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.667341709136963 + }, + { + "auxiliary_loss_clip": 0.01095526, + "auxiliary_loss_mlp": 0.01040975, + "balance_loss_clip": 1.03659892, + "balance_loss_mlp": 1.02662206, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 1.980296938012574, + "language_loss": 0.76907557, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79044056, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 2.5855026245117188 + }, + { + "auxiliary_loss_clip": 0.01103893, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.04031467, + "balance_loss_mlp": 1.0261445, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 2.40408932250522, + "language_loss": 0.75847328, + "learning_rate": 3.453375588053264e-06, + "loss": 0.77991319, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 2.5680878162384033 + }, + { + "auxiliary_loss_clip": 0.01116273, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.0371567, + "balance_loss_mlp": 1.01753402, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 2.023572842835238, + "language_loss": 0.86485946, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88634574, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.555964469909668 + }, + { + "auxiliary_loss_clip": 0.01027017, + "auxiliary_loss_mlp": 0.01001186, + "balance_loss_clip": 1.01408529, + "balance_loss_mlp": 0.99955273, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8101134720203945, + "language_loss": 0.60303038, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62331241, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.138641595840454 + }, + { + "auxiliary_loss_clip": 0.01097828, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.03642642, + "balance_loss_mlp": 1.01634955, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.5823263103898484, + "language_loss": 0.77481234, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79611105, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.6478278636932373 + }, + { + "auxiliary_loss_clip": 0.01008034, + "auxiliary_loss_mlp": 0.00747418, + "balance_loss_clip": 1.01210713, + "balance_loss_mlp": 1.00012767, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.8320268674494049, + "language_loss": 0.58696371, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60451823, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 3.202639579772949 + }, + { + "auxiliary_loss_clip": 0.01107501, + "auxiliary_loss_mlp": 0.01041539, + "balance_loss_clip": 1.03889275, + "balance_loss_mlp": 1.02700126, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.9938340402591292, + "language_loss": 0.68470955, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70619994, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 4.191979169845581 + }, + { + "auxiliary_loss_clip": 0.01112998, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.03989542, + "balance_loss_mlp": 1.01948643, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.7416998392811496, + "language_loss": 0.83988702, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.86137289, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 2.6633265018463135 + }, + { + "auxiliary_loss_clip": 0.01096963, + "auxiliary_loss_mlp": 0.01038967, + "balance_loss_clip": 1.03720164, + "balance_loss_mlp": 1.02124035, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 3.607168413666157, + "language_loss": 0.69961196, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72097123, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 4.177443742752075 + }, + { + "auxiliary_loss_clip": 0.01084389, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.0352155, + "balance_loss_mlp": 1.01616573, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.9323669664343743, + "language_loss": 0.86892581, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89007854, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 4.157556772232056 + }, + { + "auxiliary_loss_clip": 0.00982026, + "auxiliary_loss_mlp": 0.01022151, + "balance_loss_clip": 1.00657487, + "balance_loss_mlp": 1.02004135, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7918729420668019, + "language_loss": 0.55041361, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57045543, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 3.134157657623291 + }, + { + "auxiliary_loss_clip": 0.01107857, + "auxiliary_loss_mlp": 0.01045856, + "balance_loss_clip": 1.03825259, + "balance_loss_mlp": 1.03052616, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0457734801966714, + "language_loss": 0.77860725, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80014431, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 2.66949725151062 + }, + { + "auxiliary_loss_clip": 0.01115705, + "auxiliary_loss_mlp": 0.01041215, + "balance_loss_clip": 1.04442346, + "balance_loss_mlp": 1.025527, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.7595759082881486, + "language_loss": 0.66965294, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69122219, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 2.604383945465088 + }, + { + "auxiliary_loss_clip": 0.01070028, + "auxiliary_loss_mlp": 0.01040386, + "balance_loss_clip": 1.03722501, + "balance_loss_mlp": 1.02541304, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.597748139892353, + "language_loss": 0.86546272, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88656688, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 4.164279937744141 + }, + { + "auxiliary_loss_clip": 0.0109282, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.03587604, + "balance_loss_mlp": 1.01706994, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 1.9008000452066274, + "language_loss": 0.75769484, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77895826, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 2.591252326965332 + }, + { + "auxiliary_loss_clip": 0.01076585, + "auxiliary_loss_mlp": 0.01044139, + "balance_loss_clip": 1.03603935, + "balance_loss_mlp": 1.02644873, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.6802123320726547, + "language_loss": 0.87899637, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90020365, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 2.659585475921631 + }, + { + "auxiliary_loss_clip": 0.01082298, + "auxiliary_loss_mlp": 0.0104205, + "balance_loss_clip": 1.03819489, + "balance_loss_mlp": 1.02612352, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.7252473588404924, + "language_loss": 0.78171003, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80295348, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 2.6601085662841797 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.0345068, + "balance_loss_mlp": 1.01657462, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9288631379525207, + "language_loss": 0.87967432, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90099591, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 2.5932250022888184 + }, + { + "auxiliary_loss_clip": 0.01096368, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.03503048, + "balance_loss_mlp": 1.02017629, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 2.6304893053195992, + "language_loss": 0.76365077, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78495944, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.5890510082244873 + }, + { + "auxiliary_loss_clip": 0.01125221, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.04251933, + "balance_loss_mlp": 1.02642226, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.9850765904339986, + "language_loss": 0.70214522, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72383273, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.6334240436553955 + }, + { + "auxiliary_loss_clip": 0.01099411, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_clip": 1.04275417, + "balance_loss_mlp": 1.02842903, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 2.1850334584633306, + "language_loss": 0.83142889, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85286021, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.650184154510498 + }, + { + "auxiliary_loss_clip": 0.01080465, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.03382659, + "balance_loss_mlp": 1.0150485, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 3.5776339008809903, + "language_loss": 0.76121044, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78232396, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.7806427478790283 + }, + { + "auxiliary_loss_clip": 0.01066005, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.03344345, + "balance_loss_mlp": 1.01887214, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.7035350024694136, + "language_loss": 0.70935273, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73035896, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 2.822927236557007 + }, + { + "auxiliary_loss_clip": 0.0111116, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.03940153, + "balance_loss_mlp": 1.02259636, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.9113262801868534, + "language_loss": 0.7364108, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.7579149, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.5981643199920654 + }, + { + "auxiliary_loss_clip": 0.01114721, + "auxiliary_loss_mlp": 0.01044114, + "balance_loss_clip": 1.03885055, + "balance_loss_mlp": 1.02881932, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.935799500981489, + "language_loss": 0.73728305, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75887138, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.596229314804077 + }, + { + "auxiliary_loss_clip": 0.010753, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.03958464, + "balance_loss_mlp": 1.02185369, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 1.9304173746570656, + "language_loss": 0.82307959, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84421116, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.676405429840088 + }, + { + "auxiliary_loss_clip": 0.01109254, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.03860319, + "balance_loss_mlp": 1.02558804, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.7050854368393331, + "language_loss": 0.74892974, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.77042842, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.5502500534057617 + }, + { + "auxiliary_loss_clip": 0.01029233, + "auxiliary_loss_mlp": 0.0101208, + "balance_loss_clip": 1.01119626, + "balance_loss_mlp": 1.01048267, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8879610909223147, + "language_loss": 0.56950855, + "learning_rate": 3.446400750732793e-06, + "loss": 0.58992171, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 3.0237650871276855 + }, + { + "auxiliary_loss_clip": 0.01083498, + "auxiliary_loss_mlp": 0.0104129, + "balance_loss_clip": 1.03511095, + "balance_loss_mlp": 1.0267942, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.5039746490689574, + "language_loss": 0.74422157, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76546955, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.7354865074157715 + }, + { + "auxiliary_loss_clip": 0.01065969, + "auxiliary_loss_mlp": 0.01042604, + "balance_loss_clip": 1.03070033, + "balance_loss_mlp": 1.02418613, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.3699304751931924, + "language_loss": 0.86729383, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88837957, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.6464853286743164 + }, + { + "auxiliary_loss_clip": 0.01114202, + "auxiliary_loss_mlp": 0.01038147, + "balance_loss_clip": 1.03988111, + "balance_loss_mlp": 1.02176142, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.4985894991233457, + "language_loss": 0.76213741, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78366089, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.6012837886810303 + }, + { + "auxiliary_loss_clip": 0.01093405, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.03579378, + "balance_loss_mlp": 1.01962793, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.4856899969950534, + "language_loss": 0.8002702, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.8215633, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 2.657620429992676 + }, + { + "auxiliary_loss_clip": 0.01109698, + "auxiliary_loss_mlp": 0.01048267, + "balance_loss_clip": 1.03785753, + "balance_loss_mlp": 1.03219759, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.5503725866758526, + "language_loss": 0.67208433, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69366395, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.5908148288726807 + }, + { + "auxiliary_loss_clip": 0.01120887, + "auxiliary_loss_mlp": 0.01044796, + "balance_loss_clip": 1.0388155, + "balance_loss_mlp": 1.02798712, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 1.6488156018362692, + "language_loss": 0.7959317, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81758857, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.6232547760009766 + }, + { + "auxiliary_loss_clip": 0.01101494, + "auxiliary_loss_mlp": 0.01048034, + "balance_loss_clip": 1.03937685, + "balance_loss_mlp": 1.03017616, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 1.9517448972382387, + "language_loss": 0.81953031, + "learning_rate": 3.444516567560673e-06, + "loss": 0.84102559, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.6053647994995117 + }, + { + "auxiliary_loss_clip": 0.01104987, + "auxiliary_loss_mlp": 0.01038357, + "balance_loss_clip": 1.03724778, + "balance_loss_mlp": 1.02363491, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 2.079265646911031, + "language_loss": 0.65574974, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67718321, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 2.7702929973602295 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.03734231, + "balance_loss_mlp": 1.02715516, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.672430720842899, + "language_loss": 0.74523842, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76671612, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.6070408821105957 + }, + { + "auxiliary_loss_clip": 0.01098785, + "auxiliary_loss_mlp": 0.01058361, + "balance_loss_clip": 1.03410792, + "balance_loss_mlp": 1.04162383, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.5959199829148627, + "language_loss": 0.7792604, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80083179, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 2.7647037506103516 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.03732729, + "balance_loss_mlp": 1.02896118, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.0793459598025987, + "language_loss": 0.79564023, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81710696, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.587761402130127 + }, + { + "auxiliary_loss_clip": 0.01093073, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.03642213, + "balance_loss_mlp": 1.02730346, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.6170172617352752, + "language_loss": 0.8042618, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82560402, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 2.631632089614868 + }, + { + "auxiliary_loss_clip": 0.01122312, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_clip": 1.04219735, + "balance_loss_mlp": 1.03154027, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 2.0647078392486287, + "language_loss": 0.7728548, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79454261, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 2.670661449432373 + }, + { + "auxiliary_loss_clip": 0.01087185, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.0394218, + "balance_loss_mlp": 1.01656723, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.4586085404534175, + "language_loss": 0.76485395, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78603536, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.7202301025390625 + }, + { + "auxiliary_loss_clip": 0.01088362, + "auxiliary_loss_mlp": 0.0075036, + "balance_loss_clip": 1.03838587, + "balance_loss_mlp": 1.00073338, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.9247353618852587, + "language_loss": 0.82933581, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.84772301, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.770373821258545 + }, + { + "auxiliary_loss_clip": 0.01081644, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.03717232, + "balance_loss_mlp": 1.02199924, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 1.5960103819427014, + "language_loss": 0.71912557, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74031186, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 2.7124991416931152 + }, + { + "auxiliary_loss_clip": 0.01119873, + "auxiliary_loss_mlp": 0.01041507, + "balance_loss_clip": 1.0385555, + "balance_loss_mlp": 1.02481806, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.325958381425429, + "language_loss": 0.81871951, + "learning_rate": 3.441820222206035e-06, + "loss": 0.84033334, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.58510160446167 + }, + { + "auxiliary_loss_clip": 0.0111586, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.04042888, + "balance_loss_mlp": 1.02818668, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.364678907634583, + "language_loss": 0.76769358, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78929394, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.576326608657837 + }, + { + "auxiliary_loss_clip": 0.01058041, + "auxiliary_loss_mlp": 0.01048932, + "balance_loss_clip": 1.03447366, + "balance_loss_mlp": 1.03102708, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.135195780456227, + "language_loss": 0.82952058, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85059029, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 4.2847580909729 + }, + { + "auxiliary_loss_clip": 0.01111216, + "auxiliary_loss_mlp": 0.01043719, + "balance_loss_clip": 1.0400703, + "balance_loss_mlp": 1.02770936, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 1.9457931497482621, + "language_loss": 0.76404023, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78558952, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 2.643202781677246 + }, + { + "auxiliary_loss_clip": 0.01118495, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.03869963, + "balance_loss_mlp": 1.02239704, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 1.7798141523870747, + "language_loss": 0.82441503, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84596854, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 2.6230385303497314 + }, + { + "auxiliary_loss_clip": 0.010756, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_clip": 1.0364182, + "balance_loss_mlp": 1.02979851, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.5230504369535494, + "language_loss": 0.87977678, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.90100169, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 4.200295686721802 + }, + { + "auxiliary_loss_clip": 0.01095675, + "auxiliary_loss_mlp": 0.01039427, + "balance_loss_clip": 1.03710556, + "balance_loss_mlp": 1.02432358, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4818865844790985, + "language_loss": 0.78861177, + "learning_rate": 3.440199789988407e-06, + "loss": 0.80996281, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 4.225628614425659 + }, + { + "auxiliary_loss_clip": 0.01063075, + "auxiliary_loss_mlp": 0.01039114, + "balance_loss_clip": 1.03801131, + "balance_loss_mlp": 1.02433205, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 1.9572639613223628, + "language_loss": 0.63730139, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65832329, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 2.851844310760498 + }, + { + "auxiliary_loss_clip": 0.01049298, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.03264225, + "balance_loss_mlp": 1.02183938, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 2.9159499863845584, + "language_loss": 0.75809562, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77895784, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 2.824004650115967 + }, + { + "auxiliary_loss_clip": 0.01061755, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.03661001, + "balance_loss_mlp": 1.01700401, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.838301951526555, + "language_loss": 0.71724403, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73820269, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 4.243762731552124 + }, + { + "auxiliary_loss_clip": 0.01093778, + "auxiliary_loss_mlp": 0.01038956, + "balance_loss_clip": 1.03569865, + "balance_loss_mlp": 1.02164721, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.9171062925029767, + "language_loss": 0.66969967, + "learning_rate": 3.439118409456376e-06, + "loss": 0.69102705, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 2.829429864883423 + }, + { + "auxiliary_loss_clip": 0.01107655, + "auxiliary_loss_mlp": 0.01041875, + "balance_loss_clip": 1.0368588, + "balance_loss_mlp": 1.0251621, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.6290004444403547, + "language_loss": 0.76455605, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78605127, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.640763521194458 + }, + { + "auxiliary_loss_clip": 0.00986961, + "auxiliary_loss_mlp": 0.0100144, + "balance_loss_clip": 1.01506639, + "balance_loss_mlp": 0.99972373, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.9330069994613007, + "language_loss": 0.61185479, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63173878, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 3.1902458667755127 + }, + { + "auxiliary_loss_clip": 0.010956, + "auxiliary_loss_mlp": 0.0103645, + "balance_loss_clip": 1.04199302, + "balance_loss_mlp": 1.02048779, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.4794856900650464, + "language_loss": 0.76265103, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78397149, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 2.8694815635681152 + }, + { + "auxiliary_loss_clip": 0.01109577, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.0389303, + "balance_loss_mlp": 1.02287245, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 1.8609783016119406, + "language_loss": 0.80665123, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82813728, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.6025872230529785 + }, + { + "auxiliary_loss_clip": 0.01096432, + "auxiliary_loss_mlp": 0.01036549, + "balance_loss_clip": 1.03843379, + "balance_loss_mlp": 1.0197885, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 1.9533925329428778, + "language_loss": 0.89136058, + "learning_rate": 3.43776545600926e-06, + "loss": 0.9126904, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.6337697505950928 + }, + { + "auxiliary_loss_clip": 0.0111496, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.04060543, + "balance_loss_mlp": 1.02057219, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.7621012392238902, + "language_loss": 0.68478572, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70630395, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.638737678527832 + }, + { + "auxiliary_loss_clip": 0.01111163, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.03727174, + "balance_loss_mlp": 1.01869011, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.8719421642381553, + "language_loss": 0.83247858, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.8539387, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 2.600970506668091 + }, + { + "auxiliary_loss_clip": 0.01088684, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.03935552, + "balance_loss_mlp": 1.02362752, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.4639105120028795, + "language_loss": 0.84673947, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86802888, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.6849043369293213 + }, + { + "auxiliary_loss_clip": 0.01103604, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.03802419, + "balance_loss_mlp": 1.02666306, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.6801701564856106, + "language_loss": 0.83830702, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.85978585, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.6516928672790527 + }, + { + "auxiliary_loss_clip": 0.0107817, + "auxiliary_loss_mlp": 0.01040219, + "balance_loss_clip": 1.03283083, + "balance_loss_mlp": 1.02566385, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.8415595399825373, + "language_loss": 0.8103317, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83151555, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.606703758239746 + }, + { + "auxiliary_loss_clip": 0.01110694, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_clip": 1.03991699, + "balance_loss_mlp": 1.02214622, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.5629340362985165, + "language_loss": 0.8609159, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88239199, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.6424078941345215 + }, + { + "auxiliary_loss_clip": 0.01100038, + "auxiliary_loss_mlp": 0.01038725, + "balance_loss_clip": 1.03827977, + "balance_loss_mlp": 1.02266717, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 2.030277757371567, + "language_loss": 0.83592039, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85730797, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.5737662315368652 + }, + { + "auxiliary_loss_clip": 0.0110872, + "auxiliary_loss_mlp": 0.01048928, + "balance_loss_clip": 1.03824198, + "balance_loss_mlp": 1.03232229, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.9097357553523675, + "language_loss": 0.79625398, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81783044, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.566033124923706 + }, + { + "auxiliary_loss_clip": 0.01103988, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.04053199, + "balance_loss_mlp": 1.02168715, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.5545488710253264, + "language_loss": 0.73073387, + "learning_rate": 3.435326705894206e-06, + "loss": 0.75214547, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.6848437786102295 + }, + { + "auxiliary_loss_clip": 0.01083254, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.03690231, + "balance_loss_mlp": 1.0207448, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5250670493370928, + "language_loss": 0.7356348, + "learning_rate": 3.435055461383471e-06, + "loss": 0.756827, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.7125794887542725 + }, + { + "auxiliary_loss_clip": 0.01114165, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.04110157, + "balance_loss_mlp": 1.0203054, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.233472167699271, + "language_loss": 0.7070787, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.72858524, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.538696765899658 + }, + { + "auxiliary_loss_clip": 0.01076606, + "auxiliary_loss_mlp": 0.01046581, + "balance_loss_clip": 1.03455877, + "balance_loss_mlp": 1.02955759, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 2.7281943538502897, + "language_loss": 0.79303026, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81426209, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 2.6400084495544434 + }, + { + "auxiliary_loss_clip": 0.01008797, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.01770782, + "balance_loss_mlp": 1.03025556, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8589317795936406, + "language_loss": 0.58693939, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60734934, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.145254373550415 + }, + { + "auxiliary_loss_clip": 0.01057062, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_clip": 1.03144979, + "balance_loss_mlp": 1.02711058, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.528457966987507, + "language_loss": 0.85151911, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87251949, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.636631488800049 + }, + { + "auxiliary_loss_clip": 0.01100617, + "auxiliary_loss_mlp": 0.01049445, + "balance_loss_clip": 1.03554571, + "balance_loss_mlp": 1.03157544, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 1.7713512002610994, + "language_loss": 0.68230659, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70380718, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.5418262481689453 + }, + { + "auxiliary_loss_clip": 0.01082861, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_clip": 1.03443122, + "balance_loss_mlp": 1.03209245, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.4847139848196829, + "language_loss": 0.66880596, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69011736, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.7530343532562256 + }, + { + "auxiliary_loss_clip": 0.01086407, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.03757918, + "balance_loss_mlp": 1.02238595, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.5820689292886683, + "language_loss": 0.69296187, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71421891, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.8022704124450684 + }, + { + "auxiliary_loss_clip": 0.01083479, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.03388476, + "balance_loss_mlp": 1.02524471, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.6548507327103548, + "language_loss": 0.7812047, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80247289, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.656893730163574 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.01039909, + "balance_loss_clip": 1.03630352, + "balance_loss_mlp": 1.02324319, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 2.9160239243841843, + "language_loss": 0.71147162, + "learning_rate": 3.432611813236704e-06, + "loss": 0.73291361, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.617446184158325 + }, + { + "auxiliary_loss_clip": 0.01016811, + "auxiliary_loss_mlp": 0.01000079, + "balance_loss_clip": 1.00909698, + "balance_loss_mlp": 0.99837416, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6753571514116881, + "language_loss": 0.53079879, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55096769, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.2781524658203125 + }, + { + "auxiliary_loss_clip": 0.01085674, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.03327644, + "balance_loss_mlp": 1.02715373, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0145124813910122, + "language_loss": 0.73852801, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75983614, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 2.5950217247009277 + }, + { + "auxiliary_loss_clip": 0.01093073, + "auxiliary_loss_mlp": 0.00750506, + "balance_loss_clip": 1.0352819, + "balance_loss_mlp": 1.0008322, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.09538606846988, + "language_loss": 0.80235517, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82079101, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.6009345054626465 + }, + { + "auxiliary_loss_clip": 0.01036443, + "auxiliary_loss_mlp": 0.0100587, + "balance_loss_clip": 1.00850344, + "balance_loss_mlp": 1.00403392, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8430666092796103, + "language_loss": 0.59559453, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.6160177, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.151658773422241 + }, + { + "auxiliary_loss_clip": 0.01122587, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.03979802, + "balance_loss_mlp": 1.02310348, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 1.9237920937056738, + "language_loss": 0.81668925, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83831841, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 2.5262036323547363 + }, + { + "auxiliary_loss_clip": 0.01086975, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.03421998, + "balance_loss_mlp": 1.01935792, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.778941904626402, + "language_loss": 0.82623804, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84745473, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 2.5913021564483643 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.03701878, + "balance_loss_mlp": 1.02022719, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 1.9679809377381812, + "language_loss": 0.69187295, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71323043, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 4.189934253692627 + }, + { + "auxiliary_loss_clip": 0.01118703, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.03941858, + "balance_loss_mlp": 1.02013326, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.733241389035002, + "language_loss": 0.67854482, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70008039, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 2.6916518211364746 + }, + { + "auxiliary_loss_clip": 0.01089855, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.03806186, + "balance_loss_mlp": 1.01998532, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.689113956784758, + "language_loss": 0.83110881, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85236049, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 2.5819504261016846 + }, + { + "auxiliary_loss_clip": 0.01105461, + "auxiliary_loss_mlp": 0.01041474, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.02645981, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 3.136766497003621, + "language_loss": 0.7071408, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.7286101, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 2.5581607818603516 + }, + { + "auxiliary_loss_clip": 0.01087418, + "auxiliary_loss_mlp": 0.0075044, + "balance_loss_clip": 1.04022753, + "balance_loss_mlp": 1.00069547, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.6313751094996551, + "language_loss": 0.73163426, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75001287, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.6222152709960938 + }, + { + "auxiliary_loss_clip": 0.01090367, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.03635526, + "balance_loss_mlp": 1.02198482, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.7128964938330253, + "language_loss": 0.80693424, + "learning_rate": 3.429346772085922e-06, + "loss": 0.82820201, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 5.667940616607666 + }, + { + "auxiliary_loss_clip": 0.01070932, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.03718567, + "balance_loss_mlp": 1.02861571, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.6223344016806693, + "language_loss": 0.64972317, + "learning_rate": 3.429074332770984e-06, + "loss": 0.67087734, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 2.851393699645996 + }, + { + "auxiliary_loss_clip": 0.01098351, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.03476346, + "balance_loss_mlp": 1.02597415, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.9009743895219369, + "language_loss": 0.80493164, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82633013, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 2.5826966762542725 + }, + { + "auxiliary_loss_clip": 0.01096959, + "auxiliary_loss_mlp": 0.007504, + "balance_loss_clip": 1.03773749, + "balance_loss_mlp": 1.00080276, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.1357532883995423, + "language_loss": 0.81045842, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.82893205, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 4.11744236946106 + }, + { + "auxiliary_loss_clip": 0.01069976, + "auxiliary_loss_mlp": 0.01042971, + "balance_loss_clip": 1.03603184, + "balance_loss_mlp": 1.02691364, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.710680622944316, + "language_loss": 0.77602398, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79715347, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 2.6614396572113037 + }, + { + "auxiliary_loss_clip": 0.01107334, + "auxiliary_loss_mlp": 0.01044837, + "balance_loss_clip": 1.03749609, + "balance_loss_mlp": 1.02916098, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 1.7498733736389127, + "language_loss": 0.74149704, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76301873, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 2.59525728225708 + }, + { + "auxiliary_loss_clip": 0.01092647, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.03794444, + "balance_loss_mlp": 1.02180469, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 2.5845508488189743, + "language_loss": 0.725694, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74699706, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 2.6349356174468994 + }, + { + "auxiliary_loss_clip": 0.01106138, + "auxiliary_loss_mlp": 0.01046082, + "balance_loss_clip": 1.03488231, + "balance_loss_mlp": 1.0296545, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.197193089382709, + "language_loss": 0.8674525, + "learning_rate": 3.427438559239605e-06, + "loss": 0.88897467, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.5574591159820557 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01038461, + "balance_loss_clip": 1.03737736, + "balance_loss_mlp": 1.02403688, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.560867565134695, + "language_loss": 0.66484082, + "learning_rate": 3.427165740807239e-06, + "loss": 0.686306, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.7101633548736572 + }, + { + "auxiliary_loss_clip": 0.01079824, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.03242087, + "balance_loss_mlp": 1.02672398, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 3.3440063832857136, + "language_loss": 0.72851229, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74973822, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.5881781578063965 + }, + { + "auxiliary_loss_clip": 0.01124084, + "auxiliary_loss_mlp": 0.01043024, + "balance_loss_clip": 1.04142475, + "balance_loss_mlp": 1.02848077, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 2.6312821721427877, + "language_loss": 0.84013236, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.86180341, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 2.586970329284668 + }, + { + "auxiliary_loss_clip": 0.01100472, + "auxiliary_loss_mlp": 0.01042891, + "balance_loss_clip": 1.04581714, + "balance_loss_mlp": 1.02696431, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.05886491367467, + "language_loss": 0.71337569, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73480934, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.835456371307373 + }, + { + "auxiliary_loss_clip": 0.01045026, + "auxiliary_loss_mlp": 0.01049786, + "balance_loss_clip": 1.035568, + "balance_loss_mlp": 1.03185725, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.8794920630103247, + "language_loss": 0.84090209, + "learning_rate": 3.426073925998578e-06, + "loss": 0.8618502, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.9437575340270996 + }, + { + "auxiliary_loss_clip": 0.01098096, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_clip": 1.04060054, + "balance_loss_mlp": 1.03483629, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.491260429848423, + "language_loss": 0.90060347, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92209661, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 3.063532590866089 + }, + { + "auxiliary_loss_clip": 0.01042937, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.03109467, + "balance_loss_mlp": 1.02518082, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.7843425242632407, + "language_loss": 0.73616445, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75700855, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.8474950790405273 + }, + { + "auxiliary_loss_clip": 0.01122885, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.04094756, + "balance_loss_mlp": 1.02329993, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 2.3963722357983093, + "language_loss": 0.74350309, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.76512057, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.5427277088165283 + }, + { + "auxiliary_loss_clip": 0.01098411, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.03870797, + "balance_loss_mlp": 1.02006745, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 2.026666630890565, + "language_loss": 0.89026439, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91159451, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.6304450035095215 + }, + { + "auxiliary_loss_clip": 0.01111454, + "auxiliary_loss_mlp": 0.01040089, + "balance_loss_clip": 1.03941798, + "balance_loss_mlp": 1.02531278, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.4418397066971211, + "language_loss": 0.71108025, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73259568, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.6629810333251953 + }, + { + "auxiliary_loss_clip": 0.01091851, + "auxiliary_loss_mlp": 0.0103371, + "balance_loss_clip": 1.03737617, + "balance_loss_mlp": 1.01933932, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 2.16837801843869, + "language_loss": 0.86388242, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88513803, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.6468136310577393 + }, + { + "auxiliary_loss_clip": 0.01119247, + "auxiliary_loss_mlp": 0.01038523, + "balance_loss_clip": 1.03907216, + "balance_loss_mlp": 1.02377629, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.6170809511241242, + "language_loss": 0.76583469, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78741241, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.5578973293304443 + }, + { + "auxiliary_loss_clip": 0.0103676, + "auxiliary_loss_mlp": 0.01020449, + "balance_loss_clip": 1.01057553, + "balance_loss_mlp": 1.0188998, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7196888996244244, + "language_loss": 0.50134611, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52191824, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.1067373752593994 + }, + { + "auxiliary_loss_clip": 0.01074734, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.03857028, + "balance_loss_mlp": 1.02647662, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.851418910589825, + "language_loss": 0.7234804, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74463463, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 2.692063570022583 + }, + { + "auxiliary_loss_clip": 0.01006938, + "auxiliary_loss_mlp": 0.01012734, + "balance_loss_clip": 1.00992656, + "balance_loss_mlp": 1.01107681, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7563835211492255, + "language_loss": 0.59207267, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.6122694, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.194793939590454 + }, + { + "auxiliary_loss_clip": 0.01095712, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.03767991, + "balance_loss_mlp": 1.02169907, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 1.8892484325011487, + "language_loss": 0.74006402, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.76139271, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.6754589080810547 + }, + { + "auxiliary_loss_clip": 0.01081083, + "auxiliary_loss_mlp": 0.01040721, + "balance_loss_clip": 1.03094339, + "balance_loss_mlp": 1.02520621, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.1198662714193186, + "language_loss": 0.80599946, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82721752, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.5686819553375244 + }, + { + "auxiliary_loss_clip": 0.0107625, + "auxiliary_loss_mlp": 0.01040427, + "balance_loss_clip": 1.03637362, + "balance_loss_mlp": 1.02357054, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.88570647245324, + "language_loss": 0.72424704, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74541384, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.6291663646698 + }, + { + "auxiliary_loss_clip": 0.01102697, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.03741729, + "balance_loss_mlp": 1.01985955, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.9045819446902434, + "language_loss": 0.68434238, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70574224, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.7884573936462402 + }, + { + "auxiliary_loss_clip": 0.01072025, + "auxiliary_loss_mlp": 0.01040288, + "balance_loss_clip": 1.03750908, + "balance_loss_mlp": 1.02350342, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.0099319769377577, + "language_loss": 0.68120682, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70232999, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 2.6691019535064697 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.01038375, + "balance_loss_clip": 1.0404489, + "balance_loss_mlp": 1.02350962, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.7598308419167032, + "language_loss": 0.75694275, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77843469, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.622217893600464 + }, + { + "auxiliary_loss_clip": 0.01122203, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.03775597, + "balance_loss_mlp": 1.02853799, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 2.0588138082574834, + "language_loss": 0.73650509, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75817335, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.5655038356781006 + }, + { + "auxiliary_loss_clip": 0.0109957, + "auxiliary_loss_mlp": 0.01037686, + "balance_loss_clip": 1.0361352, + "balance_loss_mlp": 1.0213902, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 1.980613084022729, + "language_loss": 0.80708528, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82845771, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.664320468902588 + }, + { + "auxiliary_loss_clip": 0.01012788, + "auxiliary_loss_mlp": 0.01006441, + "balance_loss_clip": 1.00636959, + "balance_loss_mlp": 1.00459373, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7502835147883371, + "language_loss": 0.5091182, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52931046, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 3.05072021484375 + }, + { + "auxiliary_loss_clip": 0.0103896, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.03028643, + "balance_loss_mlp": 1.02040863, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 1.8492164931406254, + "language_loss": 0.74499214, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76573396, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.731799364089966 + }, + { + "auxiliary_loss_clip": 0.01102393, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.03601563, + "balance_loss_mlp": 1.0213778, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.66231265112594, + "language_loss": 0.71288395, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73425812, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.556212902069092 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.03866148, + "balance_loss_mlp": 1.01554608, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 2.469505682226415, + "language_loss": 0.70344734, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72480905, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 2.55718731880188 + }, + { + "auxiliary_loss_clip": 0.01091793, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.03579235, + "balance_loss_mlp": 1.0225302, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 1.9470772101936769, + "language_loss": 0.81548786, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83678532, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 4.107728481292725 + }, + { + "auxiliary_loss_clip": 0.01114978, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.0379405, + "balance_loss_mlp": 1.01891625, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.587179363988766, + "language_loss": 0.80391872, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82539159, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 2.524494171142578 + }, + { + "auxiliary_loss_clip": 0.01088306, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.03324413, + "balance_loss_mlp": 1.02058172, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 2.0951584543234216, + "language_loss": 0.87908101, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90030909, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 2.560673236846924 + }, + { + "auxiliary_loss_clip": 0.0109689, + "auxiliary_loss_mlp": 0.01040501, + "balance_loss_clip": 1.03788304, + "balance_loss_mlp": 1.02628565, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.7262411225765302, + "language_loss": 0.92014354, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94151747, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 2.6045966148376465 + }, + { + "auxiliary_loss_clip": 0.0107215, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_clip": 1.03781819, + "balance_loss_mlp": 1.03375828, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 3.1522689035734346, + "language_loss": 0.73715395, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75838667, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 2.641777753829956 + }, + { + "auxiliary_loss_clip": 0.01106437, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.03732538, + "balance_loss_mlp": 1.02433288, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 1.8843018745106803, + "language_loss": 0.76402903, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78548348, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 4.070129156112671 + }, + { + "auxiliary_loss_clip": 0.01061021, + "auxiliary_loss_mlp": 0.0104722, + "balance_loss_clip": 1.0292747, + "balance_loss_mlp": 1.03054225, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.3639814139320645, + "language_loss": 0.7728036, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79388595, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 2.7082955837249756 + }, + { + "auxiliary_loss_clip": 0.01085397, + "auxiliary_loss_mlp": 0.01036349, + "balance_loss_clip": 1.0359844, + "balance_loss_mlp": 1.02245533, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.7136270150905222, + "language_loss": 0.68431723, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7055347, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.8250744342803955 + }, + { + "auxiliary_loss_clip": 0.01103872, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.03603208, + "balance_loss_mlp": 1.0253197, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.015263194694122, + "language_loss": 0.75637484, + "learning_rate": 3.417583075166451e-06, + "loss": 0.77780402, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 4.160030126571655 + }, + { + "auxiliary_loss_clip": 0.01105084, + "auxiliary_loss_mlp": 0.01050615, + "balance_loss_clip": 1.03717303, + "balance_loss_mlp": 1.03418827, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.5279813235165025, + "language_loss": 0.76167238, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78322941, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.6016433238983154 + }, + { + "auxiliary_loss_clip": 0.01084827, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.03318238, + "balance_loss_mlp": 1.03524446, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.0827273477798567, + "language_loss": 0.75374103, + "learning_rate": 3.417033501108875e-06, + "loss": 0.7751044, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 2.6368002891540527 + }, + { + "auxiliary_loss_clip": 0.01120753, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.0394752, + "balance_loss_mlp": 1.02183211, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 1.6577621792090576, + "language_loss": 0.72902429, + "learning_rate": 3.416758633473798e-06, + "loss": 0.7506007, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.6065313816070557 + }, + { + "auxiliary_loss_clip": 0.01090963, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.03586173, + "balance_loss_mlp": 1.02095973, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.9882833476755946, + "language_loss": 0.74489868, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76616758, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.601810932159424 + }, + { + "auxiliary_loss_clip": 0.01119086, + "auxiliary_loss_mlp": 0.01039908, + "balance_loss_clip": 1.03939974, + "balance_loss_mlp": 1.02506685, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 1.793090524399908, + "language_loss": 0.7633239, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78491378, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.575751781463623 + }, + { + "auxiliary_loss_clip": 0.01101652, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.03599691, + "balance_loss_mlp": 1.02947772, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 3.2279208426878854, + "language_loss": 0.81554139, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.83699977, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.5762219429016113 + }, + { + "auxiliary_loss_clip": 0.01123083, + "auxiliary_loss_mlp": 0.01040474, + "balance_loss_clip": 1.03923321, + "balance_loss_mlp": 1.02481019, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 2.750400526268133, + "language_loss": 0.76838315, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79001868, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 2.4991824626922607 + }, + { + "auxiliary_loss_clip": 0.01081459, + "auxiliary_loss_mlp": 0.00750427, + "balance_loss_clip": 1.03594089, + "balance_loss_mlp": 1.00059795, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.4187125385133714, + "language_loss": 0.82319325, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84151214, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.648200750350952 + }, + { + "auxiliary_loss_clip": 0.01077033, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.0314939, + "balance_loss_mlp": 1.02598345, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 2.009971180071837, + "language_loss": 0.77246845, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79364491, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 2.6663029193878174 + }, + { + "auxiliary_loss_clip": 0.01094582, + "auxiliary_loss_mlp": 0.01040219, + "balance_loss_clip": 1.03805125, + "balance_loss_mlp": 1.02595568, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.1440841554007557, + "language_loss": 0.82058215, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84193015, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.6316587924957275 + }, + { + "auxiliary_loss_clip": 0.01107623, + "auxiliary_loss_mlp": 0.01035366, + "balance_loss_clip": 1.03836954, + "balance_loss_mlp": 1.02070332, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.073373827210662, + "language_loss": 0.91935718, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.94078708, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.550199031829834 + }, + { + "auxiliary_loss_clip": 0.01110303, + "auxiliary_loss_mlp": 0.01040476, + "balance_loss_clip": 1.03943014, + "balance_loss_mlp": 1.02514577, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 5.38888824143208, + "language_loss": 0.76496077, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78646857, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.6147615909576416 + }, + { + "auxiliary_loss_clip": 0.01080327, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.03478229, + "balance_loss_mlp": 1.01473808, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 2.3972546574999223, + "language_loss": 0.88649094, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.90758502, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.6041274070739746 + }, + { + "auxiliary_loss_clip": 0.01089475, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.03455949, + "balance_loss_mlp": 1.0164032, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 1.7984685213425358, + "language_loss": 0.71557474, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73678088, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.651399850845337 + }, + { + "auxiliary_loss_clip": 0.01094678, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.03761458, + "balance_loss_mlp": 1.0222044, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.719503201886561, + "language_loss": 0.91309726, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.9344238, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.625046491622925 + }, + { + "auxiliary_loss_clip": 0.01096288, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.03769934, + "balance_loss_mlp": 1.02326024, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.469452665358442, + "language_loss": 0.7275551, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.74890208, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.7157413959503174 + }, + { + "auxiliary_loss_clip": 0.01104716, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.03439891, + "balance_loss_mlp": 1.02272892, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.8787312879921378, + "language_loss": 0.71784776, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73927784, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 2.693244218826294 + }, + { + "auxiliary_loss_clip": 0.01086609, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.0338732, + "balance_loss_mlp": 1.02015507, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 3.6637052671733557, + "language_loss": 0.78764874, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80886555, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.6213622093200684 + }, + { + "auxiliary_loss_clip": 0.01103603, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.03692126, + "balance_loss_mlp": 1.02638435, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4052182657487282, + "language_loss": 0.89886647, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92031014, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.5867316722869873 + }, + { + "auxiliary_loss_clip": 0.01092641, + "auxiliary_loss_mlp": 0.01035307, + "balance_loss_clip": 1.03491771, + "balance_loss_mlp": 1.01909435, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.0714592884502405, + "language_loss": 0.88067043, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90194988, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.6701502799987793 + }, + { + "auxiliary_loss_clip": 0.01104498, + "auxiliary_loss_mlp": 0.00750238, + "balance_loss_clip": 1.03426051, + "balance_loss_mlp": 1.00052512, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 2.055225746191587, + "language_loss": 0.82277578, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84132314, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.535163402557373 + }, + { + "auxiliary_loss_clip": 0.01095236, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.0360353, + "balance_loss_mlp": 1.02581596, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.844168859315706, + "language_loss": 0.79940706, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82076907, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.6177268028259277 + }, + { + "auxiliary_loss_clip": 0.01097868, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.03946209, + "balance_loss_mlp": 1.01850843, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.123060393854354, + "language_loss": 0.89420378, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91551775, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 2.602593183517456 + }, + { + "auxiliary_loss_clip": 0.0108542, + "auxiliary_loss_mlp": 0.00750357, + "balance_loss_clip": 1.03406739, + "balance_loss_mlp": 1.00063348, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.080414321372753, + "language_loss": 0.63149595, + "learning_rate": 3.410974019048255e-06, + "loss": 0.64985371, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.6037485599517822 + }, + { + "auxiliary_loss_clip": 0.01091644, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.03687918, + "balance_loss_mlp": 1.01951742, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 2.826258223419304, + "language_loss": 0.69802213, + "learning_rate": 3.410697971904651e-06, + "loss": 0.71929789, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 2.7450530529022217 + }, + { + "auxiliary_loss_clip": 0.01014148, + "auxiliary_loss_mlp": 0.01003514, + "balance_loss_clip": 1.0097295, + "balance_loss_mlp": 1.0018692, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7181765534779462, + "language_loss": 0.61645025, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.6366269, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.2200870513916016 + }, + { + "auxiliary_loss_clip": 0.01026049, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.03532898, + "balance_loss_mlp": 1.02981794, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 2.020618741256822, + "language_loss": 0.64847851, + "learning_rate": 3.410145717146488e-06, + "loss": 0.66920257, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 2.9064571857452393 + }, + { + "auxiliary_loss_clip": 0.01090034, + "auxiliary_loss_mlp": 0.0075001, + "balance_loss_clip": 1.03490806, + "balance_loss_mlp": 1.00058901, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.0819574863715276, + "language_loss": 0.77552623, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79392672, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 2.944045305252075 + }, + { + "auxiliary_loss_clip": 0.01089935, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.0369916, + "balance_loss_mlp": 1.02554083, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.9155946416306218, + "language_loss": 0.82418501, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84546942, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 2.724907875061035 + }, + { + "auxiliary_loss_clip": 0.01104872, + "auxiliary_loss_mlp": 0.01041889, + "balance_loss_clip": 1.03481245, + "balance_loss_mlp": 1.02564073, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.3296824546238644, + "language_loss": 0.71450138, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73596901, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 2.654650926589966 + }, + { + "auxiliary_loss_clip": 0.01077845, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.03659296, + "balance_loss_mlp": 1.02030945, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.0671177577102346, + "language_loss": 0.78799945, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80911422, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 2.690944194793701 + }, + { + "auxiliary_loss_clip": 0.01072429, + "auxiliary_loss_mlp": 0.01042998, + "balance_loss_clip": 1.03145278, + "balance_loss_mlp": 1.02676177, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.3574457973975345, + "language_loss": 0.70297658, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72413081, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 4.094045877456665 + }, + { + "auxiliary_loss_clip": 0.01096333, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.03941047, + "balance_loss_mlp": 1.02137733, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.0736424290866853, + "language_loss": 0.72229791, + "learning_rate": 3.408487669858431e-06, + "loss": 0.74362653, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 2.784607172012329 + }, + { + "auxiliary_loss_clip": 0.01102757, + "auxiliary_loss_mlp": 0.01033812, + "balance_loss_clip": 1.03562474, + "balance_loss_mlp": 1.01887465, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.6223711062893686, + "language_loss": 0.5885154, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.6098811, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 2.6236836910247803 + }, + { + "auxiliary_loss_clip": 0.01100627, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.03953576, + "balance_loss_mlp": 1.02023423, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7486719535697879, + "language_loss": 0.74091709, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76227891, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 2.587465286254883 + }, + { + "auxiliary_loss_clip": 0.01105668, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.03758669, + "balance_loss_mlp": 1.0157994, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.9273934583748953, + "language_loss": 0.7718277, + "learning_rate": 3.407657925038002e-06, + "loss": 0.79318869, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 2.7046899795532227 + }, + { + "auxiliary_loss_clip": 0.01114741, + "auxiliary_loss_mlp": 0.01043106, + "balance_loss_clip": 1.03787172, + "balance_loss_mlp": 1.02650046, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 2.067209032812854, + "language_loss": 0.8201167, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84169519, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 4.137381076812744 + }, + { + "auxiliary_loss_clip": 0.01054874, + "auxiliary_loss_mlp": 0.01038144, + "balance_loss_clip": 1.03069472, + "balance_loss_mlp": 1.02304053, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.8197464341041367, + "language_loss": 0.72970855, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.75063884, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.772378921508789 + }, + { + "auxiliary_loss_clip": 0.01091781, + "auxiliary_loss_mlp": 0.0104196, + "balance_loss_clip": 1.0343281, + "balance_loss_mlp": 1.02690351, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.244591678423254, + "language_loss": 0.68259293, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70393038, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 2.5777971744537354 + }, + { + "auxiliary_loss_clip": 0.01089526, + "auxiliary_loss_mlp": 0.01046254, + "balance_loss_clip": 1.03503597, + "balance_loss_mlp": 1.03052437, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 1.6940533400259867, + "language_loss": 0.72005045, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74140823, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 2.623037338256836 + }, + { + "auxiliary_loss_clip": 0.01093525, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.03595018, + "balance_loss_mlp": 1.0204041, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.8753712264658602, + "language_loss": 0.81004649, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83133411, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 4.1101086139678955 + }, + { + "auxiliary_loss_clip": 0.01119181, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.03788543, + "balance_loss_mlp": 1.02020788, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.6820710306079247, + "language_loss": 0.75405902, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.7756021, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.6061275005340576 + }, + { + "auxiliary_loss_clip": 0.01115849, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.03704476, + "balance_loss_mlp": 1.01756239, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.5289054321725977, + "language_loss": 0.74553096, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76700807, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.5859880447387695 + }, + { + "auxiliary_loss_clip": 0.01093836, + "auxiliary_loss_mlp": 0.01042788, + "balance_loss_clip": 1.03953838, + "balance_loss_mlp": 1.02617049, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.7638137781564307, + "language_loss": 0.62905049, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.65041673, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.661342144012451 + }, + { + "auxiliary_loss_clip": 0.01095031, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.03718042, + "balance_loss_mlp": 1.0235393, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 2.0265881258330825, + "language_loss": 0.78262556, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.8039602, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.780611038208008 + }, + { + "auxiliary_loss_clip": 0.01060216, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.03246474, + "balance_loss_mlp": 1.02140617, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.0736997986286996, + "language_loss": 0.68335885, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70432186, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 2.7657971382141113 + }, + { + "auxiliary_loss_clip": 0.01108325, + "auxiliary_loss_mlp": 0.01041629, + "balance_loss_clip": 1.04034281, + "balance_loss_mlp": 1.02781844, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 5.911696325387324, + "language_loss": 0.60961127, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63111079, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.6604063510894775 + }, + { + "auxiliary_loss_clip": 0.01097956, + "auxiliary_loss_mlp": 0.01042106, + "balance_loss_clip": 1.03544593, + "balance_loss_mlp": 1.0250833, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.8011616406185667, + "language_loss": 0.82944751, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.85084808, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 2.64890193939209 + }, + { + "auxiliary_loss_clip": 0.01108805, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.04105461, + "balance_loss_mlp": 1.01760173, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.3869764291222033, + "language_loss": 0.68517095, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.70658541, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 2.5850558280944824 + }, + { + "auxiliary_loss_clip": 0.01083921, + "auxiliary_loss_mlp": 0.01040783, + "balance_loss_clip": 1.03365266, + "balance_loss_mlp": 1.0244813, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.33617942260793, + "language_loss": 0.70673358, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.72798067, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.614035129547119 + }, + { + "auxiliary_loss_clip": 0.01006039, + "auxiliary_loss_mlp": 0.01008903, + "balance_loss_clip": 1.00978982, + "balance_loss_mlp": 1.00738323, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7247308648343134, + "language_loss": 0.5575515, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57770097, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.2566442489624023 + }, + { + "auxiliary_loss_clip": 0.0106807, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.03673851, + "balance_loss_mlp": 1.02500057, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 2.3281649782189455, + "language_loss": 0.77651107, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79760033, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.704287052154541 + }, + { + "auxiliary_loss_clip": 0.01113498, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.03863001, + "balance_loss_mlp": 1.01698065, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.6126181670143749, + "language_loss": 0.81530344, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83673799, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.5699985027313232 + }, + { + "auxiliary_loss_clip": 0.0110502, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.03836405, + "balance_loss_mlp": 1.020154, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.6208762780558306, + "language_loss": 0.79171038, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81310797, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.575924873352051 + }, + { + "auxiliary_loss_clip": 0.01076486, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_clip": 1.0394609, + "balance_loss_mlp": 1.02933598, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 1.942142141839069, + "language_loss": 0.74413455, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76533687, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.7389333248138428 + }, + { + "auxiliary_loss_clip": 0.01096858, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.03796458, + "balance_loss_mlp": 1.02164149, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.7303047121594453, + "language_loss": 0.71908176, + "learning_rate": 3.402114029526814e-06, + "loss": 0.7404024, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 2.754516839981079 + }, + { + "auxiliary_loss_clip": 0.01071104, + "auxiliary_loss_mlp": 0.00750495, + "balance_loss_clip": 1.03475523, + "balance_loss_mlp": 1.00064659, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 2.093787350977113, + "language_loss": 0.73161066, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.74982655, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 2.6551573276519775 + }, + { + "auxiliary_loss_clip": 0.01097131, + "auxiliary_loss_mlp": 0.01035686, + "balance_loss_clip": 1.03850389, + "balance_loss_mlp": 1.01990247, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 1.9923243973626883, + "language_loss": 0.75999987, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78132808, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.629188060760498 + }, + { + "auxiliary_loss_clip": 0.01093531, + "auxiliary_loss_mlp": 0.01045095, + "balance_loss_clip": 1.03970623, + "balance_loss_mlp": 1.02676022, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 1.4426936938995194, + "language_loss": 0.66175091, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68313718, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.6375486850738525 + }, + { + "auxiliary_loss_clip": 0.01087135, + "auxiliary_loss_mlp": 0.01045462, + "balance_loss_clip": 1.03605723, + "balance_loss_mlp": 1.02862954, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 2.3258348972654477, + "language_loss": 0.79811084, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.81943691, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.6601123809814453 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01043919, + "balance_loss_clip": 1.03935671, + "balance_loss_mlp": 1.02669358, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.592270164660543, + "language_loss": 0.67316234, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.69469476, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 2.560316801071167 + }, + { + "auxiliary_loss_clip": 0.01101834, + "auxiliary_loss_mlp": 0.0104081, + "balance_loss_clip": 1.04048347, + "balance_loss_mlp": 1.02539587, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.736552981120991, + "language_loss": 0.78305411, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80448055, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.652031421661377 + }, + { + "auxiliary_loss_clip": 0.01074669, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.04026508, + "balance_loss_mlp": 1.02311707, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.6412293096945856, + "language_loss": 0.84573722, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.8668611, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.696012496948242 + }, + { + "auxiliary_loss_clip": 0.01110077, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.03819728, + "balance_loss_mlp": 1.02457583, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.7988844297236732, + "language_loss": 0.66795427, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.68944353, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.595276117324829 + }, + { + "auxiliary_loss_clip": 0.01051375, + "auxiliary_loss_mlp": 0.01046606, + "balance_loss_clip": 1.03067923, + "balance_loss_mlp": 1.02923727, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.6801165506838782, + "language_loss": 0.76704443, + "learning_rate": 3.399612333050327e-06, + "loss": 0.78802431, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 2.6529502868652344 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.00750346, + "balance_loss_clip": 1.04073191, + "balance_loss_mlp": 1.00067735, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.6518727798904482, + "language_loss": 0.71986151, + "learning_rate": 3.399334101267362e-06, + "loss": 0.73849452, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 2.6094541549682617 + }, + { + "auxiliary_loss_clip": 0.01097932, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.03953254, + "balance_loss_mlp": 1.02162278, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5614780836245743, + "language_loss": 0.80392236, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82526892, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 2.6114091873168945 + }, + { + "auxiliary_loss_clip": 0.01110065, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.03958368, + "balance_loss_mlp": 1.02419329, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 1.799517016641621, + "language_loss": 0.82949758, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85098791, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.5854389667510986 + }, + { + "auxiliary_loss_clip": 0.010802, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.03516352, + "balance_loss_mlp": 1.01862741, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.398839657837667, + "language_loss": 0.75370276, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77484131, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 2.662414073944092 + }, + { + "auxiliary_loss_clip": 0.01104766, + "auxiliary_loss_mlp": 0.01040487, + "balance_loss_clip": 1.03757262, + "balance_loss_mlp": 1.02482247, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 2.1068831242955546, + "language_loss": 0.88545012, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90690267, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 2.6389687061309814 + }, + { + "auxiliary_loss_clip": 0.01105012, + "auxiliary_loss_mlp": 0.01045199, + "balance_loss_clip": 1.03870416, + "balance_loss_mlp": 1.02968979, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.6369867119013721, + "language_loss": 0.71038508, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73188716, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 4.176388740539551 + }, + { + "auxiliary_loss_clip": 0.01084253, + "auxiliary_loss_mlp": 0.01043675, + "balance_loss_clip": 1.03652358, + "balance_loss_mlp": 1.02733159, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.9472169442655534, + "language_loss": 0.80277336, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82405263, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 2.649656057357788 + }, + { + "auxiliary_loss_clip": 0.01024529, + "auxiliary_loss_mlp": 0.00747628, + "balance_loss_clip": 1.00925636, + "balance_loss_mlp": 1.00019729, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7139001032020459, + "language_loss": 0.61674392, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63446546, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 3.071272373199463 + }, + { + "auxiliary_loss_clip": 0.01104417, + "auxiliary_loss_mlp": 0.0103895, + "balance_loss_clip": 1.04028356, + "balance_loss_mlp": 1.02383971, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.9902354810031446, + "language_loss": 0.77175373, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79318738, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 2.6171932220458984 + }, + { + "auxiliary_loss_clip": 0.01107499, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.03901827, + "balance_loss_mlp": 1.02139211, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.728477689725714, + "language_loss": 0.91548216, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93692052, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 2.5380988121032715 + }, + { + "auxiliary_loss_clip": 0.01108351, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.04030311, + "balance_loss_mlp": 1.02588034, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.992098436755725, + "language_loss": 0.69928789, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.72078949, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 4.171755075454712 + }, + { + "auxiliary_loss_clip": 0.01093427, + "auxiliary_loss_mlp": 0.01041432, + "balance_loss_clip": 1.03684545, + "balance_loss_mlp": 1.02583897, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 1.7225302018733237, + "language_loss": 0.64098084, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.66232944, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 4.296294212341309 + }, + { + "auxiliary_loss_clip": 0.01117384, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.03973567, + "balance_loss_mlp": 1.02294421, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 1.97648473340015, + "language_loss": 0.8639636, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88550818, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.5509085655212402 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.03931165, + "balance_loss_mlp": 1.02327108, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.315681314456151, + "language_loss": 0.80070043, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82230008, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.5266854763031006 + }, + { + "auxiliary_loss_clip": 0.01097002, + "auxiliary_loss_mlp": 0.01041571, + "balance_loss_clip": 1.03640771, + "balance_loss_mlp": 1.02610385, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.8721569343763458, + "language_loss": 0.78744185, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80882752, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 2.5941104888916016 + }, + { + "auxiliary_loss_clip": 0.01091178, + "auxiliary_loss_mlp": 0.01042364, + "balance_loss_clip": 1.03956318, + "balance_loss_mlp": 1.02661681, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 2.3780760475565232, + "language_loss": 0.73763382, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75896931, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 4.088304281234741 + }, + { + "auxiliary_loss_clip": 0.01105869, + "auxiliary_loss_mlp": 0.01040809, + "balance_loss_clip": 1.03737116, + "balance_loss_mlp": 1.02538371, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.6328177421556005, + "language_loss": 0.7977612, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81922799, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.5609536170959473 + }, + { + "auxiliary_loss_clip": 0.01101363, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_clip": 1.03795004, + "balance_loss_mlp": 1.02946258, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.138966325534749, + "language_loss": 0.76826781, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.78975028, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.5939371585845947 + }, + { + "auxiliary_loss_clip": 0.01090416, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.03773522, + "balance_loss_mlp": 1.02386129, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.5101095459249654, + "language_loss": 0.8161968, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83747786, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.5630908012390137 + }, + { + "auxiliary_loss_clip": 0.01067953, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.03470111, + "balance_loss_mlp": 1.01720417, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 2.048895476196655, + "language_loss": 0.70055819, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72156799, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.6783175468444824 + }, + { + "auxiliary_loss_clip": 0.01026795, + "auxiliary_loss_mlp": 0.01007939, + "balance_loss_clip": 1.00994992, + "balance_loss_mlp": 1.00625825, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.70059269814022, + "language_loss": 0.57165682, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59200418, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 3.1691372394561768 + }, + { + "auxiliary_loss_clip": 0.01100673, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.04110563, + "balance_loss_mlp": 1.02508497, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.2023006013034436, + "language_loss": 0.69183135, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.7132473, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.671095132827759 + }, + { + "auxiliary_loss_clip": 0.01101882, + "auxiliary_loss_mlp": 0.01035915, + "balance_loss_clip": 1.03892469, + "balance_loss_mlp": 1.02154994, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.9602598188954856, + "language_loss": 0.69927716, + "learning_rate": 3.393199595837555e-06, + "loss": 0.7206552, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.6148626804351807 + }, + { + "auxiliary_loss_clip": 0.0107195, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.03978252, + "balance_loss_mlp": 1.02134752, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.7081944465268935, + "language_loss": 0.72716606, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74824631, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.7164361476898193 + }, + { + "auxiliary_loss_clip": 0.01074633, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03309238, + "balance_loss_mlp": 1.03470528, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.562269006496328, + "language_loss": 0.84389007, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86514115, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.641244411468506 + }, + { + "auxiliary_loss_clip": 0.01044834, + "auxiliary_loss_mlp": 0.00750793, + "balance_loss_clip": 1.03091431, + "balance_loss_mlp": 1.00067127, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 1.9419513122260372, + "language_loss": 0.69386041, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71181673, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.696183204650879 + }, + { + "auxiliary_loss_clip": 0.01114408, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.03872883, + "balance_loss_mlp": 1.02212131, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 1.8396582557521386, + "language_loss": 0.73393857, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75544846, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.5727407932281494 + }, + { + "auxiliary_loss_clip": 0.01108833, + "auxiliary_loss_mlp": 0.00750218, + "balance_loss_clip": 1.0369103, + "balance_loss_mlp": 1.00070119, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.281090769790673, + "language_loss": 0.66503775, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68362832, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.649132490158081 + }, + { + "auxiliary_loss_clip": 0.01069582, + "auxiliary_loss_mlp": 0.01045742, + "balance_loss_clip": 1.03663349, + "balance_loss_mlp": 1.03007174, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.7210461910137742, + "language_loss": 0.79709494, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81824815, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.7030670642852783 + }, + { + "auxiliary_loss_clip": 0.01100639, + "auxiliary_loss_mlp": 0.01041954, + "balance_loss_clip": 1.03609097, + "balance_loss_mlp": 1.02621818, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.681495583442382, + "language_loss": 0.80285507, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82428104, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 2.5632550716400146 + }, + { + "auxiliary_loss_clip": 0.01088604, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.03662062, + "balance_loss_mlp": 1.02497244, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.445396939480083, + "language_loss": 0.63710624, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.6583941, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 2.6734235286712646 + }, + { + "auxiliary_loss_clip": 0.01103325, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.03702879, + "balance_loss_mlp": 1.02372444, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 5.0523718965417395, + "language_loss": 0.82215345, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84357309, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.592392921447754 + }, + { + "auxiliary_loss_clip": 0.01117614, + "auxiliary_loss_mlp": 0.01040877, + "balance_loss_clip": 1.03705215, + "balance_loss_mlp": 1.02647614, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.4923961560041237, + "language_loss": 0.76944727, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.7910322, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 2.520237445831299 + }, + { + "auxiliary_loss_clip": 0.01122124, + "auxiliary_loss_mlp": 0.01037085, + "balance_loss_clip": 1.04072535, + "balance_loss_mlp": 1.02335262, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.7414837725501147, + "language_loss": 0.84278822, + "learning_rate": 3.390122747388459e-06, + "loss": 0.86438036, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.637890100479126 + }, + { + "auxiliary_loss_clip": 0.01094621, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.03817415, + "balance_loss_mlp": 1.02563071, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 2.050802224461861, + "language_loss": 0.76715535, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.7884872, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.6635665893554688 + }, + { + "auxiliary_loss_clip": 0.0106133, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.03120375, + "balance_loss_mlp": 1.02330732, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 1.8521978714544491, + "language_loss": 0.7822696, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80326629, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.633882522583008 + }, + { + "auxiliary_loss_clip": 0.01088549, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_clip": 1.03764045, + "balance_loss_mlp": 1.0284493, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 2.006698438781838, + "language_loss": 0.87227571, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89360148, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 2.6591637134552 + }, + { + "auxiliary_loss_clip": 0.0107039, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.03551424, + "balance_loss_mlp": 1.02576852, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.896602737104424, + "language_loss": 0.8092621, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83037436, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 2.669032573699951 + }, + { + "auxiliary_loss_clip": 0.01092488, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.04139543, + "balance_loss_mlp": 1.02219534, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 2.767506583908208, + "language_loss": 0.81502748, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83632338, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 2.678049325942993 + }, + { + "auxiliary_loss_clip": 0.01083222, + "auxiliary_loss_mlp": 0.00750391, + "balance_loss_clip": 1.03641129, + "balance_loss_mlp": 1.00067949, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 3.035488688644614, + "language_loss": 0.77162421, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78996032, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.63702130317688 + }, + { + "auxiliary_loss_clip": 0.01078589, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.03249359, + "balance_loss_mlp": 1.02174318, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 2.01447241647719, + "language_loss": 0.70360929, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72477341, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 2.7064101696014404 + }, + { + "auxiliary_loss_clip": 0.01076321, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.03644669, + "balance_loss_mlp": 1.02033377, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.649200876107679, + "language_loss": 0.92743576, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94856095, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 2.718737840652466 + }, + { + "auxiliary_loss_clip": 0.01120345, + "auxiliary_loss_mlp": 0.01036549, + "balance_loss_clip": 1.04007387, + "balance_loss_mlp": 1.0230726, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 4.466931851480209, + "language_loss": 0.8529824, + "learning_rate": 3.387600581071121e-06, + "loss": 0.8745513, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 2.5696277618408203 + }, + { + "auxiliary_loss_clip": 0.01088901, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.03919137, + "balance_loss_mlp": 1.02435613, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.892754287113972, + "language_loss": 0.79208595, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81336039, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 2.63683819770813 + }, + { + "auxiliary_loss_clip": 0.01072959, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.03621221, + "balance_loss_mlp": 1.02268732, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.5573708288840038, + "language_loss": 0.84868151, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86978364, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.70194673538208 + }, + { + "auxiliary_loss_clip": 0.01094762, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.03571153, + "balance_loss_mlp": 1.02023602, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 1.947330150625815, + "language_loss": 0.80909586, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83039749, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 4.1634743213653564 + }, + { + "auxiliary_loss_clip": 0.01122306, + "auxiliary_loss_mlp": 0.01042888, + "balance_loss_clip": 1.04261029, + "balance_loss_mlp": 1.02815402, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 1.9168083556277093, + "language_loss": 0.71825695, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73990893, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 2.5835540294647217 + }, + { + "auxiliary_loss_clip": 0.01103821, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.02281988, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8057247392023166, + "language_loss": 0.82535046, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84675294, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.538282632827759 + }, + { + "auxiliary_loss_clip": 0.01095558, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.03619456, + "balance_loss_mlp": 1.01683855, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.6872763832973825, + "language_loss": 0.87779951, + "learning_rate": 3.385916768573529e-06, + "loss": 0.89908063, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 2.6001079082489014 + }, + { + "auxiliary_loss_clip": 0.01089286, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.03724003, + "balance_loss_mlp": 1.02019334, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 2.418031420777519, + "language_loss": 0.76904684, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79029608, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 2.620927572250366 + }, + { + "auxiliary_loss_clip": 0.01122508, + "auxiliary_loss_mlp": 0.01040028, + "balance_loss_clip": 1.04154563, + "balance_loss_mlp": 1.0251503, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 2.3487846955431846, + "language_loss": 0.65536988, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67699528, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 2.552090644836426 + }, + { + "auxiliary_loss_clip": 0.01106503, + "auxiliary_loss_mlp": 0.01039428, + "balance_loss_clip": 1.03825855, + "balance_loss_mlp": 1.02295315, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 3.854921680291447, + "language_loss": 0.83552372, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.85698307, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 5.534227609634399 + }, + { + "auxiliary_loss_clip": 0.01085488, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.03418875, + "balance_loss_mlp": 1.02181363, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.5216684580443134, + "language_loss": 0.75626755, + "learning_rate": 3.384793175684533e-06, + "loss": 0.77748668, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.6464107036590576 + }, + { + "auxiliary_loss_clip": 0.01104914, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.03701496, + "balance_loss_mlp": 1.03024054, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.4800983397423177, + "language_loss": 0.71687204, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73837793, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 2.616884469985962 + }, + { + "auxiliary_loss_clip": 0.01105853, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.03694582, + "balance_loss_mlp": 1.01645494, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.0210907481830014, + "language_loss": 0.66000098, + "learning_rate": 3.384231064128447e-06, + "loss": 0.68137681, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.621870517730713 + }, + { + "auxiliary_loss_clip": 0.01108917, + "auxiliary_loss_mlp": 0.0103641, + "balance_loss_clip": 1.03971684, + "balance_loss_mlp": 1.02151513, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 1.9434710280063507, + "language_loss": 0.72508657, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74653995, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 4.133005619049072 + }, + { + "auxiliary_loss_clip": 0.01084395, + "auxiliary_loss_mlp": 0.01038476, + "balance_loss_clip": 1.03957188, + "balance_loss_mlp": 1.02077341, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 2.2868359544573402, + "language_loss": 0.74661225, + "learning_rate": 3.383668742611641e-06, + "loss": 0.76784098, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.729625940322876 + }, + { + "auxiliary_loss_clip": 0.01077677, + "auxiliary_loss_mlp": 0.01038647, + "balance_loss_clip": 1.03442216, + "balance_loss_mlp": 1.02211261, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 2.604670733023657, + "language_loss": 0.85720873, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87837195, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.6868414878845215 + }, + { + "auxiliary_loss_clip": 0.0107863, + "auxiliary_loss_mlp": 0.01043169, + "balance_loss_clip": 1.03693366, + "balance_loss_mlp": 1.02793384, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 2.1394512550359606, + "language_loss": 0.82998157, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85119957, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.6648690700531006 + }, + { + "auxiliary_loss_clip": 0.01106882, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.03772604, + "balance_loss_mlp": 1.02023876, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 1.9187587333500264, + "language_loss": 0.7856431, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.80706507, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 2.590786933898926 + }, + { + "auxiliary_loss_clip": 0.0101389, + "auxiliary_loss_mlp": 0.0100406, + "balance_loss_clip": 1.00783658, + "balance_loss_mlp": 1.00221241, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 1.6867657041926858, + "language_loss": 0.6224184, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64259791, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.142369270324707 + }, + { + "auxiliary_loss_clip": 0.01090817, + "auxiliary_loss_mlp": 0.01032847, + "balance_loss_clip": 1.03844857, + "balance_loss_mlp": 1.01898336, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 2.322565935668507, + "language_loss": 0.89145195, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91268855, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.654749631881714 + }, + { + "auxiliary_loss_clip": 0.01108669, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.03715384, + "balance_loss_mlp": 1.02303314, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6119343632910146, + "language_loss": 0.87065309, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89213341, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.565499782562256 + }, + { + "auxiliary_loss_clip": 0.01109929, + "auxiliary_loss_mlp": 0.01035013, + "balance_loss_clip": 1.03927922, + "balance_loss_mlp": 1.02042174, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 2.236839860595987, + "language_loss": 0.72849107, + "learning_rate": 3.38169896509385e-06, + "loss": 0.74994051, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.7429792881011963 + }, + { + "auxiliary_loss_clip": 0.01084461, + "auxiliary_loss_mlp": 0.01036974, + "balance_loss_clip": 1.035761, + "balance_loss_mlp": 1.01968861, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.737813105894618, + "language_loss": 0.80591869, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82713294, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.6087095737457275 + }, + { + "auxiliary_loss_clip": 0.01017491, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.01728272, + "balance_loss_mlp": 1.00043285, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.832516017657859, + "language_loss": 0.58831966, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.6059711, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.1606929302215576 + }, + { + "auxiliary_loss_clip": 0.01108942, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.03657532, + "balance_loss_mlp": 1.02435017, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 2.359997790569292, + "language_loss": 0.73827016, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.75976682, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.6693787574768066 + }, + { + "auxiliary_loss_clip": 0.01123664, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.04147363, + "balance_loss_mlp": 1.02618527, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.535689782766412, + "language_loss": 0.79705268, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81870127, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.7769131660461426 + }, + { + "auxiliary_loss_clip": 0.01093968, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_clip": 1.03692579, + "balance_loss_mlp": 1.02777731, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.686998814693987, + "language_loss": 0.78984761, + "learning_rate": 3.380290409114312e-06, + "loss": 0.8112182, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.589318037033081 + }, + { + "auxiliary_loss_clip": 0.01071751, + "auxiliary_loss_mlp": 0.01041175, + "balance_loss_clip": 1.03564024, + "balance_loss_mlp": 1.02450967, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.0553170731210493, + "language_loss": 0.80538642, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.82651567, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 2.6722190380096436 + }, + { + "auxiliary_loss_clip": 0.01084712, + "auxiliary_loss_mlp": 0.0075038, + "balance_loss_clip": 1.03513312, + "balance_loss_mlp": 1.00066376, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.6789438099661362, + "language_loss": 0.81485796, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83320892, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 2.693992853164673 + }, + { + "auxiliary_loss_clip": 0.0109317, + "auxiliary_loss_mlp": 0.01035457, + "balance_loss_clip": 1.03770459, + "balance_loss_mlp": 1.01987004, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6511615705043463, + "language_loss": 0.83577728, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85706353, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.680185317993164 + }, + { + "auxiliary_loss_clip": 0.01084579, + "auxiliary_loss_mlp": 0.01050859, + "balance_loss_clip": 1.0393188, + "balance_loss_mlp": 1.03297806, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.756719388719274, + "language_loss": 0.63558519, + "learning_rate": 3.379162622133105e-06, + "loss": 0.65693957, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.7557830810546875 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.0377233, + "balance_loss_mlp": 1.02440631, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.7338535899422456, + "language_loss": 0.78530574, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80680239, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.6601035594940186 + }, + { + "auxiliary_loss_clip": 0.01081449, + "auxiliary_loss_mlp": 0.01046898, + "balance_loss_clip": 1.03774011, + "balance_loss_mlp": 1.0303514, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.7892760778373549, + "language_loss": 0.79367125, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81495464, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 2.7136993408203125 + }, + { + "auxiliary_loss_clip": 0.01080154, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.03645134, + "balance_loss_mlp": 1.02346325, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.8825007544532402, + "language_loss": 0.79786044, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.81904507, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 2.633892774581909 + }, + { + "auxiliary_loss_clip": 0.01101555, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.04273379, + "balance_loss_mlp": 1.03564, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.4277603602147029, + "language_loss": 0.78840411, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.80993283, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 2.793199062347412 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01045868, + "balance_loss_clip": 1.038311, + "balance_loss_mlp": 1.02879679, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 6.652817953251136, + "language_loss": 0.69690156, + "learning_rate": 3.377751711782227e-06, + "loss": 0.71839356, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 2.6716628074645996 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01047975, + "balance_loss_clip": 1.04078746, + "balance_loss_mlp": 1.03142881, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.623419886301055, + "language_loss": 0.77628273, + "learning_rate": 3.377469372935791e-06, + "loss": 0.79775524, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 2.6575095653533936 + }, + { + "auxiliary_loss_clip": 0.0108194, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.03671205, + "balance_loss_mlp": 1.02552223, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 2.025139543701761, + "language_loss": 0.79621029, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81743497, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.615837335586548 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01036916, + "balance_loss_clip": 1.0373255, + "balance_loss_mlp": 1.02166963, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.8297756251424353, + "language_loss": 0.80689508, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82832402, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 2.5812020301818848 + }, + { + "auxiliary_loss_clip": 0.01082881, + "auxiliary_loss_mlp": 0.01061014, + "balance_loss_clip": 1.03897762, + "balance_loss_mlp": 1.0431689, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.126307282738626, + "language_loss": 0.84449315, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86593211, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.6360793113708496 + }, + { + "auxiliary_loss_clip": 0.01096059, + "auxiliary_loss_mlp": 0.00750264, + "balance_loss_clip": 1.04271078, + "balance_loss_mlp": 1.00061083, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.8774390027392869, + "language_loss": 0.79340798, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81187117, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 2.717743396759033 + }, + { + "auxiliary_loss_clip": 0.01064655, + "auxiliary_loss_mlp": 0.01043757, + "balance_loss_clip": 1.03634405, + "balance_loss_mlp": 1.02812862, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.5703891878654712, + "language_loss": 0.76000178, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78108597, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.797058582305908 + }, + { + "auxiliary_loss_clip": 0.01110937, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_clip": 1.04178405, + "balance_loss_mlp": 1.02956438, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.0904003648770146, + "language_loss": 0.79267192, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81422973, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 2.578909158706665 + }, + { + "auxiliary_loss_clip": 0.01080767, + "auxiliary_loss_mlp": 0.01046753, + "balance_loss_clip": 1.03742647, + "balance_loss_mlp": 1.03003979, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.0943932120602984, + "language_loss": 0.79236472, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81363988, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 4.259927034378052 + }, + { + "auxiliary_loss_clip": 0.0109909, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.03909039, + "balance_loss_mlp": 1.02154589, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.6769238792634147, + "language_loss": 0.75094128, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77230448, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.605846643447876 + }, + { + "auxiliary_loss_clip": 0.01095126, + "auxiliary_loss_mlp": 0.01041711, + "balance_loss_clip": 1.03453076, + "balance_loss_mlp": 1.02378154, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.2863632280275215, + "language_loss": 0.75539035, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77675867, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 2.634528875350952 + }, + { + "auxiliary_loss_clip": 0.01109086, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.03976691, + "balance_loss_mlp": 1.02545905, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 5.557434321951379, + "language_loss": 0.72320062, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74469757, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 2.572890520095825 + }, + { + "auxiliary_loss_clip": 0.01111084, + "auxiliary_loss_mlp": 0.01034978, + "balance_loss_clip": 1.03990901, + "balance_loss_mlp": 1.01881325, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.9007208160842555, + "language_loss": 0.7838527, + "learning_rate": 3.374360200552541e-06, + "loss": 0.80531335, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 2.5890793800354004 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.0104216, + "balance_loss_clip": 1.04141808, + "balance_loss_mlp": 1.02655554, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 1.7353393794551304, + "language_loss": 0.69974399, + "learning_rate": 3.374077235607968e-06, + "loss": 0.7214002, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.540210008621216 + }, + { + "auxiliary_loss_clip": 0.01116375, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.04118574, + "balance_loss_mlp": 1.02088988, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6308959815447404, + "language_loss": 0.7056247, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72713453, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 4.03834867477417 + }, + { + "auxiliary_loss_clip": 0.0109848, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_clip": 1.03842854, + "balance_loss_mlp": 1.02646255, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.6236503355040617, + "language_loss": 0.6343953, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.6558162, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 2.5613362789154053 + }, + { + "auxiliary_loss_clip": 0.01107884, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.03848839, + "balance_loss_mlp": 1.02067018, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 2.548009493932252, + "language_loss": 0.70285141, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.72428668, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.578176498413086 + }, + { + "auxiliary_loss_clip": 0.01108321, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.03776813, + "balance_loss_mlp": 1.02142572, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.8539321437168261, + "language_loss": 0.7458477, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76730204, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.5672996044158936 + }, + { + "auxiliary_loss_clip": 0.01122445, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.04174507, + "balance_loss_mlp": 1.02079129, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.6256333816785846, + "language_loss": 0.77266824, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.7942512, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 4.0910279750823975 + }, + { + "auxiliary_loss_clip": 0.01109046, + "auxiliary_loss_mlp": 0.01032879, + "balance_loss_clip": 1.03858328, + "balance_loss_mlp": 1.01707149, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 1.8446962536835387, + "language_loss": 0.74103671, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76245594, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.5637848377227783 + }, + { + "auxiliary_loss_clip": 0.01117413, + "auxiliary_loss_mlp": 0.01036494, + "balance_loss_clip": 1.03929114, + "balance_loss_mlp": 1.02201641, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.3757621136328988, + "language_loss": 0.8065722, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82811129, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.5288498401641846 + }, + { + "auxiliary_loss_clip": 0.01063612, + "auxiliary_loss_mlp": 0.01042273, + "balance_loss_clip": 1.03663754, + "balance_loss_mlp": 1.02623951, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.7494917102222165, + "language_loss": 0.76147228, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78253114, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 2.6470694541931152 + }, + { + "auxiliary_loss_clip": 0.0106127, + "auxiliary_loss_mlp": 0.01040315, + "balance_loss_clip": 1.03218818, + "balance_loss_mlp": 1.0252583, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.9219039597423835, + "language_loss": 0.76048982, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.7815057, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.6494951248168945 + }, + { + "auxiliary_loss_clip": 0.0109153, + "auxiliary_loss_mlp": 0.01039964, + "balance_loss_clip": 1.03787363, + "balance_loss_mlp": 1.02538419, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 1.4085586580267664, + "language_loss": 0.75541401, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.77672887, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.7276558876037598 + }, + { + "auxiliary_loss_clip": 0.01085972, + "auxiliary_loss_mlp": 0.01048228, + "balance_loss_clip": 1.03424907, + "balance_loss_mlp": 1.03168142, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.4472164455754486, + "language_loss": 0.62616134, + "learning_rate": 3.370961184640025e-06, + "loss": 0.64750332, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.5859930515289307 + }, + { + "auxiliary_loss_clip": 0.01100255, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.03982699, + "balance_loss_mlp": 1.03169334, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 2.6048123502727845, + "language_loss": 0.7594496, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78091729, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.6069998741149902 + }, + { + "auxiliary_loss_clip": 0.01083042, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_clip": 1.0372597, + "balance_loss_mlp": 1.02399981, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 1.935144239082927, + "language_loss": 0.78428024, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80549788, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.6444947719573975 + }, + { + "auxiliary_loss_clip": 0.01073183, + "auxiliary_loss_mlp": 0.01042686, + "balance_loss_clip": 1.03315437, + "balance_loss_mlp": 1.02692664, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.7061124313314262, + "language_loss": 0.78082132, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80198002, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.6617660522460938 + }, + { + "auxiliary_loss_clip": 0.01120366, + "auxiliary_loss_mlp": 0.00750226, + "balance_loss_clip": 1.03967667, + "balance_loss_mlp": 1.0005393, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.8598511075140847, + "language_loss": 0.8788687, + "learning_rate": 3.369826514835332e-06, + "loss": 0.8975746, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.5760748386383057 + }, + { + "auxiliary_loss_clip": 0.01094065, + "auxiliary_loss_mlp": 0.01046625, + "balance_loss_clip": 1.03754342, + "balance_loss_mlp": 1.02974463, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.1874566367393755, + "language_loss": 0.81410599, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.83551288, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.611992120742798 + }, + { + "auxiliary_loss_clip": 0.01080231, + "auxiliary_loss_mlp": 0.01040088, + "balance_loss_clip": 1.03724241, + "balance_loss_mlp": 1.02448988, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.8685077365614624, + "language_loss": 0.74982858, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.7710318, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.7044126987457275 + }, + { + "auxiliary_loss_clip": 0.01080579, + "auxiliary_loss_mlp": 0.01038299, + "balance_loss_clip": 1.03492427, + "balance_loss_mlp": 1.02345705, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 3.0834233433581684, + "language_loss": 0.77723712, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79842591, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 2.6377899646759033 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.03823102, + "balance_loss_mlp": 1.02510166, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 3.1892841553265505, + "language_loss": 0.66529649, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68674254, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 2.6490094661712646 + }, + { + "auxiliary_loss_clip": 0.01100823, + "auxiliary_loss_mlp": 0.01043912, + "balance_loss_clip": 1.03775668, + "balance_loss_mlp": 1.02686465, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.6795013641963448, + "language_loss": 0.7565881, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.7780354, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.622241735458374 + }, + { + "auxiliary_loss_clip": 0.01079884, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_clip": 1.03697455, + "balance_loss_mlp": 1.03127074, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 2.1018588758219323, + "language_loss": 0.62737769, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64863837, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.884849786758423 + }, + { + "auxiliary_loss_clip": 0.01071191, + "auxiliary_loss_mlp": 0.01040135, + "balance_loss_clip": 1.03501821, + "balance_loss_mlp": 1.02575254, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.2796668469078325, + "language_loss": 0.73112601, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75223929, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 2.7146456241607666 + }, + { + "auxiliary_loss_clip": 0.01114726, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_clip": 1.03728843, + "balance_loss_mlp": 1.02847958, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.65232687473578, + "language_loss": 0.75007522, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77164793, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 2.604498863220215 + }, + { + "auxiliary_loss_clip": 0.01104125, + "auxiliary_loss_mlp": 0.01039195, + "balance_loss_clip": 1.03552055, + "balance_loss_mlp": 1.02250588, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 2.8616564689523827, + "language_loss": 0.80141121, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.82284439, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.5682191848754883 + }, + { + "auxiliary_loss_clip": 0.01092906, + "auxiliary_loss_mlp": 0.01044876, + "balance_loss_clip": 1.03899741, + "balance_loss_mlp": 1.03157818, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 2.376791217492383, + "language_loss": 0.81247735, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.83385515, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 2.636458158493042 + }, + { + "auxiliary_loss_clip": 0.01049106, + "auxiliary_loss_mlp": 0.01042189, + "balance_loss_clip": 1.03420341, + "balance_loss_mlp": 1.0271095, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 1.8977464782421263, + "language_loss": 0.73155344, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75246644, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 2.809713363647461 + }, + { + "auxiliary_loss_clip": 0.01116176, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.03893316, + "balance_loss_mlp": 1.01878071, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.6896324845805268, + "language_loss": 0.78646755, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80796522, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 2.5443828105926514 + }, + { + "auxiliary_loss_clip": 0.01086062, + "auxiliary_loss_mlp": 0.01043133, + "balance_loss_clip": 1.03412032, + "balance_loss_mlp": 1.02771974, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.8042061203597475, + "language_loss": 0.69243395, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71372592, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 2.7194905281066895 + }, + { + "auxiliary_loss_clip": 0.01084361, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.03970242, + "balance_loss_mlp": 1.01844382, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.9490984572366148, + "language_loss": 0.70499581, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72618055, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 2.650085210800171 + }, + { + "auxiliary_loss_clip": 0.01023662, + "auxiliary_loss_mlp": 0.01002295, + "balance_loss_clip": 1.00785351, + "balance_loss_mlp": 1.00075722, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7349023981895901, + "language_loss": 0.5928449, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61310446, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 3.182616710662842 + }, + { + "auxiliary_loss_clip": 0.01087686, + "auxiliary_loss_mlp": 0.01039049, + "balance_loss_clip": 1.03455245, + "balance_loss_mlp": 1.02547669, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.478118238241829, + "language_loss": 0.82353872, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84480608, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 2.6874496936798096 + }, + { + "auxiliary_loss_clip": 0.01098553, + "auxiliary_loss_mlp": 0.01038484, + "balance_loss_clip": 1.03712821, + "balance_loss_mlp": 1.02212811, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 2.828520068753172, + "language_loss": 0.80437452, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82574487, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.648092269897461 + }, + { + "auxiliary_loss_clip": 0.01011686, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00679445, + "balance_loss_mlp": 1.00062072, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8819734506479364, + "language_loss": 0.62812996, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64826941, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 3.038520574569702 + }, + { + "auxiliary_loss_clip": 0.01085952, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.03704834, + "balance_loss_mlp": 1.02637744, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.5398642016770807, + "language_loss": 0.74113321, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76241887, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 2.610175371170044 + }, + { + "auxiliary_loss_clip": 0.01080972, + "auxiliary_loss_mlp": 0.01048293, + "balance_loss_clip": 1.03523064, + "balance_loss_mlp": 1.03253937, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 2.076806384860458, + "language_loss": 0.79057354, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81186616, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 4.231123208999634 + }, + { + "auxiliary_loss_clip": 0.0111075, + "auxiliary_loss_mlp": 0.00750405, + "balance_loss_clip": 1.0414542, + "balance_loss_mlp": 1.00058854, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 1.9914902226785078, + "language_loss": 0.70820135, + "learning_rate": 3.363855879093996e-06, + "loss": 0.7268129, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 2.6383728981018066 + }, + { + "auxiliary_loss_clip": 0.0112031, + "auxiliary_loss_mlp": 0.01046423, + "balance_loss_clip": 1.0401094, + "balance_loss_mlp": 1.03066397, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.7529708817515322, + "language_loss": 0.81374836, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.83541566, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 2.5299248695373535 + }, + { + "auxiliary_loss_clip": 0.0109616, + "auxiliary_loss_mlp": 0.01040668, + "balance_loss_clip": 1.03994298, + "balance_loss_mlp": 1.02498019, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.8058959244723614, + "language_loss": 0.75680029, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77816862, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 2.5917606353759766 + }, + { + "auxiliary_loss_clip": 0.01104108, + "auxiliary_loss_mlp": 0.01043546, + "balance_loss_clip": 1.03723204, + "balance_loss_mlp": 1.02878809, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4382812813358385, + "language_loss": 0.7817995, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80327606, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.646566867828369 + }, + { + "auxiliary_loss_clip": 0.01093382, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.03597474, + "balance_loss_mlp": 1.02432728, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 2.479835853256542, + "language_loss": 0.73708153, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.75841165, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 4.142596244812012 + }, + { + "auxiliary_loss_clip": 0.01090852, + "auxiliary_loss_mlp": 0.0104513, + "balance_loss_clip": 1.03491747, + "balance_loss_mlp": 1.02675986, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.609720718786786, + "language_loss": 0.74925792, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.77061778, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 4.089672088623047 + }, + { + "auxiliary_loss_clip": 0.01085262, + "auxiliary_loss_mlp": 0.01039601, + "balance_loss_clip": 1.03570664, + "balance_loss_mlp": 1.02430654, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.807502642661693, + "language_loss": 0.6712265, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.6924752, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 2.6145455837249756 + }, + { + "auxiliary_loss_clip": 0.01097977, + "auxiliary_loss_mlp": 0.01040989, + "balance_loss_clip": 1.03714788, + "balance_loss_mlp": 1.02495551, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.7171383345164457, + "language_loss": 0.72242749, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74381709, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 2.5978171825408936 + }, + { + "auxiliary_loss_clip": 0.01104595, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.03726912, + "balance_loss_mlp": 1.02174056, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.7956539145971047, + "language_loss": 0.80359131, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82500803, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.5870184898376465 + }, + { + "auxiliary_loss_clip": 0.01104057, + "auxiliary_loss_mlp": 0.01039109, + "balance_loss_clip": 1.03721166, + "balance_loss_mlp": 1.02312326, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 2.282622201984871, + "language_loss": 0.78774655, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.80917823, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 4.0381176471710205 + }, + { + "auxiliary_loss_clip": 0.01064112, + "auxiliary_loss_mlp": 0.00750632, + "balance_loss_clip": 1.03482509, + "balance_loss_mlp": 1.00063157, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 1.798563175070936, + "language_loss": 0.82773447, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84588188, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.6700515747070312 + }, + { + "auxiliary_loss_clip": 0.01120981, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.04173541, + "balance_loss_mlp": 1.01905942, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.6733408318198109, + "language_loss": 0.70189011, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72343826, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 2.523221969604492 + }, + { + "auxiliary_loss_clip": 0.01087333, + "auxiliary_loss_mlp": 0.01039161, + "balance_loss_clip": 1.03619492, + "balance_loss_mlp": 1.02281702, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.463314896391878, + "language_loss": 0.78629208, + "learning_rate": 3.360433840760998e-06, + "loss": 0.80755705, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.6202239990234375 + }, + { + "auxiliary_loss_clip": 0.01089407, + "auxiliary_loss_mlp": 0.01048324, + "balance_loss_clip": 1.03689301, + "balance_loss_mlp": 1.03181386, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.582271984882295, + "language_loss": 0.92588484, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94726217, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.582719326019287 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.03882182, + "balance_loss_mlp": 1.02197146, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.7230040617637192, + "language_loss": 0.88613558, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90760928, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.6259844303131104 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01041205, + "balance_loss_clip": 1.04185677, + "balance_loss_mlp": 1.02558875, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 1.6322173195483962, + "language_loss": 0.78610224, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80761671, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.766221046447754 + }, + { + "auxiliary_loss_clip": 0.01106418, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.03942347, + "balance_loss_mlp": 1.02324939, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.182240293253967, + "language_loss": 0.66614151, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68757391, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.571075439453125 + }, + { + "auxiliary_loss_clip": 0.01084407, + "auxiliary_loss_mlp": 0.01043846, + "balance_loss_clip": 1.03762424, + "balance_loss_mlp": 1.02813447, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.7798747252368439, + "language_loss": 0.76041543, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78169799, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.6298420429229736 + }, + { + "auxiliary_loss_clip": 0.0109554, + "auxiliary_loss_mlp": 0.0104579, + "balance_loss_clip": 1.03867841, + "balance_loss_mlp": 1.02979255, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.8039063086995497, + "language_loss": 0.66562808, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68704134, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.6078526973724365 + }, + { + "auxiliary_loss_clip": 0.01096501, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.03840744, + "balance_loss_mlp": 1.01998389, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.8318742916102846, + "language_loss": 0.74500871, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76633179, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.629686117172241 + }, + { + "auxiliary_loss_clip": 0.01079103, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.04060423, + "balance_loss_mlp": 1.01666546, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 2.664438095783526, + "language_loss": 0.83744657, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85855615, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 2.6969568729400635 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.01041681, + "balance_loss_clip": 1.04106319, + "balance_loss_mlp": 1.0252775, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.5428881024068397, + "language_loss": 0.78675282, + "learning_rate": 3.357862435944109e-06, + "loss": 0.80828166, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.539940357208252 + }, + { + "auxiliary_loss_clip": 0.01124948, + "auxiliary_loss_mlp": 0.01047605, + "balance_loss_clip": 1.04143775, + "balance_loss_mlp": 1.03130889, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.4855045856846725, + "language_loss": 0.71641231, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73813784, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 2.512491464614868 + }, + { + "auxiliary_loss_clip": 0.01095674, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.03793621, + "balance_loss_mlp": 1.01627779, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.6965000577502645, + "language_loss": 0.73644662, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.75771761, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.558889150619507 + }, + { + "auxiliary_loss_clip": 0.01096186, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_clip": 1.03944612, + "balance_loss_mlp": 1.03032446, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.8267357782589018, + "language_loss": 0.7970835, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81848741, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 2.5534563064575195 + }, + { + "auxiliary_loss_clip": 0.011223, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.04153919, + "balance_loss_mlp": 1.02666914, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 3.510980600872649, + "language_loss": 0.59665489, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.6183002, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 2.601898670196533 + }, + { + "auxiliary_loss_clip": 0.01100423, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.03617072, + "balance_loss_mlp": 1.02144074, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.7299836312629817, + "language_loss": 0.86589968, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88726318, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 2.5837814807891846 + }, + { + "auxiliary_loss_clip": 0.01095326, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_clip": 1.04221177, + "balance_loss_mlp": 1.0253998, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.1151064780682534, + "language_loss": 0.89761281, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91899234, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.01108594, + "auxiliary_loss_mlp": 0.01037637, + "balance_loss_clip": 1.0423888, + "balance_loss_mlp": 1.02268815, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.453207521549089, + "language_loss": 0.72144008, + "learning_rate": 3.355859570559998e-06, + "loss": 0.7429024, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 2.576364040374756 + }, + { + "auxiliary_loss_clip": 0.01098393, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.04143, + "balance_loss_mlp": 1.02160263, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.5912504075842648, + "language_loss": 0.77750427, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.79885143, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.6396780014038086 + }, + { + "auxiliary_loss_clip": 0.0107983, + "auxiliary_loss_mlp": 0.01043638, + "balance_loss_clip": 1.03551805, + "balance_loss_mlp": 1.02783108, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.547985763692039, + "language_loss": 0.76410902, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78534371, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 2.6112353801727295 + }, + { + "auxiliary_loss_clip": 0.01124593, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.04094172, + "balance_loss_mlp": 1.028, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.9227238136462468, + "language_loss": 0.5758301, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59752893, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 2.5077812671661377 + }, + { + "auxiliary_loss_clip": 0.01089704, + "auxiliary_loss_mlp": 0.01049923, + "balance_loss_clip": 1.03948784, + "balance_loss_mlp": 1.03442621, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 1.6414901801420763, + "language_loss": 0.74597186, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76736808, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 2.674245595932007 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.01043347, + "balance_loss_clip": 1.04235721, + "balance_loss_mlp": 1.02879786, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.280042538209432, + "language_loss": 0.78233856, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.80384243, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 2.5306572914123535 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.04058075, + "balance_loss_mlp": 1.02491224, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.6770313165244095, + "language_loss": 0.82716095, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.84854722, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 2.5628066062927246 + }, + { + "auxiliary_loss_clip": 0.01072096, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.03532541, + "balance_loss_mlp": 1.0197823, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.7137577629463105, + "language_loss": 0.79231709, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81339598, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 2.676717519760132 + }, + { + "auxiliary_loss_clip": 0.01028469, + "auxiliary_loss_mlp": 0.01003355, + "balance_loss_clip": 1.01598477, + "balance_loss_mlp": 1.00126874, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7733289969941815, + "language_loss": 0.60470021, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62501848, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.103093147277832 + }, + { + "auxiliary_loss_clip": 0.01118901, + "auxiliary_loss_mlp": 0.0104109, + "balance_loss_clip": 1.03895617, + "balance_loss_mlp": 1.02629614, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.1367258227096837, + "language_loss": 0.80361259, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82521248, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 2.515050172805786 + }, + { + "auxiliary_loss_clip": 0.01106564, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.0375489, + "balance_loss_mlp": 1.02114475, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.6063772056929801, + "language_loss": 0.70163536, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72305822, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.6198439598083496 + }, + { + "auxiliary_loss_clip": 0.01105438, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.0398972, + "balance_loss_mlp": 1.01874375, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.4419692810061762, + "language_loss": 0.81891656, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84030795, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 4.202236652374268 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.03940809, + "balance_loss_mlp": 1.02393723, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.9865254362285238, + "language_loss": 0.80158007, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82313263, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 2.674060344696045 + }, + { + "auxiliary_loss_clip": 0.01100168, + "auxiliary_loss_mlp": 0.01036909, + "balance_loss_clip": 1.03544092, + "balance_loss_mlp": 1.02145934, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.5902361497605808, + "language_loss": 0.79012656, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81149733, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 2.5503616333007812 + }, + { + "auxiliary_loss_clip": 0.01122633, + "auxiliary_loss_mlp": 0.01039955, + "balance_loss_clip": 1.04005349, + "balance_loss_mlp": 1.02347422, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.1885207371646955, + "language_loss": 0.8962577, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91788352, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.6021337509155273 + }, + { + "auxiliary_loss_clip": 0.01101359, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.03638327, + "balance_loss_mlp": 1.02664483, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.7041749910916784, + "language_loss": 0.82163072, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84304959, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.5202322006225586 + }, + { + "auxiliary_loss_clip": 0.01071483, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.03494787, + "balance_loss_mlp": 1.02205873, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.6645706250367163, + "language_loss": 0.83776724, + "learning_rate": 3.351272138300922e-06, + "loss": 0.85885042, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 4.201080322265625 + }, + { + "auxiliary_loss_clip": 0.01003222, + "auxiliary_loss_mlp": 0.01006222, + "balance_loss_clip": 1.00888562, + "balance_loss_mlp": 1.00386202, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8684670687898126, + "language_loss": 0.6101414, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63023585, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 3.3372251987457275 + }, + { + "auxiliary_loss_clip": 0.01118829, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.04088116, + "balance_loss_mlp": 1.01854396, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 2.0136041393154667, + "language_loss": 0.65856624, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.68008637, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 4.038200378417969 + }, + { + "auxiliary_loss_clip": 0.01105613, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.03773916, + "balance_loss_mlp": 1.01860857, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.6452553669880918, + "language_loss": 0.62774795, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.64913511, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.669834852218628 + }, + { + "auxiliary_loss_clip": 0.0110204, + "auxiliary_loss_mlp": 0.00750359, + "balance_loss_clip": 1.03832412, + "balance_loss_mlp": 1.00054288, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.8223128202674235, + "language_loss": 0.74179077, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76031476, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.5515971183776855 + }, + { + "auxiliary_loss_clip": 0.01097725, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.04293108, + "balance_loss_mlp": 1.02214718, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.8081500705249727, + "language_loss": 0.7238003, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74513519, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 4.107164621353149 + }, + { + "auxiliary_loss_clip": 0.0105412, + "auxiliary_loss_mlp": 0.01039557, + "balance_loss_clip": 1.03803957, + "balance_loss_mlp": 1.02446508, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 1.8021777330626814, + "language_loss": 0.74650925, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76744598, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.7559425830841064 + }, + { + "auxiliary_loss_clip": 0.0108158, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.03828216, + "balance_loss_mlp": 1.0241493, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.4110268518198548, + "language_loss": 0.760795, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78199935, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 2.6011440753936768 + }, + { + "auxiliary_loss_clip": 0.01074211, + "auxiliary_loss_mlp": 0.01032134, + "balance_loss_clip": 1.03300643, + "balance_loss_mlp": 1.01693499, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.9877748586155848, + "language_loss": 0.77017039, + "learning_rate": 3.348973500311086e-06, + "loss": 0.7912339, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.665947675704956 + }, + { + "auxiliary_loss_clip": 0.01082301, + "auxiliary_loss_mlp": 0.01037437, + "balance_loss_clip": 1.03544354, + "balance_loss_mlp": 1.02087843, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.8290139770287999, + "language_loss": 0.70829111, + "learning_rate": 3.348685940258466e-06, + "loss": 0.72948849, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.628901243209839 + }, + { + "auxiliary_loss_clip": 0.01102465, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.03734291, + "balance_loss_mlp": 1.01645398, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.5376691934191742, + "language_loss": 0.75795531, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77928901, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.01104144, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.03744173, + "balance_loss_mlp": 1.01753259, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.647819946096484, + "language_loss": 0.77577174, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79713619, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.5905375480651855 + }, + { + "auxiliary_loss_clip": 0.01115833, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.03832793, + "balance_loss_mlp": 1.02550316, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.276744519599467, + "language_loss": 0.65263474, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67420185, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.50030255317688 + }, + { + "auxiliary_loss_clip": 0.01096996, + "auxiliary_loss_mlp": 0.01038809, + "balance_loss_clip": 1.03745532, + "balance_loss_mlp": 1.02359772, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.7549147503826092, + "language_loss": 0.70613432, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72749233, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 2.584151268005371 + }, + { + "auxiliary_loss_clip": 0.01055116, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03502548, + "balance_loss_mlp": 1.01564646, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.7194739031678272, + "language_loss": 0.75249588, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77335066, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.688523292541504 + }, + { + "auxiliary_loss_clip": 0.010677, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.0360949, + "balance_loss_mlp": 1.02270007, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 2.851625224551909, + "language_loss": 0.67515272, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69620526, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.7142579555511475 + }, + { + "auxiliary_loss_clip": 0.01025652, + "auxiliary_loss_mlp": 0.01008518, + "balance_loss_clip": 1.01001191, + "balance_loss_mlp": 1.00642002, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.79776367849269, + "language_loss": 0.56877196, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58911365, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.0853025913238525 + }, + { + "auxiliary_loss_clip": 0.01045135, + "auxiliary_loss_mlp": 0.00750558, + "balance_loss_clip": 1.03445125, + "balance_loss_mlp": 1.00056338, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.3057159299806314, + "language_loss": 0.83142424, + "learning_rate": 3.346383619630856e-06, + "loss": 0.84938115, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 2.67903208732605 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.03734541, + "balance_loss_mlp": 1.02056241, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.157735209211496, + "language_loss": 0.77898383, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80052292, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 2.5474562644958496 + }, + { + "auxiliary_loss_clip": 0.0109548, + "auxiliary_loss_mlp": 0.01042759, + "balance_loss_clip": 1.03879786, + "balance_loss_mlp": 1.02747047, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 3.066047187300597, + "language_loss": 0.73466009, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75604248, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 2.5269925594329834 + }, + { + "auxiliary_loss_clip": 0.01108906, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_clip": 1.03891551, + "balance_loss_mlp": 1.02760279, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.618276437668253, + "language_loss": 0.88535649, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90686888, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 2.57051157951355 + }, + { + "auxiliary_loss_clip": 0.01107121, + "auxiliary_loss_mlp": 0.01039552, + "balance_loss_clip": 1.0396378, + "balance_loss_mlp": 1.02569366, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.6362611738611896, + "language_loss": 0.74190062, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76336735, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.5583789348602295 + }, + { + "auxiliary_loss_clip": 0.01105253, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_clip": 1.04223311, + "balance_loss_mlp": 1.03059161, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.7958299745278383, + "language_loss": 0.79940081, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.8209098, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 2.5616939067840576 + }, + { + "auxiliary_loss_clip": 0.0110056, + "auxiliary_loss_mlp": 0.01044765, + "balance_loss_clip": 1.04380989, + "balance_loss_mlp": 1.02988744, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.5309319034799205, + "language_loss": 0.74047089, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76192409, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 2.608785390853882 + }, + { + "auxiliary_loss_clip": 0.01094207, + "auxiliary_loss_mlp": 0.0104113, + "balance_loss_clip": 1.0364393, + "balance_loss_mlp": 1.02560282, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5343281772818578, + "language_loss": 0.7614435, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78279692, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 2.592193126678467 + }, + { + "auxiliary_loss_clip": 0.01076699, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.03416848, + "balance_loss_mlp": 1.02486372, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.7916540598475499, + "language_loss": 0.81313407, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83428717, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.607525587081909 + }, + { + "auxiliary_loss_clip": 0.01081142, + "auxiliary_loss_mlp": 0.0104425, + "balance_loss_clip": 1.03933144, + "balance_loss_mlp": 1.02815676, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 2.2223584803790524, + "language_loss": 0.86449325, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88574713, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 2.6570725440979004 + }, + { + "auxiliary_loss_clip": 0.01091279, + "auxiliary_loss_mlp": 0.01039516, + "balance_loss_clip": 1.04396129, + "balance_loss_mlp": 1.02430475, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.4130816973817624, + "language_loss": 0.7097109, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73101884, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.6255640983581543 + }, + { + "auxiliary_loss_clip": 0.01098134, + "auxiliary_loss_mlp": 0.0104602, + "balance_loss_clip": 1.04204345, + "balance_loss_mlp": 1.03099322, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 1.8437286124638144, + "language_loss": 0.77070916, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79215074, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 2.6278152465820312 + }, + { + "auxiliary_loss_clip": 0.01072098, + "auxiliary_loss_mlp": 0.01049083, + "balance_loss_clip": 1.03696907, + "balance_loss_mlp": 1.03339458, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.4371286064786275, + "language_loss": 0.75994813, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.78116, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 2.648669958114624 + }, + { + "auxiliary_loss_clip": 0.01120054, + "auxiliary_loss_mlp": 0.01042275, + "balance_loss_clip": 1.04098797, + "balance_loss_mlp": 1.02777338, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.8780941769082977, + "language_loss": 0.83161962, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85324287, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 2.599503517150879 + }, + { + "auxiliary_loss_clip": 0.01086118, + "auxiliary_loss_mlp": 0.00750077, + "balance_loss_clip": 1.03929305, + "balance_loss_mlp": 1.00061941, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 2.0693280276668675, + "language_loss": 0.79723346, + "learning_rate": 3.342346699429516e-06, + "loss": 0.81559539, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.5920417308807373 + }, + { + "auxiliary_loss_clip": 0.01096392, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.03877711, + "balance_loss_mlp": 1.02186275, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 1.7604949943573613, + "language_loss": 0.83077538, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85210282, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.5959599018096924 + }, + { + "auxiliary_loss_clip": 0.01083871, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.04219306, + "balance_loss_mlp": 1.02841496, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.8144123803788537, + "language_loss": 0.73653603, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75780731, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 2.703463554382324 + }, + { + "auxiliary_loss_clip": 0.0110038, + "auxiliary_loss_mlp": 0.01037089, + "balance_loss_clip": 1.03847897, + "balance_loss_mlp": 1.02342713, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.8163132668140152, + "language_loss": 0.84054482, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86191946, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.5903115272521973 + }, + { + "auxiliary_loss_clip": 0.01108682, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.03931451, + "balance_loss_mlp": 1.02435672, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.7944127534101897, + "language_loss": 0.78203893, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80351329, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 4.1441943645477295 + }, + { + "auxiliary_loss_clip": 0.01097846, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.04029608, + "balance_loss_mlp": 1.01857471, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.9814836612081406, + "language_loss": 0.70483363, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72613978, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.592808961868286 + }, + { + "auxiliary_loss_clip": 0.0108222, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.04403973, + "balance_loss_mlp": 1.02290821, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 1.8828550049052817, + "language_loss": 0.79253227, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81372589, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 2.7067666053771973 + }, + { + "auxiliary_loss_clip": 0.0109326, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.03790188, + "balance_loss_mlp": 1.0205065, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.717170368543228, + "language_loss": 0.77830255, + "learning_rate": 3.340324496161797e-06, + "loss": 0.79957628, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.7747323513031006 + }, + { + "auxiliary_loss_clip": 0.01108712, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.0403924, + "balance_loss_mlp": 1.02341104, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.1489064188670954, + "language_loss": 0.82625383, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84772134, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.6176302433013916 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_clip": 1.03993654, + "balance_loss_mlp": 1.02792251, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 1.8929911454736712, + "language_loss": 0.75019825, + "learning_rate": 3.339746266208074e-06, + "loss": 0.77164435, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 4.17826509475708 + }, + { + "auxiliary_loss_clip": 0.01112676, + "auxiliary_loss_mlp": 0.01038912, + "balance_loss_clip": 1.04034758, + "balance_loss_mlp": 1.02303386, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.7674590406001605, + "language_loss": 0.72859764, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.75011361, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 2.560499906539917 + }, + { + "auxiliary_loss_clip": 0.01076919, + "auxiliary_loss_mlp": 0.00750067, + "balance_loss_clip": 1.03334677, + "balance_loss_mlp": 1.00059056, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 1.8791223614169286, + "language_loss": 0.7383194, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.75658923, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 4.027567386627197 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01044077, + "balance_loss_clip": 1.03826261, + "balance_loss_mlp": 1.02804315, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 3.8037993101482743, + "language_loss": 0.64909923, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67062938, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.5451548099517822 + }, + { + "auxiliary_loss_clip": 0.01119531, + "auxiliary_loss_mlp": 0.01044097, + "balance_loss_clip": 1.04002059, + "balance_loss_mlp": 1.02916026, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.7211698088121823, + "language_loss": 0.81893468, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84057099, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.6231918334960938 + }, + { + "auxiliary_loss_clip": 0.01077905, + "auxiliary_loss_mlp": 0.01037796, + "balance_loss_clip": 1.03748989, + "balance_loss_mlp": 1.02300239, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.7604171577732102, + "language_loss": 0.90919888, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93035591, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 4.168782949447632 + }, + { + "auxiliary_loss_clip": 0.01091836, + "auxiliary_loss_mlp": 0.00750159, + "balance_loss_clip": 1.03867269, + "balance_loss_mlp": 1.00058746, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.194743378496233, + "language_loss": 0.73953581, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75795579, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 2.6274218559265137 + }, + { + "auxiliary_loss_clip": 0.01006173, + "auxiliary_loss_mlp": 0.01009486, + "balance_loss_clip": 1.01040149, + "balance_loss_mlp": 1.00767374, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.9776941977050897, + "language_loss": 0.63026047, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65041709, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.1420459747314453 + }, + { + "auxiliary_loss_clip": 0.01060399, + "auxiliary_loss_mlp": 0.01043969, + "balance_loss_clip": 1.03303623, + "balance_loss_mlp": 1.02907372, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 3.1660951319480146, + "language_loss": 0.70591819, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72696185, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 2.6645092964172363 + }, + { + "auxiliary_loss_clip": 0.01110804, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.03869212, + "balance_loss_mlp": 1.02287149, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 2.1498235509785286, + "language_loss": 0.67941177, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70091707, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 2.6092982292175293 + }, + { + "auxiliary_loss_clip": 0.01108862, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.03907728, + "balance_loss_mlp": 1.02242315, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.782625113226434, + "language_loss": 0.69388932, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71535277, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.625162363052368 + }, + { + "auxiliary_loss_clip": 0.01098463, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.04140627, + "balance_loss_mlp": 1.02524948, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.3312556034442378, + "language_loss": 0.71640176, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7377916, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 2.795896053314209 + }, + { + "auxiliary_loss_clip": 0.01080854, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.03760171, + "balance_loss_mlp": 1.02639318, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.6152714518211249, + "language_loss": 0.81482083, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83604962, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.6402297019958496 + }, + { + "auxiliary_loss_clip": 0.01074581, + "auxiliary_loss_mlp": 0.01043142, + "balance_loss_clip": 1.03881645, + "balance_loss_mlp": 1.02763319, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.5332402366321993, + "language_loss": 0.78153533, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80271256, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.678250312805176 + }, + { + "auxiliary_loss_clip": 0.01064562, + "auxiliary_loss_mlp": 0.01040838, + "balance_loss_clip": 1.03283405, + "balance_loss_mlp": 1.02474451, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.7711881337835467, + "language_loss": 0.7878505, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80890453, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.6903514862060547 + }, + { + "auxiliary_loss_clip": 0.0106424, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.03410649, + "balance_loss_mlp": 1.01714063, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 2.039620120065142, + "language_loss": 0.76788533, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.78884363, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 2.6902499198913574 + }, + { + "auxiliary_loss_clip": 0.01105321, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.0380168, + "balance_loss_mlp": 1.0236764, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.4563777516714524, + "language_loss": 0.76922846, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79067498, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 2.6870012283325195 + }, + { + "auxiliary_loss_clip": 0.01008673, + "auxiliary_loss_mlp": 0.01015042, + "balance_loss_clip": 1.01845551, + "balance_loss_mlp": 1.01290846, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8432256286697777, + "language_loss": 0.60252023, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62275743, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 3.3361618518829346 + }, + { + "auxiliary_loss_clip": 0.01069352, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.03468394, + "balance_loss_mlp": 1.02124608, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 2.503754174585482, + "language_loss": 0.82124728, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84230852, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 2.664004325866699 + }, + { + "auxiliary_loss_clip": 0.01079911, + "auxiliary_loss_mlp": 0.0104634, + "balance_loss_clip": 1.04078007, + "balance_loss_mlp": 1.03093815, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.671358412065705, + "language_loss": 0.72143209, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74269462, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 2.737703800201416 + }, + { + "auxiliary_loss_clip": 0.01105696, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.04004598, + "balance_loss_mlp": 1.02924633, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.5029601390533673, + "language_loss": 0.70106184, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.7225495, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 2.5634098052978516 + }, + { + "auxiliary_loss_clip": 0.01095239, + "auxiliary_loss_mlp": 0.01044526, + "balance_loss_clip": 1.03910542, + "balance_loss_mlp": 1.02845621, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.259661962438362, + "language_loss": 0.74135268, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76275039, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 2.607546329498291 + }, + { + "auxiliary_loss_clip": 0.01088077, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.04063249, + "balance_loss_mlp": 1.0424149, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.8621796545600995, + "language_loss": 0.7599377, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78140175, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.697126865386963 + }, + { + "auxiliary_loss_clip": 0.0105343, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.0377264, + "balance_loss_mlp": 1.02293026, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.7655797823820794, + "language_loss": 0.79938495, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82030976, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.6475636959075928 + }, + { + "auxiliary_loss_clip": 0.01086934, + "auxiliary_loss_mlp": 0.01044641, + "balance_loss_clip": 1.03855109, + "balance_loss_mlp": 1.02783275, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.859049180624582, + "language_loss": 0.78542006, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80673587, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 2.6202356815338135 + }, + { + "auxiliary_loss_clip": 0.01073379, + "auxiliary_loss_mlp": 0.01035291, + "balance_loss_clip": 1.03900313, + "balance_loss_mlp": 1.02084923, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 1.9155740027508192, + "language_loss": 0.72625041, + "learning_rate": 3.332501274072231e-06, + "loss": 0.74733716, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 2.7529406547546387 + }, + { + "auxiliary_loss_clip": 0.01107279, + "auxiliary_loss_mlp": 0.01038108, + "balance_loss_clip": 1.03867435, + "balance_loss_mlp": 1.02304018, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.9303293511688506, + "language_loss": 0.71855706, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74001098, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 2.575918197631836 + }, + { + "auxiliary_loss_clip": 0.01108147, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.04217076, + "balance_loss_mlp": 1.02906263, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.6949485883867594, + "language_loss": 0.66036797, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68188804, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.60262131690979 + }, + { + "auxiliary_loss_clip": 0.01081895, + "auxiliary_loss_mlp": 0.010423, + "balance_loss_clip": 1.03395271, + "balance_loss_mlp": 1.02758932, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 2.099491826294218, + "language_loss": 0.81181926, + "learning_rate": 3.331629749427164e-06, + "loss": 0.83306122, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 2.6452255249023438 + }, + { + "auxiliary_loss_clip": 0.0111983, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.03878295, + "balance_loss_mlp": 1.02643812, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 2.0175160189973846, + "language_loss": 0.72029746, + "learning_rate": 3.331339140206385e-06, + "loss": 0.7419163, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 2.5725018978118896 + }, + { + "auxiliary_loss_clip": 0.0112244, + "auxiliary_loss_mlp": 0.0103977, + "balance_loss_clip": 1.0422374, + "balance_loss_mlp": 1.02460647, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.750240360477071, + "language_loss": 0.73593819, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75756031, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.531120777130127 + }, + { + "auxiliary_loss_clip": 0.01106562, + "auxiliary_loss_mlp": 0.01040484, + "balance_loss_clip": 1.03791046, + "balance_loss_mlp": 1.02676249, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 2.446881576785549, + "language_loss": 0.68190175, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70337218, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.6220946311950684 + }, + { + "auxiliary_loss_clip": 0.01099176, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.03785419, + "balance_loss_mlp": 1.02617049, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.8725447480308557, + "language_loss": 0.80100119, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82241744, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 2.578807830810547 + }, + { + "auxiliary_loss_clip": 0.01116797, + "auxiliary_loss_mlp": 0.01043171, + "balance_loss_clip": 1.03920603, + "balance_loss_mlp": 1.02872837, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 3.3477312398489323, + "language_loss": 0.8080622, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82966191, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.5172009468078613 + }, + { + "auxiliary_loss_clip": 0.01087589, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.03650153, + "balance_loss_mlp": 1.02710319, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 2.423310582203983, + "language_loss": 0.82537949, + "learning_rate": 3.329885337055249e-06, + "loss": 0.8466748, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 2.655956745147705 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01046162, + "balance_loss_clip": 1.03813601, + "balance_loss_mlp": 1.03105855, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 3.1383098111055894, + "language_loss": 0.78908837, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81062531, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 4.141339540481567 + }, + { + "auxiliary_loss_clip": 0.01112777, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.03723681, + "balance_loss_mlp": 1.02567458, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 2.875861780981978, + "language_loss": 0.74543005, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76695037, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 2.586203098297119 + }, + { + "auxiliary_loss_clip": 0.01091805, + "auxiliary_loss_mlp": 0.01028066, + "balance_loss_clip": 1.03680336, + "balance_loss_mlp": 1.01563847, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6995354942757803, + "language_loss": 0.76045543, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78165412, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.6408798694610596 + }, + { + "auxiliary_loss_clip": 0.01080748, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.03463614, + "balance_loss_mlp": 1.0208962, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 2.1332116738816373, + "language_loss": 0.64200878, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.66316968, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.6093814373016357 + }, + { + "auxiliary_loss_clip": 0.0108914, + "auxiliary_loss_mlp": 0.01028164, + "balance_loss_clip": 1.03660035, + "balance_loss_mlp": 1.01545072, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.645258031999652, + "language_loss": 0.71420664, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.7353797, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.6617000102996826 + }, + { + "auxiliary_loss_clip": 0.01086448, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.03507113, + "balance_loss_mlp": 1.01813781, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 1.626783595627134, + "language_loss": 0.79271579, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.8138907, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 4.116674900054932 + }, + { + "auxiliary_loss_clip": 0.01074184, + "auxiliary_loss_mlp": 0.0103941, + "balance_loss_clip": 1.03698981, + "balance_loss_mlp": 1.02399611, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 3.1423686090489564, + "language_loss": 0.80961841, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.83075434, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 4.149541854858398 + }, + { + "auxiliary_loss_clip": 0.01089051, + "auxiliary_loss_mlp": 0.01033467, + "balance_loss_clip": 1.03498805, + "balance_loss_mlp": 1.01928127, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8467091570475276, + "language_loss": 0.67685026, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69807541, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.720609664916992 + }, + { + "auxiliary_loss_clip": 0.01119439, + "auxiliary_loss_mlp": 0.00750195, + "balance_loss_clip": 1.03972495, + "balance_loss_mlp": 1.00069427, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.572515610253202, + "language_loss": 0.71403623, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73273259, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.554542303085327 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.03669059, + "balance_loss_mlp": 1.01766896, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.8363583117467766, + "language_loss": 0.76318371, + "learning_rate": 3.326973949928776e-06, + "loss": 0.78463888, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 4.1823179721832275 + }, + { + "auxiliary_loss_clip": 0.01069063, + "auxiliary_loss_mlp": 0.01038494, + "balance_loss_clip": 1.03514004, + "balance_loss_mlp": 1.02364051, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.6239567747682064, + "language_loss": 0.60491836, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62599397, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.705453872680664 + }, + { + "auxiliary_loss_clip": 0.01092286, + "auxiliary_loss_mlp": 0.01033717, + "balance_loss_clip": 1.03678179, + "balance_loss_mlp": 1.01933408, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.283967206244534, + "language_loss": 0.71206218, + "learning_rate": 3.326391068322232e-06, + "loss": 0.7333222, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 2.694857120513916 + }, + { + "auxiliary_loss_clip": 0.01102288, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.03681672, + "balance_loss_mlp": 1.01565051, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.62119752053764, + "language_loss": 0.7299341, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75124466, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.575272798538208 + }, + { + "auxiliary_loss_clip": 0.01061836, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.03603208, + "balance_loss_mlp": 1.01859665, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.9694359174790104, + "language_loss": 0.5810737, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60201961, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 2.6734817028045654 + }, + { + "auxiliary_loss_clip": 0.01110009, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.04127431, + "balance_loss_mlp": 1.019804, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 3.0793020622552625, + "language_loss": 0.86690331, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.888349, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.635289192199707 + }, + { + "auxiliary_loss_clip": 0.0109149, + "auxiliary_loss_mlp": 0.01046125, + "balance_loss_clip": 1.03776252, + "balance_loss_mlp": 1.03065205, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7676118109242935, + "language_loss": 0.67269075, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69406688, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.700444221496582 + }, + { + "auxiliary_loss_clip": 0.0109202, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.03871715, + "balance_loss_mlp": 1.01650167, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.956744170228753, + "language_loss": 0.70075333, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72197694, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.729978561401367 + }, + { + "auxiliary_loss_clip": 0.01104707, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.03808379, + "balance_loss_mlp": 1.01816893, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.555520962375328, + "language_loss": 0.73758268, + "learning_rate": 3.324641216731237e-06, + "loss": 0.7589578, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.6504979133605957 + }, + { + "auxiliary_loss_clip": 0.01096664, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_clip": 1.03580487, + "balance_loss_mlp": 1.02842927, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 1.9847417016691917, + "language_loss": 0.76908278, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79049754, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 2.5355374813079834 + }, + { + "auxiliary_loss_clip": 0.01098705, + "auxiliary_loss_mlp": 0.01036632, + "balance_loss_clip": 1.03722394, + "balance_loss_mlp": 1.02189124, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.6240554508362133, + "language_loss": 0.78098768, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80234098, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 2.6385204792022705 + }, + { + "auxiliary_loss_clip": 0.0108863, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.03783822, + "balance_loss_mlp": 1.01952291, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 3.4392134454934995, + "language_loss": 0.76142049, + "learning_rate": 3.323765612674296e-06, + "loss": 0.78264958, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 2.667367696762085 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.04065728, + "balance_loss_mlp": 1.0269767, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.4047417973778167, + "language_loss": 0.77610558, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79756099, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 2.611766815185547 + }, + { + "auxiliary_loss_clip": 0.01092919, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.03703511, + "balance_loss_mlp": 1.02495706, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.5801131826336177, + "language_loss": 0.78099877, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80232191, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 2.6717488765716553 + }, + { + "auxiliary_loss_clip": 0.01085104, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.03874278, + "balance_loss_mlp": 1.01941454, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.421422262940512, + "language_loss": 0.8774004, + "learning_rate": 3.322889556841445e-06, + "loss": 0.89859307, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 2.647667407989502 + }, + { + "auxiliary_loss_clip": 0.01101745, + "auxiliary_loss_mlp": 0.01051161, + "balance_loss_clip": 1.03826356, + "balance_loss_mlp": 1.03428149, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.8314101513190792, + "language_loss": 0.86218238, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88371146, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 2.643178939819336 + }, + { + "auxiliary_loss_clip": 0.01021709, + "auxiliary_loss_mlp": 0.01014194, + "balance_loss_clip": 1.00858259, + "balance_loss_mlp": 1.01235795, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.8130792096167444, + "language_loss": 0.6022262, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62258518, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.276231288909912 + }, + { + "auxiliary_loss_clip": 0.01088828, + "auxiliary_loss_mlp": 0.0075028, + "balance_loss_clip": 1.03504276, + "balance_loss_mlp": 1.00069404, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 1.9374152747482714, + "language_loss": 0.68293989, + "learning_rate": 3.322013049531664e-06, + "loss": 0.7013309, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.7134692668914795 + }, + { + "auxiliary_loss_clip": 0.01099965, + "auxiliary_loss_mlp": 0.00750113, + "balance_loss_clip": 1.03621995, + "balance_loss_mlp": 1.00063884, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 1.7923230197084754, + "language_loss": 0.83601701, + "learning_rate": 3.321720780151895e-06, + "loss": 0.85451782, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 2.7056496143341064 + }, + { + "auxiliary_loss_clip": 0.01115918, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.03977287, + "balance_loss_mlp": 1.02302551, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 1.8988500310241911, + "language_loss": 0.77307606, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79460496, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 2.558396816253662 + }, + { + "auxiliary_loss_clip": 0.01076572, + "auxiliary_loss_mlp": 0.01037021, + "balance_loss_clip": 1.03849769, + "balance_loss_mlp": 1.02135706, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.7893034178539136, + "language_loss": 0.68559563, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.70673156, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 2.673628330230713 + }, + { + "auxiliary_loss_clip": 0.01091683, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.03921556, + "balance_loss_mlp": 1.02327919, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.1116113981748668, + "language_loss": 0.75862163, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77989644, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 2.7336299419403076 + }, + { + "auxiliary_loss_clip": 0.01101821, + "auxiliary_loss_mlp": 0.01039856, + "balance_loss_clip": 1.03736508, + "balance_loss_mlp": 1.02612281, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.6150913741127677, + "language_loss": 0.91135842, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93277514, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.606710195541382 + }, + { + "auxiliary_loss_clip": 0.0110405, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.03715301, + "balance_loss_mlp": 1.01998925, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.6131520315472723, + "language_loss": 0.73496354, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75633818, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 2.625230073928833 + }, + { + "auxiliary_loss_clip": 0.01033679, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.03202629, + "balance_loss_mlp": 1.02042317, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.8190925689854567, + "language_loss": 0.77872157, + "learning_rate": 3.319966111745842e-06, + "loss": 0.79939437, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 2.8115394115448 + }, + { + "auxiliary_loss_clip": 0.01073646, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.03526807, + "balance_loss_mlp": 1.02556551, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 1.6466422968099863, + "language_loss": 0.8148483, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8360005, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.678070068359375 + }, + { + "auxiliary_loss_clip": 0.01062457, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.03942037, + "balance_loss_mlp": 1.02553737, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 2.11185866654308, + "language_loss": 0.84983587, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87086773, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.7473409175872803 + }, + { + "auxiliary_loss_clip": 0.0108013, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.03328264, + "balance_loss_mlp": 1.01913643, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.5907556557727187, + "language_loss": 0.75422072, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77534682, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 2.7725508213043213 + }, + { + "auxiliary_loss_clip": 0.01054121, + "auxiliary_loss_mlp": 0.01040401, + "balance_loss_clip": 1.03316283, + "balance_loss_mlp": 1.02544022, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 1.9325986826625474, + "language_loss": 0.7314899, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75243509, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 2.67164945602417 + }, + { + "auxiliary_loss_clip": 0.01059696, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.03309536, + "balance_loss_mlp": 1.01558959, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.5163352860789059, + "language_loss": 0.74382019, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76472306, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.7016818523406982 + }, + { + "auxiliary_loss_clip": 0.01087084, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.03659117, + "balance_loss_mlp": 1.01508546, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.6389610272050916, + "language_loss": 0.76257372, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78374034, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 2.6811349391937256 + }, + { + "auxiliary_loss_clip": 0.01101889, + "auxiliary_loss_mlp": 0.01041582, + "balance_loss_clip": 1.03663385, + "balance_loss_mlp": 1.02619171, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 1.9064193142107044, + "language_loss": 0.67471635, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69615102, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 4.199722528457642 + }, + { + "auxiliary_loss_clip": 0.01081148, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.03252745, + "balance_loss_mlp": 1.02571964, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.9545178965676115, + "language_loss": 0.77397263, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79518634, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.6585965156555176 + }, + { + "auxiliary_loss_clip": 0.01049825, + "auxiliary_loss_mlp": 0.01039491, + "balance_loss_clip": 1.03427088, + "balance_loss_mlp": 1.02373123, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 1.920198759671973, + "language_loss": 0.7289843, + "learning_rate": 3.317330731292164e-06, + "loss": 0.74987739, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.682814121246338 + }, + { + "auxiliary_loss_clip": 0.01104212, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.03632641, + "balance_loss_mlp": 1.02053714, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8631105709862696, + "language_loss": 0.78115225, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80254722, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.6096553802490234 + }, + { + "auxiliary_loss_clip": 0.01062633, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.03686547, + "balance_loss_mlp": 1.02045393, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.6693685739130952, + "language_loss": 0.77291435, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79389799, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.6082465648651123 + }, + { + "auxiliary_loss_clip": 0.01110539, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.04117429, + "balance_loss_mlp": 1.01682115, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.5187613815239793, + "language_loss": 0.69006133, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71147507, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 3.979058027267456 + }, + { + "auxiliary_loss_clip": 0.01092252, + "auxiliary_loss_mlp": 0.01035506, + "balance_loss_clip": 1.0346384, + "balance_loss_mlp": 1.02148747, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.026134603174205, + "language_loss": 0.82199895, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84327656, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 4.00825572013855 + }, + { + "auxiliary_loss_clip": 0.0110962, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.039276, + "balance_loss_mlp": 1.02217042, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 2.2401966727510336, + "language_loss": 0.67721438, + "learning_rate": 3.315864882155911e-06, + "loss": 0.69867259, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.430082082748413 + }, + { + "auxiliary_loss_clip": 0.01070375, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.03449333, + "balance_loss_mlp": 1.02112782, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.7976536877155649, + "language_loss": 0.73820239, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75926143, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.640474557876587 + }, + { + "auxiliary_loss_clip": 0.01076151, + "auxiliary_loss_mlp": 0.00750306, + "balance_loss_clip": 1.04002666, + "balance_loss_mlp": 1.00064802, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 1.986606382453765, + "language_loss": 0.66267848, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68094301, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 4.1373186111450195 + }, + { + "auxiliary_loss_clip": 0.01100798, + "auxiliary_loss_mlp": 0.01046625, + "balance_loss_clip": 1.03613615, + "balance_loss_mlp": 1.03206944, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.4677435402991144, + "language_loss": 0.70421791, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72569215, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 2.5466158390045166 + }, + { + "auxiliary_loss_clip": 0.01085663, + "auxiliary_loss_mlp": 0.00750258, + "balance_loss_clip": 1.0368824, + "balance_loss_mlp": 1.00072658, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 1.7317331138074632, + "language_loss": 0.83820838, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85656756, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 2.5749313831329346 + }, + { + "auxiliary_loss_clip": 0.01121007, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.04014099, + "balance_loss_mlp": 1.0264051, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.539923169003327, + "language_loss": 0.71546477, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73709106, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.511622428894043 + }, + { + "auxiliary_loss_clip": 0.01094738, + "auxiliary_loss_mlp": 0.01035359, + "balance_loss_clip": 1.03771901, + "balance_loss_mlp": 1.02064264, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 2.081102613361503, + "language_loss": 0.91972077, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.9410218, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.558422565460205 + }, + { + "auxiliary_loss_clip": 0.0110966, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.04137897, + "balance_loss_mlp": 1.02416253, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.1162212721154847, + "language_loss": 0.73379934, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75527787, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.5502829551696777 + }, + { + "auxiliary_loss_clip": 0.01094266, + "auxiliary_loss_mlp": 0.0104214, + "balance_loss_clip": 1.03652811, + "balance_loss_mlp": 1.02760792, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.1890735046997367, + "language_loss": 0.85341293, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87477696, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 2.4982967376708984 + }, + { + "auxiliary_loss_clip": 0.01086272, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.03591526, + "balance_loss_mlp": 1.0245266, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.532056530871692, + "language_loss": 0.7706573, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79190767, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 2.6407198905944824 + }, + { + "auxiliary_loss_clip": 0.01095844, + "auxiliary_loss_mlp": 0.01039865, + "balance_loss_clip": 1.03977811, + "balance_loss_mlp": 1.02612638, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.109489143176677, + "language_loss": 0.79919374, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.82055086, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.537447452545166 + }, + { + "auxiliary_loss_clip": 0.01087397, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.02150178, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.463270831125341, + "language_loss": 0.5523873, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57361698, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 2.69618558883667 + }, + { + "auxiliary_loss_clip": 0.01108528, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.03910148, + "balance_loss_mlp": 1.02057052, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.7331676537814462, + "language_loss": 0.84738278, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86882889, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.4975063800811768 + }, + { + "auxiliary_loss_clip": 0.0110828, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.03953171, + "balance_loss_mlp": 1.02617526, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.8640895379157354, + "language_loss": 0.72613084, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74761999, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 2.4749996662139893 + }, + { + "auxiliary_loss_clip": 0.01119581, + "auxiliary_loss_mlp": 0.01045464, + "balance_loss_clip": 1.0399828, + "balance_loss_mlp": 1.02945423, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 2.0866804308331828, + "language_loss": 0.77331102, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79496145, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 2.5611982345581055 + }, + { + "auxiliary_loss_clip": 0.01115296, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.03800392, + "balance_loss_mlp": 1.01838851, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.667413438522068, + "language_loss": 0.78403461, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80552113, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.585648775100708 + }, + { + "auxiliary_loss_clip": 0.01081225, + "auxiliary_loss_mlp": 0.01040812, + "balance_loss_clip": 1.038167, + "balance_loss_mlp": 1.02583289, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.8495543318171468, + "language_loss": 0.84924084, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87046123, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.787221670150757 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.03704691, + "balance_loss_mlp": 1.02075171, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.742936732341863, + "language_loss": 0.90779483, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92917466, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.5397071838378906 + }, + { + "auxiliary_loss_clip": 0.01109992, + "auxiliary_loss_mlp": 0.01039069, + "balance_loss_clip": 1.0396899, + "balance_loss_mlp": 1.02416193, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 1.643777244652686, + "language_loss": 0.86632907, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88781971, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.5374209880828857 + }, + { + "auxiliary_loss_clip": 0.01115059, + "auxiliary_loss_mlp": 0.01046559, + "balance_loss_clip": 1.04338038, + "balance_loss_mlp": 1.0309428, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 1.8303711755331578, + "language_loss": 0.73650151, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75811768, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 2.5949108600616455 + }, + { + "auxiliary_loss_clip": 0.01101939, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_clip": 1.03568006, + "balance_loss_mlp": 1.02529359, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 2.0663735177750286, + "language_loss": 0.73932326, + "learning_rate": 3.309989025093813e-06, + "loss": 0.7607581, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 2.5317678451538086 + }, + { + "auxiliary_loss_clip": 0.01115484, + "auxiliary_loss_mlp": 0.01042407, + "balance_loss_clip": 1.04544091, + "balance_loss_mlp": 1.02506185, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.81303620958441, + "language_loss": 0.69362473, + "learning_rate": 3.309694709912618e-06, + "loss": 0.7152037, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.526000499725342 + }, + { + "auxiliary_loss_clip": 0.0109293, + "auxiliary_loss_mlp": 0.00750372, + "balance_loss_clip": 1.03573895, + "balance_loss_mlp": 1.00073183, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 2.1399519434676573, + "language_loss": 0.78965402, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.80808711, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 2.5883560180664062 + }, + { + "auxiliary_loss_clip": 0.01073883, + "auxiliary_loss_mlp": 0.01049272, + "balance_loss_clip": 1.03128219, + "balance_loss_mlp": 1.0328567, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.7657072130599956, + "language_loss": 0.80862248, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.82985401, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.5570967197418213 + }, + { + "auxiliary_loss_clip": 0.0108857, + "auxiliary_loss_mlp": 0.01033789, + "balance_loss_clip": 1.03760529, + "balance_loss_mlp": 1.01987696, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.0991781989152236, + "language_loss": 0.58118272, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60240632, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 2.611870765686035 + }, + { + "auxiliary_loss_clip": 0.01094806, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.03767681, + "balance_loss_mlp": 1.02465677, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.978849513342615, + "language_loss": 0.75519705, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77653265, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 2.5622968673706055 + }, + { + "auxiliary_loss_clip": 0.01088253, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.03761482, + "balance_loss_mlp": 1.02505422, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 3.3285161090412623, + "language_loss": 0.61987615, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64117599, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.6690049171447754 + }, + { + "auxiliary_loss_clip": 0.01107636, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.0384376, + "balance_loss_mlp": 1.01997697, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 4.037811817747616, + "language_loss": 0.73003536, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75145257, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 2.5637166500091553 + }, + { + "auxiliary_loss_clip": 0.01081948, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.03713071, + "balance_loss_mlp": 1.02492476, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6104477358537674, + "language_loss": 0.81719065, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.83841336, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 2.676112174987793 + }, + { + "auxiliary_loss_clip": 0.01064423, + "auxiliary_loss_mlp": 0.01034124, + "balance_loss_clip": 1.0334053, + "balance_loss_mlp": 1.01975358, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.9110250246398461, + "language_loss": 0.8790772, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.90006268, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 2.6322596073150635 + }, + { + "auxiliary_loss_clip": 0.01122212, + "auxiliary_loss_mlp": 0.01039133, + "balance_loss_clip": 1.04095745, + "balance_loss_mlp": 1.0238024, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.181568950265625, + "language_loss": 0.81747812, + "learning_rate": 3.307043639752782e-06, + "loss": 0.8390916, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.540239095687866 + }, + { + "auxiliary_loss_clip": 0.01038485, + "auxiliary_loss_mlp": 0.01017534, + "balance_loss_clip": 1.01284647, + "balance_loss_mlp": 1.01581705, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7853460372839804, + "language_loss": 0.5727762, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59333634, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 2.933568000793457 + }, + { + "auxiliary_loss_clip": 0.01106875, + "auxiliary_loss_mlp": 0.00750155, + "balance_loss_clip": 1.04122019, + "balance_loss_mlp": 1.00072491, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.4202121830309729, + "language_loss": 0.86413574, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88270593, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 2.6445772647857666 + }, + { + "auxiliary_loss_clip": 0.01100827, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.03729439, + "balance_loss_mlp": 1.02114391, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.8362300118856472, + "language_loss": 0.72876209, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75011438, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 4.133903980255127 + }, + { + "auxiliary_loss_clip": 0.01104536, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.03947258, + "balance_loss_mlp": 1.02001524, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.5711683097791858, + "language_loss": 0.89867008, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92005688, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.51564621925354 + }, + { + "auxiliary_loss_clip": 0.01086783, + "auxiliary_loss_mlp": 0.0104832, + "balance_loss_clip": 1.03784943, + "balance_loss_mlp": 1.03219128, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.3190582753493238, + "language_loss": 0.83262116, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.8539722, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.627270460128784 + }, + { + "auxiliary_loss_clip": 0.01116167, + "auxiliary_loss_mlp": 0.01040917, + "balance_loss_clip": 1.03799188, + "balance_loss_mlp": 1.02721381, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.7521255323136429, + "language_loss": 0.7715348, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79310572, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.5287764072418213 + }, + { + "auxiliary_loss_clip": 0.01092987, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.03681207, + "balance_loss_mlp": 1.01964617, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 2.0887325271631476, + "language_loss": 0.81694019, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83821535, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 2.7473561763763428 + }, + { + "auxiliary_loss_clip": 0.01040088, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.03442514, + "balance_loss_mlp": 1.02349973, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.9440395627742615, + "language_loss": 0.844293, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.86507457, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 4.251288414001465 + }, + { + "auxiliary_loss_clip": 0.01098218, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.03497279, + "balance_loss_mlp": 1.01939738, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 2.0249507908645508, + "language_loss": 0.70030892, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.72162211, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 3.991194248199463 + }, + { + "auxiliary_loss_clip": 0.01097955, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.04027867, + "balance_loss_mlp": 1.0204165, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 1.885677525866867, + "language_loss": 0.90895844, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93028224, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.588031053543091 + }, + { + "auxiliary_loss_clip": 0.01120385, + "auxiliary_loss_mlp": 0.01039408, + "balance_loss_clip": 1.04149878, + "balance_loss_mlp": 1.02504945, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 2.2674273419036592, + "language_loss": 0.72027487, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74187279, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.5550878047943115 + }, + { + "auxiliary_loss_clip": 0.01093432, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_clip": 1.03736854, + "balance_loss_mlp": 1.02700198, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.8154044295449137, + "language_loss": 0.75817454, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77952421, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 4.040539741516113 + }, + { + "auxiliary_loss_clip": 0.01094353, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_clip": 1.04135954, + "balance_loss_mlp": 1.03091049, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.5632123667146383, + "language_loss": 0.68916512, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.71057224, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 2.5576884746551514 + }, + { + "auxiliary_loss_clip": 0.01096328, + "auxiliary_loss_mlp": 0.01045137, + "balance_loss_clip": 1.04005551, + "balance_loss_mlp": 1.02981257, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 3.500903585905781, + "language_loss": 0.74802357, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76943827, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.5736851692199707 + }, + { + "auxiliary_loss_clip": 0.01124045, + "auxiliary_loss_mlp": 0.00750266, + "balance_loss_clip": 1.04157877, + "balance_loss_mlp": 1.00078571, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 1.806846594551223, + "language_loss": 0.76868516, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78742826, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.5381839275360107 + }, + { + "auxiliary_loss_clip": 0.01092486, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.03878784, + "balance_loss_mlp": 1.02348661, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.6867324740711245, + "language_loss": 0.86378479, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88509262, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 2.607391595840454 + }, + { + "auxiliary_loss_clip": 0.01105617, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.03905106, + "balance_loss_mlp": 1.01899898, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.4365872665813342, + "language_loss": 0.81951535, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84091419, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 2.5637924671173096 + }, + { + "auxiliary_loss_clip": 0.01049081, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.03061998, + "balance_loss_mlp": 1.03380752, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.5080035130657814, + "language_loss": 0.85659498, + "learning_rate": 3.301729463727452e-06, + "loss": 0.8775816, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.6060352325439453 + }, + { + "auxiliary_loss_clip": 0.01084144, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.03891897, + "balance_loss_mlp": 1.01661551, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 1.9011064005279021, + "language_loss": 0.86266297, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88381267, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 2.572154998779297 + }, + { + "auxiliary_loss_clip": 0.01106644, + "auxiliary_loss_mlp": 0.01038981, + "balance_loss_clip": 1.04019094, + "balance_loss_mlp": 1.02479541, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.7539733427660318, + "language_loss": 0.80498129, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.82643753, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 2.502258539199829 + }, + { + "auxiliary_loss_clip": 0.0109861, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.03851485, + "balance_loss_mlp": 1.01971436, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 2.2288892430197422, + "language_loss": 0.7249245, + "learning_rate": 3.300842211064773e-06, + "loss": 0.7462796, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 2.6374728679656982 + }, + { + "auxiliary_loss_clip": 0.01091549, + "auxiliary_loss_mlp": 0.01045952, + "balance_loss_clip": 1.03759551, + "balance_loss_mlp": 1.02843964, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.105452492639403, + "language_loss": 0.72044051, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74181557, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 2.5421535968780518 + }, + { + "auxiliary_loss_clip": 0.01006977, + "auxiliary_loss_mlp": 0.01004972, + "balance_loss_clip": 1.01660442, + "balance_loss_mlp": 1.00302875, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8122733030970295, + "language_loss": 0.60685325, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62697268, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 3.103037118911743 + }, + { + "auxiliary_loss_clip": 0.01005623, + "auxiliary_loss_mlp": 0.01008215, + "balance_loss_clip": 1.02747154, + "balance_loss_mlp": 1.00655842, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.740022206194957, + "language_loss": 0.52347791, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54361629, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.1135082244873047 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01035804, + "balance_loss_clip": 1.03721428, + "balance_loss_mlp": 1.02133846, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 1.608178235487397, + "language_loss": 0.81751394, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83889174, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.5742127895355225 + }, + { + "auxiliary_loss_clip": 0.01063654, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.03322423, + "balance_loss_mlp": 1.01825023, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.7289992696185508, + "language_loss": 0.75251949, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77349174, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.6674067974090576 + }, + { + "auxiliary_loss_clip": 0.01099239, + "auxiliary_loss_mlp": 0.01041312, + "balance_loss_clip": 1.03805208, + "balance_loss_mlp": 1.02551711, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.7778094598063359, + "language_loss": 0.62551475, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64692026, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 2.591693162918091 + }, + { + "auxiliary_loss_clip": 0.01101913, + "auxiliary_loss_mlp": 0.01037972, + "balance_loss_clip": 1.03810775, + "balance_loss_mlp": 1.02326143, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4060916084345372, + "language_loss": 0.79794383, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81934273, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 2.619405746459961 + }, + { + "auxiliary_loss_clip": 0.01076243, + "auxiliary_loss_mlp": 0.01041804, + "balance_loss_clip": 1.03862464, + "balance_loss_mlp": 1.02641392, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.8314939537063508, + "language_loss": 0.73796666, + "learning_rate": 3.298474034352309e-06, + "loss": 0.75914711, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.7679669857025146 + }, + { + "auxiliary_loss_clip": 0.01067897, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.03689051, + "balance_loss_mlp": 1.01726818, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.7406091555665242, + "language_loss": 0.78124893, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80225062, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.6723759174346924 + }, + { + "auxiliary_loss_clip": 0.01091071, + "auxiliary_loss_mlp": 0.0104837, + "balance_loss_clip": 1.0403496, + "balance_loss_mlp": 1.03156114, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 1.7733125365074933, + "language_loss": 0.76864219, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79003656, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.553743362426758 + }, + { + "auxiliary_loss_clip": 0.01079645, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.0356729, + "balance_loss_mlp": 1.02026153, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5380498385652137, + "language_loss": 0.78004599, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80119908, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 2.665268659591675 + }, + { + "auxiliary_loss_clip": 0.01090561, + "auxiliary_loss_mlp": 0.01041996, + "balance_loss_clip": 1.03711057, + "balance_loss_mlp": 1.02515757, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 1.4784647137983002, + "language_loss": 0.75415558, + "learning_rate": 3.297288763918435e-06, + "loss": 0.7754811, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.626643419265747 + }, + { + "auxiliary_loss_clip": 0.01111789, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_clip": 1.0406419, + "balance_loss_mlp": 1.03301847, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.123553380074076, + "language_loss": 0.74017274, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.76178193, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 2.724736452102661 + }, + { + "auxiliary_loss_clip": 0.0108574, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.03864491, + "balance_loss_mlp": 1.02349782, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.730391920650179, + "language_loss": 0.70498323, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72623652, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 2.624498128890991 + }, + { + "auxiliary_loss_clip": 0.01097159, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.0390799, + "balance_loss_mlp": 1.02082515, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 2.2931470140073165, + "language_loss": 0.79825771, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.81959206, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 2.5771002769470215 + }, + { + "auxiliary_loss_clip": 0.01092611, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.03814054, + "balance_loss_mlp": 1.02298677, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.324614741876059, + "language_loss": 0.82910842, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85040522, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 2.6055848598480225 + }, + { + "auxiliary_loss_clip": 0.01064636, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.03557491, + "balance_loss_mlp": 1.01986408, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 1.9345940548219802, + "language_loss": 0.66918826, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69017076, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 2.629507303237915 + }, + { + "auxiliary_loss_clip": 0.01102624, + "auxiliary_loss_mlp": 0.00750519, + "balance_loss_clip": 1.04098535, + "balance_loss_mlp": 1.00069642, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 2.168216355423228, + "language_loss": 0.73979056, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.758322, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 2.5944344997406006 + }, + { + "auxiliary_loss_clip": 0.01087471, + "auxiliary_loss_mlp": 0.01042179, + "balance_loss_clip": 1.03971159, + "balance_loss_mlp": 1.02666974, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.1351033113133093, + "language_loss": 0.73260242, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75389892, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 2.622509479522705 + }, + { + "auxiliary_loss_clip": 0.01114953, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.03833294, + "balance_loss_mlp": 1.01838887, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 1.8860826867731972, + "language_loss": 0.83834273, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.85981286, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.4837381839752197 + }, + { + "auxiliary_loss_clip": 0.01101818, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.03619456, + "balance_loss_mlp": 1.0212692, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 1.9142598635495602, + "language_loss": 0.71363056, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73500776, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 2.6451003551483154 + }, + { + "auxiliary_loss_clip": 0.01067628, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.0360049, + "balance_loss_mlp": 1.02697265, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 2.113618715112224, + "language_loss": 0.82610077, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84718215, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 4.223731994628906 + }, + { + "auxiliary_loss_clip": 0.01090308, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.03403711, + "balance_loss_mlp": 1.01942217, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.427939413317003, + "language_loss": 0.74036276, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76161301, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.566790819168091 + }, + { + "auxiliary_loss_clip": 0.01035441, + "auxiliary_loss_mlp": 0.01045717, + "balance_loss_clip": 1.03237772, + "balance_loss_mlp": 1.02899194, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.6706241590392903, + "language_loss": 0.83927238, + "learning_rate": 3.293728232937228e-06, + "loss": 0.86008394, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.7349753379821777 + }, + { + "auxiliary_loss_clip": 0.0109138, + "auxiliary_loss_mlp": 0.01034764, + "balance_loss_clip": 1.03718519, + "balance_loss_mlp": 1.01967239, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.1628439929889764, + "language_loss": 0.73852432, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.75978577, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 2.552880048751831 + }, + { + "auxiliary_loss_clip": 0.01113706, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.03760529, + "balance_loss_mlp": 1.01993227, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.9971733504007803, + "language_loss": 0.75867414, + "learning_rate": 3.293134123765452e-06, + "loss": 0.7801497, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 2.4873733520507812 + }, + { + "auxiliary_loss_clip": 0.01067621, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.036255, + "balance_loss_mlp": 1.0207485, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 1.8257111701926252, + "language_loss": 0.72182012, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74285895, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 4.109066963195801 + }, + { + "auxiliary_loss_clip": 0.01106115, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.03678036, + "balance_loss_mlp": 1.02439189, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.6550243660531463, + "language_loss": 0.79104853, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81250799, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 2.535315990447998 + }, + { + "auxiliary_loss_clip": 0.01109589, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.03957462, + "balance_loss_mlp": 1.02324986, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5213306864866125, + "language_loss": 0.70276964, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72424769, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 4.0305328369140625 + }, + { + "auxiliary_loss_clip": 0.0108102, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.03845155, + "balance_loss_mlp": 1.02615571, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.7487908510201033, + "language_loss": 0.78698492, + "learning_rate": 3.291945317082743e-06, + "loss": 0.80820262, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 2.6197144985198975 + }, + { + "auxiliary_loss_clip": 0.01101592, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_clip": 1.03610086, + "balance_loss_mlp": 1.03166556, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.7193324199878488, + "language_loss": 0.79637295, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81784701, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 4.0156261920928955 + }, + { + "auxiliary_loss_clip": 0.01082329, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.03560483, + "balance_loss_mlp": 1.02505338, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.3283006508399686, + "language_loss": 0.74294293, + "learning_rate": 3.291350619752129e-06, + "loss": 0.76417822, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.5937740802764893 + }, + { + "auxiliary_loss_clip": 0.01104639, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.03747022, + "balance_loss_mlp": 1.02127206, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.8167550104521712, + "language_loss": 0.62771916, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64912105, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 2.5643067359924316 + }, + { + "auxiliary_loss_clip": 0.01104533, + "auxiliary_loss_mlp": 0.01039499, + "balance_loss_clip": 1.04014206, + "balance_loss_mlp": 1.02388275, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 2.4582147338797737, + "language_loss": 0.83272654, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85416687, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 2.5193231105804443 + }, + { + "auxiliary_loss_clip": 0.01087005, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.04114866, + "balance_loss_mlp": 1.01894236, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.1018231568041235, + "language_loss": 0.66565585, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68687558, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.6479969024658203 + }, + { + "auxiliary_loss_clip": 0.01100255, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.03555703, + "balance_loss_mlp": 1.01825571, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.95241568499515, + "language_loss": 0.71126449, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.7325865, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 2.5495948791503906 + }, + { + "auxiliary_loss_clip": 0.01121203, + "auxiliary_loss_mlp": 0.01043374, + "balance_loss_clip": 1.0419848, + "balance_loss_mlp": 1.02757859, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 1.7784759079312664, + "language_loss": 0.6609565, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68260229, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 2.545016050338745 + }, + { + "auxiliary_loss_clip": 0.01122415, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.04308891, + "balance_loss_mlp": 1.02151895, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.798298084360099, + "language_loss": 0.73798931, + "learning_rate": 3.289565352885785e-06, + "loss": 0.75957787, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 2.498335838317871 + }, + { + "auxiliary_loss_clip": 0.01081508, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.03334546, + "balance_loss_mlp": 1.02047896, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9218313072800919, + "language_loss": 0.71168476, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73285174, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.5531203746795654 + }, + { + "auxiliary_loss_clip": 0.0109546, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.03558326, + "balance_loss_mlp": 1.01547527, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 1.590178871592234, + "language_loss": 0.76727974, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.78854394, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.6396896839141846 + }, + { + "auxiliary_loss_clip": 0.011166, + "auxiliary_loss_mlp": 0.01031863, + "balance_loss_clip": 1.038697, + "balance_loss_mlp": 1.01826715, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.838201069088027, + "language_loss": 0.70158947, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72307414, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 2.4925739765167236 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.03789616, + "balance_loss_mlp": 1.02089489, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.8753465598525647, + "language_loss": 0.85110211, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87255853, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.4974629878997803 + }, + { + "auxiliary_loss_clip": 0.01084343, + "auxiliary_loss_mlp": 0.01047399, + "balance_loss_clip": 1.03605819, + "balance_loss_mlp": 1.0307219, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.6480434845479757, + "language_loss": 0.79282796, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81414533, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.559018135070801 + }, + { + "auxiliary_loss_clip": 0.0111822, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.03986919, + "balance_loss_mlp": 1.02563453, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.8451347891549483, + "language_loss": 0.85454893, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87613428, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.505262851715088 + }, + { + "auxiliary_loss_clip": 0.01082959, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.03545654, + "balance_loss_mlp": 1.01807892, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.5645520095511534, + "language_loss": 0.77965963, + "learning_rate": 3.287480316742863e-06, + "loss": 0.80081922, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 2.562889337539673 + }, + { + "auxiliary_loss_clip": 0.01088065, + "auxiliary_loss_mlp": 0.00750456, + "balance_loss_clip": 1.03727448, + "balance_loss_mlp": 1.00071216, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 1.5889342040394399, + "language_loss": 0.72236502, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74075019, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 2.6529688835144043 + }, + { + "auxiliary_loss_clip": 0.01110726, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.04292226, + "balance_loss_mlp": 1.01990414, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.7974391565948036, + "language_loss": 0.75732958, + "learning_rate": 3.286884152568687e-06, + "loss": 0.77879602, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.5857040882110596 + }, + { + "auxiliary_loss_clip": 0.01102273, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.03768396, + "balance_loss_mlp": 1.02352214, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.645835351107295, + "language_loss": 0.86214721, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88354939, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.5121967792510986 + }, + { + "auxiliary_loss_clip": 0.01094792, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.0402379, + "balance_loss_mlp": 1.02249146, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.4363147234167315, + "language_loss": 0.68571663, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.7070362, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.6493654251098633 + }, + { + "auxiliary_loss_clip": 0.01101394, + "auxiliary_loss_mlp": 0.01034345, + "balance_loss_clip": 1.04312205, + "balance_loss_mlp": 1.01937234, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 1.7744343413981276, + "language_loss": 0.75899351, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78035092, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 2.5747525691986084 + }, + { + "auxiliary_loss_clip": 0.01059122, + "auxiliary_loss_mlp": 0.01038676, + "balance_loss_clip": 1.03094172, + "balance_loss_mlp": 1.02164102, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.7869237081927147, + "language_loss": 0.68371874, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70469677, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.749055862426758 + }, + { + "auxiliary_loss_clip": 0.01104721, + "auxiliary_loss_mlp": 0.0075019, + "balance_loss_clip": 1.04084563, + "balance_loss_mlp": 1.00068331, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.8887639999328578, + "language_loss": 0.73624498, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75479406, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 2.5879876613616943 + }, + { + "auxiliary_loss_clip": 0.01106797, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.03730237, + "balance_loss_mlp": 1.02281392, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.4708546269510592, + "language_loss": 0.86244285, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.88388181, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 2.59094500541687 + }, + { + "auxiliary_loss_clip": 0.01097397, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.04107153, + "balance_loss_mlp": 1.02038252, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.213554781727403, + "language_loss": 0.86581069, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88715136, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 2.566080331802368 + }, + { + "auxiliary_loss_clip": 0.01101356, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.04000068, + "balance_loss_mlp": 1.02459395, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 2.028374953646516, + "language_loss": 0.78680825, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80820382, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.5826613903045654 + }, + { + "auxiliary_loss_clip": 0.01084618, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.03893113, + "balance_loss_mlp": 1.02848959, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.761390607156397, + "language_loss": 0.78086841, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80215722, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.642873525619507 + }, + { + "auxiliary_loss_clip": 0.01055191, + "auxiliary_loss_mlp": 0.01043383, + "balance_loss_clip": 1.03290856, + "balance_loss_mlp": 1.0245719, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 2.2304689924584364, + "language_loss": 0.71556103, + "learning_rate": 3.283900405580837e-06, + "loss": 0.73654675, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 2.9006991386413574 + }, + { + "auxiliary_loss_clip": 0.01096916, + "auxiliary_loss_mlp": 0.01038793, + "balance_loss_clip": 1.03882241, + "balance_loss_mlp": 1.0237962, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.6258881029970793, + "language_loss": 0.73494846, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75630552, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 2.572094678878784 + }, + { + "auxiliary_loss_clip": 0.01081918, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.03560472, + "balance_loss_mlp": 1.01898098, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 1.9032640306754365, + "language_loss": 0.80212462, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82327032, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.59092378616333 + }, + { + "auxiliary_loss_clip": 0.01083345, + "auxiliary_loss_mlp": 0.00750418, + "balance_loss_clip": 1.03563869, + "balance_loss_mlp": 1.00069296, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.606565899024543, + "language_loss": 0.70731121, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72564882, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 2.612497329711914 + }, + { + "auxiliary_loss_clip": 0.01082253, + "auxiliary_loss_mlp": 0.01037904, + "balance_loss_clip": 1.03976464, + "balance_loss_mlp": 1.02256179, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 1.7703459716186842, + "language_loss": 0.85258627, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87378788, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 4.180767297744751 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.03911805, + "balance_loss_mlp": 1.0198884, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.5935022425478673, + "language_loss": 0.66975355, + "learning_rate": 3.28240670566841e-06, + "loss": 0.6911881, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 2.5931804180145264 + }, + { + "auxiliary_loss_clip": 0.01092331, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.03441489, + "balance_loss_mlp": 1.01808178, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.824515794231855, + "language_loss": 0.7904644, + "learning_rate": 3.28210781975363e-06, + "loss": 0.81172955, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.5614938735961914 + }, + { + "auxiliary_loss_clip": 0.01116986, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.0406276, + "balance_loss_mlp": 1.01870692, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.8720504257361719, + "language_loss": 0.8262282, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84773016, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.5967540740966797 + }, + { + "auxiliary_loss_clip": 0.01072087, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.03613579, + "balance_loss_mlp": 1.0266664, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.049925503164795, + "language_loss": 0.86254871, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88369763, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 2.6354434490203857 + }, + { + "auxiliary_loss_clip": 0.01090952, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.04329419, + "balance_loss_mlp": 1.0177269, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.4297761338290553, + "language_loss": 0.81057453, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83180749, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 2.6849422454833984 + }, + { + "auxiliary_loss_clip": 0.01091718, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.03945637, + "balance_loss_mlp": 1.01941776, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.8097728311170385, + "language_loss": 0.6752013, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69645333, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 4.269383192062378 + }, + { + "auxiliary_loss_clip": 0.01092703, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.03903198, + "balance_loss_mlp": 1.01885939, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 2.2905833210685365, + "language_loss": 0.75321901, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77447963, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.5925233364105225 + }, + { + "auxiliary_loss_clip": 0.01105603, + "auxiliary_loss_mlp": 0.01041043, + "balance_loss_clip": 1.03953791, + "balance_loss_mlp": 1.02733397, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.759995644791917, + "language_loss": 0.77752399, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79899043, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 4.0515031814575195 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.03766322, + "balance_loss_mlp": 1.02330887, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 2.53895873918848, + "language_loss": 0.73527193, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75674641, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.5211219787597656 + }, + { + "auxiliary_loss_clip": 0.01103643, + "auxiliary_loss_mlp": 0.01036851, + "balance_loss_clip": 1.03700233, + "balance_loss_mlp": 1.02240288, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.6299594547736735, + "language_loss": 0.76010776, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78151274, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 3.9847288131713867 + }, + { + "auxiliary_loss_clip": 0.01112256, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.03861094, + "balance_loss_mlp": 1.02182555, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.8361502924449253, + "language_loss": 0.8174029, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83886719, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 2.4667444229125977 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.0391171, + "balance_loss_mlp": 1.02565765, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.7473739161110557, + "language_loss": 0.80795872, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82941735, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 2.523634672164917 + }, + { + "auxiliary_loss_clip": 0.01064297, + "auxiliary_loss_mlp": 0.01043925, + "balance_loss_clip": 1.03587759, + "balance_loss_mlp": 1.02740288, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.6643965408070835, + "language_loss": 0.71204436, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73312658, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 2.648449420928955 + }, + { + "auxiliary_loss_clip": 0.01084776, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.03738713, + "balance_loss_mlp": 1.02249026, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 2.042316456636056, + "language_loss": 0.70492232, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72613889, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 2.678415060043335 + }, + { + "auxiliary_loss_clip": 0.01094054, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.03807938, + "balance_loss_mlp": 1.02222013, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.9556316345179012, + "language_loss": 0.81822872, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83954138, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 2.5181260108947754 + }, + { + "auxiliary_loss_clip": 0.01102258, + "auxiliary_loss_mlp": 0.01035632, + "balance_loss_clip": 1.03707027, + "balance_loss_mlp": 1.02190471, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.333247639103657, + "language_loss": 0.75114357, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77252245, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 2.5694708824157715 + }, + { + "auxiliary_loss_clip": 0.01075514, + "auxiliary_loss_mlp": 0.00750187, + "balance_loss_clip": 1.03467274, + "balance_loss_mlp": 1.00062847, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 1.8346404359377624, + "language_loss": 0.71395528, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.73221231, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.6578874588012695 + }, + { + "auxiliary_loss_clip": 0.01104301, + "auxiliary_loss_mlp": 0.01039253, + "balance_loss_clip": 1.03793454, + "balance_loss_mlp": 1.02402973, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.183320877131569, + "language_loss": 0.76664901, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78808451, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.5640933513641357 + }, + { + "auxiliary_loss_clip": 0.01102288, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.03950691, + "balance_loss_mlp": 1.01967144, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.806358246501473, + "language_loss": 0.84536308, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.86672586, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.5661113262176514 + }, + { + "auxiliary_loss_clip": 0.01111133, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.03905964, + "balance_loss_mlp": 1.02157772, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 2.130529680346885, + "language_loss": 0.83577174, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85725808, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 2.550283193588257 + }, + { + "auxiliary_loss_clip": 0.0108113, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.03606319, + "balance_loss_mlp": 1.01944065, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 2.180654839686389, + "language_loss": 0.85893059, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.88007438, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 2.6397030353546143 + }, + { + "auxiliary_loss_clip": 0.01092215, + "auxiliary_loss_mlp": 0.01036785, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.02245593, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.364231854935075, + "language_loss": 0.72232074, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74361074, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 2.6444482803344727 + }, + { + "auxiliary_loss_clip": 0.01102751, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.03702593, + "balance_loss_mlp": 1.02168822, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.358793765108991, + "language_loss": 0.87939662, + "learning_rate": 3.275820002334819e-06, + "loss": 0.90078247, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 2.530714988708496 + }, + { + "auxiliary_loss_clip": 0.01081805, + "auxiliary_loss_mlp": 0.01045486, + "balance_loss_clip": 1.03322268, + "balance_loss_mlp": 1.02686572, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 1.8604453205045202, + "language_loss": 0.8294369, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.8507098, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 2.5077710151672363 + }, + { + "auxiliary_loss_clip": 0.01068054, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.03465652, + "balance_loss_mlp": 1.02504969, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6579067101560843, + "language_loss": 0.68067354, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70175701, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.620560646057129 + }, + { + "auxiliary_loss_clip": 0.01088611, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.03543794, + "balance_loss_mlp": 1.02239561, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.2694200884966444, + "language_loss": 0.74733371, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.7685951, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.6006081104278564 + }, + { + "auxiliary_loss_clip": 0.01105685, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.03697109, + "balance_loss_mlp": 1.01664352, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.9967127005176963, + "language_loss": 0.653283, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67465508, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.587139129638672 + }, + { + "auxiliary_loss_clip": 0.01062002, + "auxiliary_loss_mlp": 0.01049441, + "balance_loss_clip": 1.03200459, + "balance_loss_mlp": 1.03297782, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 1.9217533386865056, + "language_loss": 0.68378818, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.70490265, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 2.562913417816162 + }, + { + "auxiliary_loss_clip": 0.01109316, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.03590512, + "balance_loss_mlp": 1.02052581, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 2.04155272888582, + "language_loss": 0.78790438, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.80933344, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.4970924854278564 + }, + { + "auxiliary_loss_clip": 0.01090475, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.03828192, + "balance_loss_mlp": 1.0259012, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 1.8297436831995773, + "language_loss": 0.69489557, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.71619833, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.5878963470458984 + }, + { + "auxiliary_loss_clip": 0.01118187, + "auxiliary_loss_mlp": 0.010417, + "balance_loss_clip": 1.03843546, + "balance_loss_mlp": 1.02721596, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 3.1326373201166335, + "language_loss": 0.7862407, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80783957, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 2.485802173614502 + }, + { + "auxiliary_loss_clip": 0.01104167, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.03611779, + "balance_loss_mlp": 1.02070475, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.252660168676421, + "language_loss": 0.76062214, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78201103, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 2.5362741947174072 + }, + { + "auxiliary_loss_clip": 0.01116286, + "auxiliary_loss_mlp": 0.01039183, + "balance_loss_clip": 1.0375309, + "balance_loss_mlp": 1.0240674, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.8551378010317185, + "language_loss": 0.69999373, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72154844, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.4614932537078857 + }, + { + "auxiliary_loss_clip": 0.01084979, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.0327425, + "balance_loss_mlp": 1.01759207, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 1.9063394714680792, + "language_loss": 0.71464074, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73580384, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.587610960006714 + }, + { + "auxiliary_loss_clip": 0.01100989, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.03668177, + "balance_loss_mlp": 1.02384305, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 2.701090012297705, + "language_loss": 0.73820072, + "learning_rate": 3.272217377978061e-06, + "loss": 0.75958776, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.5466525554656982 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.03904855, + "balance_loss_mlp": 1.02243781, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.630960479801757, + "language_loss": 0.67413068, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69553471, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 2.5633602142333984 + }, + { + "auxiliary_loss_clip": 0.01104886, + "auxiliary_loss_mlp": 0.01034933, + "balance_loss_clip": 1.03787017, + "balance_loss_mlp": 1.02013898, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.8027165524259403, + "language_loss": 0.85272408, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87412226, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.511690378189087 + }, + { + "auxiliary_loss_clip": 0.01080916, + "auxiliary_loss_mlp": 0.01035304, + "balance_loss_clip": 1.03631639, + "balance_loss_mlp": 1.02225065, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.6106152478604367, + "language_loss": 0.78606588, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80722809, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 2.585367441177368 + }, + { + "auxiliary_loss_clip": 0.0108583, + "auxiliary_loss_mlp": 0.01039874, + "balance_loss_clip": 1.03462172, + "balance_loss_mlp": 1.02543771, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 1.8168032519340591, + "language_loss": 0.77240539, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79366249, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 2.6882336139678955 + }, + { + "auxiliary_loss_clip": 0.01082471, + "auxiliary_loss_mlp": 0.01035855, + "balance_loss_clip": 1.03547931, + "balance_loss_mlp": 1.02009571, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 2.149561433818372, + "language_loss": 0.8222425, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84342575, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.6401243209838867 + }, + { + "auxiliary_loss_clip": 0.01064341, + "auxiliary_loss_mlp": 0.00750016, + "balance_loss_clip": 1.03384614, + "balance_loss_mlp": 1.00059712, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.776249744115135, + "language_loss": 0.69869077, + "learning_rate": 3.270413459468905e-06, + "loss": 0.71683437, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 4.256669044494629 + }, + { + "auxiliary_loss_clip": 0.01094939, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.03538799, + "balance_loss_mlp": 1.02024484, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.736019820474443, + "language_loss": 0.82741338, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.8487047, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.5261924266815186 + }, + { + "auxiliary_loss_clip": 0.01077782, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.03862, + "balance_loss_mlp": 1.02263391, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.0109341537074714, + "language_loss": 0.73832762, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75949442, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 2.6451847553253174 + }, + { + "auxiliary_loss_clip": 0.01099666, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.03676701, + "balance_loss_mlp": 1.02260911, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.3967389492089095, + "language_loss": 0.74182403, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76319814, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.5882081985473633 + }, + { + "auxiliary_loss_clip": 0.01113969, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.03697956, + "balance_loss_mlp": 1.01853108, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.6724079389880573, + "language_loss": 0.71884727, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74031365, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.538071393966675 + }, + { + "auxiliary_loss_clip": 0.01096728, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.03471375, + "balance_loss_mlp": 1.01798999, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 1.8177155555447668, + "language_loss": 0.87498635, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89626175, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 4.099309206008911 + }, + { + "auxiliary_loss_clip": 0.01073226, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.03493392, + "balance_loss_mlp": 1.03016329, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 1.4335823077788328, + "language_loss": 0.7722711, + "learning_rate": 3.268607806688536e-06, + "loss": 0.79345876, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 2.6784167289733887 + }, + { + "auxiliary_loss_clip": 0.0107204, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.03376818, + "balance_loss_mlp": 1.02597785, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 3.3357857251962315, + "language_loss": 0.77517724, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79629874, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 2.5315163135528564 + }, + { + "auxiliary_loss_clip": 0.01085844, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.03455687, + "balance_loss_mlp": 1.02252769, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.8148956226699757, + "language_loss": 0.74244559, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76366115, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 4.013225793838501 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.00749788, + "balance_loss_clip": 1.0374043, + "balance_loss_mlp": 1.000561, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.8509407510358595, + "language_loss": 0.80330241, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82189894, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 4.0511462688446045 + }, + { + "auxiliary_loss_clip": 0.01086904, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.0371778, + "balance_loss_mlp": 1.02072823, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.6005438702524744, + "language_loss": 0.82172775, + "learning_rate": 3.267403075901438e-06, + "loss": 0.84293473, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 2.581519365310669 + }, + { + "auxiliary_loss_clip": 0.01006476, + "auxiliary_loss_mlp": 0.01004602, + "balance_loss_clip": 1.01867485, + "balance_loss_mlp": 1.00317121, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.75758485639101, + "language_loss": 0.59457672, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61468756, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 3.3075122833251953 + }, + { + "auxiliary_loss_clip": 0.01116063, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.03931272, + "balance_loss_mlp": 1.01753497, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.6391439765794682, + "language_loss": 0.71495748, + "learning_rate": 3.266800422101892e-06, + "loss": 0.7364282, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 2.5274250507354736 + }, + { + "auxiliary_loss_clip": 0.0107125, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.03689313, + "balance_loss_mlp": 1.01666474, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 1.6262173307282424, + "language_loss": 0.69459176, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71560663, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 2.6532089710235596 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01028102, + "balance_loss_clip": 1.03721333, + "balance_loss_mlp": 1.01500106, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.38724632854104, + "language_loss": 0.7757138, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79700083, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.546417474746704 + }, + { + "auxiliary_loss_clip": 0.01114073, + "auxiliary_loss_mlp": 0.00749897, + "balance_loss_clip": 1.03791356, + "balance_loss_mlp": 1.00067854, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.6178424791388366, + "language_loss": 0.72485745, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74349719, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.572946071624756 + }, + { + "auxiliary_loss_clip": 0.01102405, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.03709435, + "balance_loss_mlp": 1.01610005, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 1.580310174217308, + "language_loss": 0.80717349, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.82851529, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.559501886367798 + }, + { + "auxiliary_loss_clip": 0.01051502, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.03131199, + "balance_loss_mlp": 1.02288699, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.8535456742760585, + "language_loss": 0.72138929, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74227154, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.641176223754883 + }, + { + "auxiliary_loss_clip": 0.01087815, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.03419459, + "balance_loss_mlp": 1.01651907, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.724636288166968, + "language_loss": 0.75713557, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.77830803, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 2.551827907562256 + }, + { + "auxiliary_loss_clip": 0.01103551, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.0361321, + "balance_loss_mlp": 1.01734757, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.6319441534102157, + "language_loss": 0.81886834, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84021455, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 2.6193671226501465 + }, + { + "auxiliary_loss_clip": 0.01078951, + "auxiliary_loss_mlp": 0.01033646, + "balance_loss_clip": 1.03689218, + "balance_loss_mlp": 1.01913822, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 1.9720644099065874, + "language_loss": 0.73701739, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.75814337, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 2.59132981300354 + }, + { + "auxiliary_loss_clip": 0.01054307, + "auxiliary_loss_mlp": 0.00749954, + "balance_loss_clip": 1.03559113, + "balance_loss_mlp": 1.00064969, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.643390960318682, + "language_loss": 0.76722175, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78526437, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.7104580402374268 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.03800333, + "balance_loss_mlp": 1.02073181, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.9157875369328488, + "language_loss": 0.82721698, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84872639, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 2.498340129852295 + }, + { + "auxiliary_loss_clip": 0.01089051, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.03510165, + "balance_loss_mlp": 1.02185857, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5439297349649233, + "language_loss": 0.70975369, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73100251, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.5541584491729736 + }, + { + "auxiliary_loss_clip": 0.01113386, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.03751814, + "balance_loss_mlp": 1.01687837, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 2.279346812356842, + "language_loss": 0.69545031, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71689373, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.496209144592285 + }, + { + "auxiliary_loss_clip": 0.01087971, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.03719687, + "balance_loss_mlp": 1.02050233, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 1.9570307116382786, + "language_loss": 0.67587078, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69710195, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 2.5647330284118652 + }, + { + "auxiliary_loss_clip": 0.01086148, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.03587437, + "balance_loss_mlp": 1.02466464, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.710230576933797, + "language_loss": 0.82226801, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84351552, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.62384033203125 + }, + { + "auxiliary_loss_clip": 0.01090717, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.03569746, + "balance_loss_mlp": 1.02062964, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 1.7321488297928873, + "language_loss": 0.89318854, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91444975, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.588599681854248 + }, + { + "auxiliary_loss_clip": 0.0107689, + "auxiliary_loss_mlp": 0.01039586, + "balance_loss_clip": 1.03511453, + "balance_loss_mlp": 1.02499473, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.074624567836097, + "language_loss": 0.71280289, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73396766, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 2.6428964138031006 + }, + { + "auxiliary_loss_clip": 0.0105883, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.03823411, + "balance_loss_mlp": 1.02449358, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 1.6219713765627526, + "language_loss": 0.72966063, + "learning_rate": 3.26167011603268e-06, + "loss": 0.7506308, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 2.771577835083008 + }, + { + "auxiliary_loss_clip": 0.01115005, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.03745222, + "balance_loss_mlp": 1.02359331, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.7203380781146387, + "language_loss": 0.7702207, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79174376, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 2.5129740238189697 + }, + { + "auxiliary_loss_clip": 0.01077438, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.03825951, + "balance_loss_mlp": 1.01870513, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.5990000170284326, + "language_loss": 0.81774652, + "learning_rate": 3.261065640514415e-06, + "loss": 0.83886641, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.6491405963897705 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.03517747, + "balance_loss_mlp": 1.01643562, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 2.775885564921913, + "language_loss": 0.74668974, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76807666, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.55590558052063 + }, + { + "auxiliary_loss_clip": 0.01101606, + "auxiliary_loss_mlp": 0.00749949, + "balance_loss_clip": 1.03671551, + "balance_loss_mlp": 1.00063884, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.6857601553917547, + "language_loss": 0.84272701, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86124253, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.585177421569824 + }, + { + "auxiliary_loss_clip": 0.01100579, + "auxiliary_loss_mlp": 0.01034896, + "balance_loss_clip": 1.03532279, + "balance_loss_mlp": 1.02054286, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.849229026071953, + "language_loss": 0.75701565, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.77837038, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 2.54299259185791 + }, + { + "auxiliary_loss_clip": 0.01087923, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.035882, + "balance_loss_mlp": 1.02292418, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 1.8268483157795374, + "language_loss": 0.62386239, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64512599, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 2.6432912349700928 + }, + { + "auxiliary_loss_clip": 0.01089033, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_clip": 1.03912127, + "balance_loss_mlp": 1.0261296, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 2.278909186927108, + "language_loss": 0.83125871, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85256201, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.55533766746521 + }, + { + "auxiliary_loss_clip": 0.01111707, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.03745556, + "balance_loss_mlp": 1.02270412, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.076406270987088, + "language_loss": 0.62965369, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65113556, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 2.551811933517456 + }, + { + "auxiliary_loss_clip": 0.01102633, + "auxiliary_loss_mlp": 0.01025983, + "balance_loss_clip": 1.03687501, + "balance_loss_mlp": 1.0118804, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.8551161013796684, + "language_loss": 0.74921525, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77050138, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.5738236904144287 + }, + { + "auxiliary_loss_clip": 0.01075788, + "auxiliary_loss_mlp": 0.01041874, + "balance_loss_clip": 1.03479004, + "balance_loss_mlp": 1.02727687, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 3.3495134287253006, + "language_loss": 0.75786656, + "learning_rate": 3.258645826569261e-06, + "loss": 0.7790432, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 2.59145450592041 + }, + { + "auxiliary_loss_clip": 0.01118599, + "auxiliary_loss_mlp": 0.00750138, + "balance_loss_clip": 1.0386337, + "balance_loss_mlp": 1.00073051, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.853210139469103, + "language_loss": 0.81728524, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83597267, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 4.137121200561523 + }, + { + "auxiliary_loss_clip": 0.0108333, + "auxiliary_loss_mlp": 0.01042691, + "balance_loss_clip": 1.03335726, + "balance_loss_mlp": 1.02601337, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.689049832139184, + "language_loss": 0.76286775, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78412795, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 2.5961053371429443 + }, + { + "auxiliary_loss_clip": 0.01080021, + "auxiliary_loss_mlp": 0.01036837, + "balance_loss_clip": 1.03801608, + "balance_loss_mlp": 1.02179313, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 1.893392265050062, + "language_loss": 0.70882887, + "learning_rate": 3.257737608512723e-06, + "loss": 0.72999752, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.6482784748077393 + }, + { + "auxiliary_loss_clip": 0.01109444, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.04025471, + "balance_loss_mlp": 1.02549422, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.1169696918774505, + "language_loss": 0.76447338, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78597462, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.5191683769226074 + }, + { + "auxiliary_loss_clip": 0.01092674, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.03790784, + "balance_loss_mlp": 1.01943588, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.870827863324128, + "language_loss": 0.74667263, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76793051, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.599066734313965 + }, + { + "auxiliary_loss_clip": 0.01119911, + "auxiliary_loss_mlp": 0.01037544, + "balance_loss_clip": 1.03934622, + "balance_loss_mlp": 1.02120018, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.221687557513424, + "language_loss": 0.7546953, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77626985, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 3.9478726387023926 + }, + { + "auxiliary_loss_clip": 0.01094227, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.03913498, + "balance_loss_mlp": 1.02294159, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.6197306062773242, + "language_loss": 0.79386497, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81517828, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 2.575962781906128 + }, + { + "auxiliary_loss_clip": 0.01064257, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.03419828, + "balance_loss_mlp": 1.01456332, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.65259664556924, + "language_loss": 0.7467826, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76770604, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.619168281555176 + }, + { + "auxiliary_loss_clip": 0.01061047, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_clip": 1.03415966, + "balance_loss_mlp": 1.02908778, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 2.0140751918606723, + "language_loss": 0.6704343, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69148529, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 4.086487770080566 + }, + { + "auxiliary_loss_clip": 0.011064, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.03905797, + "balance_loss_mlp": 1.01964998, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 2.6824240811601694, + "language_loss": 0.79587519, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.81727624, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 4.019045114517212 + }, + { + "auxiliary_loss_clip": 0.01104057, + "auxiliary_loss_mlp": 0.00750016, + "balance_loss_clip": 1.0386889, + "balance_loss_mlp": 1.000633, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.0746687184984514, + "language_loss": 0.81157923, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83011997, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 2.5998942852020264 + }, + { + "auxiliary_loss_clip": 0.01101282, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.03790307, + "balance_loss_mlp": 1.02044702, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.6794216806780609, + "language_loss": 0.71709508, + "learning_rate": 3.255010380132783e-06, + "loss": 0.73845041, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.6234536170959473 + }, + { + "auxiliary_loss_clip": 0.01106659, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.03702235, + "balance_loss_mlp": 1.02004111, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.8755373540307192, + "language_loss": 0.7293874, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75081199, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.6152241230010986 + }, + { + "auxiliary_loss_clip": 0.01085065, + "auxiliary_loss_mlp": 0.00750311, + "balance_loss_clip": 1.03388405, + "balance_loss_mlp": 1.0007019, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 1.8054840660377123, + "language_loss": 0.71223283, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73058665, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.7220568656921387 + }, + { + "auxiliary_loss_clip": 0.01074211, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.03403258, + "balance_loss_mlp": 1.02142513, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.115474678953527, + "language_loss": 0.77986467, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80099297, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 2.5964365005493164 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.03690815, + "balance_loss_mlp": 1.02128303, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.640867073420459, + "language_loss": 0.78125077, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.8027209, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.4989192485809326 + }, + { + "auxiliary_loss_clip": 0.0107793, + "auxiliary_loss_mlp": 0.0104263, + "balance_loss_clip": 1.0342077, + "balance_loss_mlp": 1.02659643, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.7538999362020609, + "language_loss": 0.76341683, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78462243, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.509398937225342 + }, + { + "auxiliary_loss_clip": 0.01106134, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.03705752, + "balance_loss_mlp": 1.02321374, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 1.944266255101298, + "language_loss": 0.72650874, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74796051, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 2.5786468982696533 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.03742051, + "balance_loss_mlp": 1.02106547, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 3.089617888619674, + "language_loss": 0.79574561, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81716979, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 2.5265181064605713 + }, + { + "auxiliary_loss_clip": 0.01092941, + "auxiliary_loss_mlp": 0.01038029, + "balance_loss_clip": 1.03914988, + "balance_loss_mlp": 1.02272224, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 2.041690448705591, + "language_loss": 0.77044261, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79175234, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 2.6123483180999756 + }, + { + "auxiliary_loss_clip": 0.01096505, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_clip": 1.03700161, + "balance_loss_mlp": 1.02807522, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.6846576810227865, + "language_loss": 0.7640239, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78542536, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 2.6309313774108887 + }, + { + "auxiliary_loss_clip": 0.01051799, + "auxiliary_loss_mlp": 0.01051124, + "balance_loss_clip": 1.03838086, + "balance_loss_mlp": 1.03270602, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.9534691536769355, + "language_loss": 0.71838975, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.73941898, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.7318856716156006 + }, + { + "auxiliary_loss_clip": 0.01097468, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.04004753, + "balance_loss_mlp": 1.02112424, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 1.7832993551402114, + "language_loss": 0.82021666, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84154642, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.5681281089782715 + }, + { + "auxiliary_loss_clip": 0.01117512, + "auxiliary_loss_mlp": 0.00750216, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.00070691, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.6567076900577187, + "language_loss": 0.74887115, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.76754844, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.5508878231048584 + }, + { + "auxiliary_loss_clip": 0.01093736, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.03844917, + "balance_loss_mlp": 1.02401447, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 2.7766637708406248, + "language_loss": 0.75857955, + "learning_rate": 3.251064247058868e-06, + "loss": 0.77990174, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 2.582843542098999 + }, + { + "auxiliary_loss_clip": 0.01101719, + "auxiliary_loss_mlp": 0.01036226, + "balance_loss_clip": 1.03771019, + "balance_loss_mlp": 1.02220654, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.8236002958185529, + "language_loss": 0.80404782, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82542729, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 2.579763174057007 + }, + { + "auxiliary_loss_clip": 0.01105272, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.03802478, + "balance_loss_mlp": 1.01489711, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 1.9234020023289482, + "language_loss": 0.81724238, + "learning_rate": 3.250456437422258e-06, + "loss": 0.83858478, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.5116004943847656 + }, + { + "auxiliary_loss_clip": 0.0111605, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.03775299, + "balance_loss_mlp": 1.01875472, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.863627532087401, + "language_loss": 0.77799153, + "learning_rate": 3.250152461472041e-06, + "loss": 0.79949582, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 2.534278631210327 + }, + { + "auxiliary_loss_clip": 0.01071698, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.0377183, + "balance_loss_mlp": 1.01982701, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.7681124086664213, + "language_loss": 0.84107399, + "learning_rate": 3.249848438115917e-06, + "loss": 0.8621366, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.698129177093506 + }, + { + "auxiliary_loss_clip": 0.01116319, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.03693676, + "balance_loss_mlp": 1.02743292, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.7134759874472665, + "language_loss": 0.8530547, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87464392, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 2.5336899757385254 + }, + { + "auxiliary_loss_clip": 0.01078044, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.03465199, + "balance_loss_mlp": 1.01736927, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 2.150028120380395, + "language_loss": 0.79297256, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81408823, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 2.602177143096924 + }, + { + "auxiliary_loss_clip": 0.01079444, + "auxiliary_loss_mlp": 0.01045064, + "balance_loss_clip": 1.03802705, + "balance_loss_mlp": 1.0287025, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 2.1998370013437505, + "language_loss": 0.80054295, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82178801, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.6190929412841797 + }, + { + "auxiliary_loss_clip": 0.01118562, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.0407269, + "balance_loss_mlp": 1.02040803, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.7651850608580695, + "language_loss": 0.89053142, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91208178, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.518789768218994 + }, + { + "auxiliary_loss_clip": 0.01092896, + "auxiliary_loss_mlp": 0.0103988, + "balance_loss_clip": 1.03459382, + "balance_loss_mlp": 1.02481794, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.664545239495233, + "language_loss": 0.73838675, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.75971454, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 2.5626120567321777 + }, + { + "auxiliary_loss_clip": 0.01102071, + "auxiliary_loss_mlp": 0.00750325, + "balance_loss_clip": 1.036098, + "balance_loss_mlp": 1.0005846, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8646288618669231, + "language_loss": 0.73163593, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75015986, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.579707384109497 + }, + { + "auxiliary_loss_clip": 0.01094447, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.03871679, + "balance_loss_mlp": 1.02463245, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 1.8506473884258678, + "language_loss": 0.87439454, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89574194, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 2.6171810626983643 + }, + { + "auxiliary_loss_clip": 0.01083371, + "auxiliary_loss_mlp": 0.0104046, + "balance_loss_clip": 1.03611875, + "balance_loss_mlp": 1.02437913, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.200933081311276, + "language_loss": 0.7136215, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73485982, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 2.5968990325927734 + }, + { + "auxiliary_loss_clip": 0.01079408, + "auxiliary_loss_mlp": 0.01045862, + "balance_loss_clip": 1.03527021, + "balance_loss_mlp": 1.03030562, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 2.030833896863177, + "language_loss": 0.71905935, + "learning_rate": 3.247110096547814e-06, + "loss": 0.7403121, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 2.618973970413208 + }, + { + "auxiliary_loss_clip": 0.01087549, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.03662193, + "balance_loss_mlp": 1.02437472, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.5552968368726445, + "language_loss": 0.85609549, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87735927, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.5841431617736816 + }, + { + "auxiliary_loss_clip": 0.01084791, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.03475618, + "balance_loss_mlp": 1.01691484, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.6158189507285239, + "language_loss": 0.670358, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69151866, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.649078130722046 + }, + { + "auxiliary_loss_clip": 0.01102208, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.03723717, + "balance_loss_mlp": 1.01476789, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 2.5962747184890995, + "language_loss": 0.77375644, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79505944, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 2.5865814685821533 + }, + { + "auxiliary_loss_clip": 0.01116661, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.03929198, + "balance_loss_mlp": 1.02208424, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.769643348977224, + "language_loss": 0.66982019, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69134986, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 4.056272268295288 + }, + { + "auxiliary_loss_clip": 0.01111435, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.04116917, + "balance_loss_mlp": 1.02289701, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 2.0375787479005765, + "language_loss": 0.79488707, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81639761, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 2.603482723236084 + }, + { + "auxiliary_loss_clip": 0.010805, + "auxiliary_loss_mlp": 0.00750114, + "balance_loss_clip": 1.03778768, + "balance_loss_mlp": 1.00067878, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.86594732019452, + "language_loss": 0.76582468, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.78413081, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.645775556564331 + }, + { + "auxiliary_loss_clip": 0.01076764, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.03582883, + "balance_loss_mlp": 1.01903844, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.7753564931101844, + "language_loss": 0.62124598, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64236748, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 2.654038667678833 + }, + { + "auxiliary_loss_clip": 0.01109031, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.03978086, + "balance_loss_mlp": 1.0231142, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 2.00334803326734, + "language_loss": 0.82798886, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84945738, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 4.1316704750061035 + }, + { + "auxiliary_loss_clip": 0.01094417, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.03844082, + "balance_loss_mlp": 1.02430296, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.6917775496285212, + "language_loss": 0.76009631, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78143185, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 2.5963361263275146 + }, + { + "auxiliary_loss_clip": 0.01075766, + "auxiliary_loss_mlp": 0.0104026, + "balance_loss_clip": 1.03767884, + "balance_loss_mlp": 1.02340341, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.1771699025591063, + "language_loss": 0.71280372, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.73396397, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 2.6397266387939453 + }, + { + "auxiliary_loss_clip": 0.01079246, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.04355049, + "balance_loss_mlp": 1.0204668, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.846661243525308, + "language_loss": 0.74535984, + "learning_rate": 3.243758033520219e-06, + "loss": 0.7665025, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 4.201795339584351 + }, + { + "auxiliary_loss_clip": 0.01110466, + "auxiliary_loss_mlp": 0.01050365, + "balance_loss_clip": 1.04063988, + "balance_loss_mlp": 1.03366351, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.9429377178379155, + "language_loss": 0.79874784, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82035625, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 4.085798501968384 + }, + { + "auxiliary_loss_clip": 0.01101979, + "auxiliary_loss_mlp": 0.01041057, + "balance_loss_clip": 1.03520608, + "balance_loss_mlp": 1.02649009, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.7024636073428319, + "language_loss": 0.79928106, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82071143, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.5275235176086426 + }, + { + "auxiliary_loss_clip": 0.0108071, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.03670728, + "balance_loss_mlp": 1.01777589, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.4134982253587716, + "language_loss": 0.82509631, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84622657, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.6759321689605713 + }, + { + "auxiliary_loss_clip": 0.01034414, + "auxiliary_loss_mlp": 0.0100162, + "balance_loss_clip": 1.01899362, + "balance_loss_mlp": 1.00010061, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.9360308818414319, + "language_loss": 0.58620191, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60656226, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.2874598503112793 + }, + { + "auxiliary_loss_clip": 0.01111835, + "auxiliary_loss_mlp": 0.00750225, + "balance_loss_clip": 1.03982806, + "balance_loss_mlp": 1.00064421, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.881277548172833, + "language_loss": 0.83450127, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85312194, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.624129056930542 + }, + { + "auxiliary_loss_clip": 0.01120337, + "auxiliary_loss_mlp": 0.01037834, + "balance_loss_clip": 1.04096675, + "balance_loss_mlp": 1.02380335, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 2.0847610460290924, + "language_loss": 0.79702818, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.81860983, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.521439552307129 + }, + { + "auxiliary_loss_clip": 0.01102107, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.03766942, + "balance_loss_mlp": 1.016132, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.8852162759286049, + "language_loss": 0.64676791, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66811121, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 2.539046049118042 + }, + { + "auxiliary_loss_clip": 0.01075919, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.03958428, + "balance_loss_mlp": 1.02321959, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.6561517644408248, + "language_loss": 0.86616749, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88730627, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 2.709801435470581 + }, + { + "auxiliary_loss_clip": 0.01064846, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.03443182, + "balance_loss_mlp": 1.01929271, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.560424521244644, + "language_loss": 0.68641537, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70740557, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 2.7008564472198486 + }, + { + "auxiliary_loss_clip": 0.01108206, + "auxiliary_loss_mlp": 0.00750115, + "balance_loss_clip": 1.03907418, + "balance_loss_mlp": 1.00058174, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.900706841127015, + "language_loss": 0.71793938, + "learning_rate": 3.240705750931993e-06, + "loss": 0.73652256, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.593543529510498 + }, + { + "auxiliary_loss_clip": 0.01008558, + "auxiliary_loss_mlp": 0.01007318, + "balance_loss_clip": 1.01298225, + "balance_loss_mlp": 1.00586915, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8389993055920841, + "language_loss": 0.59288275, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61304152, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 3.108314037322998 + }, + { + "auxiliary_loss_clip": 0.01094414, + "auxiliary_loss_mlp": 0.01037021, + "balance_loss_clip": 1.03658605, + "balance_loss_mlp": 1.0213567, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.4595925889296386, + "language_loss": 0.73036468, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75167894, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.6456573009490967 + }, + { + "auxiliary_loss_clip": 0.01075729, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.03478909, + "balance_loss_mlp": 1.01710367, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.784303641833504, + "language_loss": 0.71135569, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73242509, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 2.662785768508911 + }, + { + "auxiliary_loss_clip": 0.01112479, + "auxiliary_loss_mlp": 0.00750001, + "balance_loss_clip": 1.03855062, + "balance_loss_mlp": 1.00053716, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7825905168609038, + "language_loss": 0.90464318, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92326796, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.494284152984619 + }, + { + "auxiliary_loss_clip": 0.01099948, + "auxiliary_loss_mlp": 0.01045481, + "balance_loss_clip": 1.03898859, + "balance_loss_mlp": 1.03032982, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.7764926599324202, + "language_loss": 0.67356461, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69501889, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 2.6809189319610596 + }, + { + "auxiliary_loss_clip": 0.01105351, + "auxiliary_loss_mlp": 0.01043329, + "balance_loss_clip": 1.03975415, + "balance_loss_mlp": 1.02803457, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.3542553064012495, + "language_loss": 0.82869387, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85018063, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 2.55901837348938 + }, + { + "auxiliary_loss_clip": 0.01006072, + "auxiliary_loss_mlp": 0.01004736, + "balance_loss_clip": 1.00925434, + "balance_loss_mlp": 1.00305557, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7066182877338766, + "language_loss": 0.55304146, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57314956, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.1867074966430664 + }, + { + "auxiliary_loss_clip": 0.01096276, + "auxiliary_loss_mlp": 0.00750167, + "balance_loss_clip": 1.03809714, + "balance_loss_mlp": 1.00061309, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 1.9919206507180152, + "language_loss": 0.76164287, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78010732, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 3.0030014514923096 + }, + { + "auxiliary_loss_clip": 0.01080994, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.03613234, + "balance_loss_mlp": 1.01939571, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 1.7365991177186921, + "language_loss": 0.80157471, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82271117, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.590181827545166 + }, + { + "auxiliary_loss_clip": 0.0105296, + "auxiliary_loss_mlp": 0.01043785, + "balance_loss_clip": 1.03237367, + "balance_loss_mlp": 1.02688098, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.3664000489630612, + "language_loss": 0.8142668, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83523428, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 2.6920788288116455 + }, + { + "auxiliary_loss_clip": 0.01106045, + "auxiliary_loss_mlp": 0.01038053, + "balance_loss_clip": 1.03727174, + "balance_loss_mlp": 1.0217092, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.370874098121356, + "language_loss": 0.77455509, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79599607, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 2.581437826156616 + }, + { + "auxiliary_loss_clip": 0.01078957, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.03624773, + "balance_loss_mlp": 1.02480078, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 4.700228518920098, + "language_loss": 0.78215188, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80332625, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.641103982925415 + }, + { + "auxiliary_loss_clip": 0.01096197, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.03746486, + "balance_loss_mlp": 1.02269459, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.05927341440291, + "language_loss": 0.86896193, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89030313, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.5716984272003174 + }, + { + "auxiliary_loss_clip": 0.01106673, + "auxiliary_loss_mlp": 0.01039962, + "balance_loss_clip": 1.03824866, + "balance_loss_mlp": 1.02501273, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.6302708144763414, + "language_loss": 0.790398, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81186438, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 2.5593924522399902 + }, + { + "auxiliary_loss_clip": 0.01085149, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.03786612, + "balance_loss_mlp": 1.02102947, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.8028203403065317, + "language_loss": 0.71902657, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74023664, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 2.626241683959961 + }, + { + "auxiliary_loss_clip": 0.01106944, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.03641677, + "balance_loss_mlp": 1.02379322, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 2.095543778099626, + "language_loss": 0.74281788, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76427835, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 2.7149810791015625 + }, + { + "auxiliary_loss_clip": 0.01085245, + "auxiliary_loss_mlp": 0.01040661, + "balance_loss_clip": 1.03467155, + "balance_loss_mlp": 1.02556896, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.595458005515603, + "language_loss": 0.76395559, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78521466, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 2.604414463043213 + }, + { + "auxiliary_loss_clip": 0.01085513, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.03394032, + "balance_loss_mlp": 1.02035069, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.213099089110519, + "language_loss": 0.66314405, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68434346, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.5882644653320312 + }, + { + "auxiliary_loss_clip": 0.01109124, + "auxiliary_loss_mlp": 0.01038812, + "balance_loss_clip": 1.040025, + "balance_loss_mlp": 1.02491212, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 1.7189792253973306, + "language_loss": 0.74409932, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.76557869, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.5910706520080566 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01044095, + "balance_loss_clip": 1.03926873, + "balance_loss_mlp": 1.02801371, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.16557440714472, + "language_loss": 0.72081685, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74237585, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.4982564449310303 + }, + { + "auxiliary_loss_clip": 0.01074498, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.03675544, + "balance_loss_mlp": 1.02162123, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 1.8296620733999902, + "language_loss": 0.84837043, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.86949289, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 2.665241003036499 + }, + { + "auxiliary_loss_clip": 0.01060486, + "auxiliary_loss_mlp": 0.01042374, + "balance_loss_clip": 1.03255737, + "balance_loss_mlp": 1.02591121, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.7261001653615515, + "language_loss": 0.78597152, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80700016, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 2.6458022594451904 + }, + { + "auxiliary_loss_clip": 0.01108045, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.03876042, + "balance_loss_mlp": 1.01961744, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 1.9676396301132695, + "language_loss": 0.67174929, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69318008, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 4.034724950790405 + }, + { + "auxiliary_loss_clip": 0.01059048, + "auxiliary_loss_mlp": 0.01043625, + "balance_loss_clip": 1.0352006, + "balance_loss_mlp": 1.02865243, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 1.8924434543905395, + "language_loss": 0.8245616, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84558833, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.7285027503967285 + }, + { + "auxiliary_loss_clip": 0.01091581, + "auxiliary_loss_mlp": 0.00750098, + "balance_loss_clip": 1.03772306, + "balance_loss_mlp": 1.00060511, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.7295456510437137, + "language_loss": 0.74065077, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.75906754, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 2.6111161708831787 + }, + { + "auxiliary_loss_clip": 0.01106997, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.03870106, + "balance_loss_mlp": 1.01841617, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 1.9336163085361606, + "language_loss": 0.76279914, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78419757, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 4.031680345535278 + }, + { + "auxiliary_loss_clip": 0.01105554, + "auxiliary_loss_mlp": 0.01039221, + "balance_loss_clip": 1.04312444, + "balance_loss_mlp": 1.02344942, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 2.1882625913725087, + "language_loss": 0.78801489, + "learning_rate": 3.232441120452094e-06, + "loss": 0.80946273, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 2.5810415744781494 + }, + { + "auxiliary_loss_clip": 0.01105256, + "auxiliary_loss_mlp": 0.01038587, + "balance_loss_clip": 1.03906107, + "balance_loss_mlp": 1.02212441, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.014745808151471, + "language_loss": 0.75360799, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77504641, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.5291531085968018 + }, + { + "auxiliary_loss_clip": 0.01078392, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.03600144, + "balance_loss_mlp": 1.02548456, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.5882376451642717, + "language_loss": 0.69220901, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71338952, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.675855875015259 + }, + { + "auxiliary_loss_clip": 0.01070021, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.03483176, + "balance_loss_mlp": 1.02427936, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 2.074029519137235, + "language_loss": 0.84781307, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.8688854, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 4.146032094955444 + }, + { + "auxiliary_loss_clip": 0.01092507, + "auxiliary_loss_mlp": 0.01037415, + "balance_loss_clip": 1.03616071, + "balance_loss_mlp": 1.02293062, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.7334905091184258, + "language_loss": 0.8469255, + "learning_rate": 3.231213827702462e-06, + "loss": 0.86822474, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 3.980153799057007 + }, + { + "auxiliary_loss_clip": 0.01105445, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.03805327, + "balance_loss_mlp": 1.02354598, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 1.8886517556422027, + "language_loss": 0.75586784, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77729571, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.5167672634124756 + }, + { + "auxiliary_loss_clip": 0.01107004, + "auxiliary_loss_mlp": 0.01041242, + "balance_loss_clip": 1.03736448, + "balance_loss_mlp": 1.02655542, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.846262684776845, + "language_loss": 0.8172884, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83877087, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.4713616371154785 + }, + { + "auxiliary_loss_clip": 0.01104133, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.03827393, + "balance_loss_mlp": 1.02128029, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.5472860056039721, + "language_loss": 0.82928556, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85066903, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.451331853866577 + }, + { + "auxiliary_loss_clip": 0.01121134, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.04131866, + "balance_loss_mlp": 1.0259223, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.7131550773106872, + "language_loss": 0.75972283, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78134072, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 2.486128568649292 + }, + { + "auxiliary_loss_clip": 0.01075302, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.03979683, + "balance_loss_mlp": 1.02539134, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.897981792108335, + "language_loss": 0.74634808, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76750648, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 2.622610092163086 + }, + { + "auxiliary_loss_clip": 0.01077539, + "auxiliary_loss_mlp": 0.01049232, + "balance_loss_clip": 1.03936207, + "balance_loss_mlp": 1.03274548, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.4982398628533025, + "language_loss": 0.7623024, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78357005, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 2.5775625705718994 + }, + { + "auxiliary_loss_clip": 0.01095442, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.03753924, + "balance_loss_mlp": 1.02293277, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.1309339088049533, + "language_loss": 0.73356414, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75490189, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 2.516789197921753 + }, + { + "auxiliary_loss_clip": 0.01003196, + "auxiliary_loss_mlp": 0.01009837, + "balance_loss_clip": 1.02552295, + "balance_loss_mlp": 1.00818002, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7156021075052399, + "language_loss": 0.52991503, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55004537, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 3.2414920330047607 + }, + { + "auxiliary_loss_clip": 0.01111529, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.04164076, + "balance_loss_mlp": 1.024544, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.6227121648443146, + "language_loss": 0.78735209, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.8088702, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.5090978145599365 + }, + { + "auxiliary_loss_clip": 0.010941, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.03614151, + "balance_loss_mlp": 1.02387881, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.5432041950030115, + "language_loss": 0.63880074, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66012275, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.7094411849975586 + }, + { + "auxiliary_loss_clip": 0.01077624, + "auxiliary_loss_mlp": 0.00750381, + "balance_loss_clip": 1.03742301, + "balance_loss_mlp": 1.00061953, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.284795452862813, + "language_loss": 0.77769107, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79597116, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.6875221729278564 + }, + { + "auxiliary_loss_clip": 0.01074649, + "auxiliary_loss_mlp": 0.01044506, + "balance_loss_clip": 1.03816366, + "balance_loss_mlp": 1.02884161, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.7846321724328016, + "language_loss": 0.83793771, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85912925, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 2.6428723335266113 + }, + { + "auxiliary_loss_clip": 0.01066538, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_clip": 1.03674865, + "balance_loss_mlp": 1.02817118, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 2.0160883087583787, + "language_loss": 0.84109235, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86219943, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.6590986251831055 + }, + { + "auxiliary_loss_clip": 0.01116334, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.04076171, + "balance_loss_mlp": 1.02054119, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 2.2734282558190246, + "language_loss": 0.83335632, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85486352, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.557447671890259 + }, + { + "auxiliary_loss_clip": 0.01095748, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.03823471, + "balance_loss_mlp": 1.02451253, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.690011533782354, + "language_loss": 0.85299385, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87434775, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 2.570822238922119 + }, + { + "auxiliary_loss_clip": 0.01055863, + "auxiliary_loss_mlp": 0.01040105, + "balance_loss_clip": 1.03484488, + "balance_loss_mlp": 1.02383304, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 2.188589834021587, + "language_loss": 0.832219, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85317862, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 2.70076584815979 + }, + { + "auxiliary_loss_clip": 0.01097324, + "auxiliary_loss_mlp": 0.01039341, + "balance_loss_clip": 1.03386772, + "balance_loss_mlp": 1.02410603, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 6.509402019033274, + "language_loss": 0.80343062, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.82479727, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.5642523765563965 + }, + { + "auxiliary_loss_clip": 0.01101617, + "auxiliary_loss_mlp": 0.00750205, + "balance_loss_clip": 1.03865337, + "balance_loss_mlp": 1.00058854, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.8976528137867776, + "language_loss": 0.80731523, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.8258335, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 2.5467257499694824 + }, + { + "auxiliary_loss_clip": 0.01091646, + "auxiliary_loss_mlp": 0.01041842, + "balance_loss_clip": 1.03899837, + "balance_loss_mlp": 1.02764428, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.7833570204458082, + "language_loss": 0.81476462, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83609945, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 2.6560633182525635 + }, + { + "auxiliary_loss_clip": 0.01083145, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.03753328, + "balance_loss_mlp": 1.02933955, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 3.544204283308976, + "language_loss": 0.7856003, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80687892, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 2.582104444503784 + }, + { + "auxiliary_loss_clip": 0.0108531, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.04359794, + "balance_loss_mlp": 1.01679242, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.773045501328602, + "language_loss": 0.83458519, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85575533, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.663379669189453 + }, + { + "auxiliary_loss_clip": 0.01083937, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.03748035, + "balance_loss_mlp": 1.02489185, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.6038083666181306, + "language_loss": 0.74056458, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76178706, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 2.707335948944092 + }, + { + "auxiliary_loss_clip": 0.01066543, + "auxiliary_loss_mlp": 0.00750209, + "balance_loss_clip": 1.03589666, + "balance_loss_mlp": 1.00067258, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.922850130240081, + "language_loss": 0.70674086, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72490835, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 2.7706081867218018 + }, + { + "auxiliary_loss_clip": 0.01012628, + "auxiliary_loss_mlp": 0.01008533, + "balance_loss_clip": 1.01973331, + "balance_loss_mlp": 1.00706065, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.943126134919592, + "language_loss": 0.59639329, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61660492, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 3.209320306777954 + }, + { + "auxiliary_loss_clip": 0.01086817, + "auxiliary_loss_mlp": 0.01041619, + "balance_loss_clip": 1.03473651, + "balance_loss_mlp": 1.027493, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.340277374235957, + "language_loss": 0.69813693, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71942133, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.6499531269073486 + }, + { + "auxiliary_loss_clip": 0.01098283, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.04459333, + "balance_loss_mlp": 1.03082001, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.8360353034313537, + "language_loss": 0.63226569, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.65370375, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 2.6643195152282715 + }, + { + "auxiliary_loss_clip": 0.01099336, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.03935826, + "balance_loss_mlp": 1.02707052, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.1597226160947427, + "language_loss": 0.86348259, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.8849048, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.599395275115967 + }, + { + "auxiliary_loss_clip": 0.01118418, + "auxiliary_loss_mlp": 0.00750121, + "balance_loss_clip": 1.03958607, + "balance_loss_mlp": 1.00064123, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.5484254264441921, + "language_loss": 0.63104868, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.64973408, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.670624017715454 + }, + { + "auxiliary_loss_clip": 0.01083628, + "auxiliary_loss_mlp": 0.01040958, + "balance_loss_clip": 1.04209185, + "balance_loss_mlp": 1.02616394, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.1599604041000906, + "language_loss": 0.83010972, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85135555, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 2.595287561416626 + }, + { + "auxiliary_loss_clip": 0.01000888, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.02731478, + "balance_loss_mlp": 1.02093434, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.9747393769649695, + "language_loss": 0.79080516, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81117815, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 2.916283369064331 + }, + { + "auxiliary_loss_clip": 0.01055507, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_clip": 1.03333926, + "balance_loss_mlp": 1.03615546, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.4335991194569022, + "language_loss": 0.75008285, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77117974, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 3.2704074382781982 + }, + { + "auxiliary_loss_clip": 0.01033665, + "auxiliary_loss_mlp": 0.00747218, + "balance_loss_clip": 1.0182476, + "balance_loss_mlp": 1.00009465, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8373376191684468, + "language_loss": 0.63978469, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65759349, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 4.951761484146118 + }, + { + "auxiliary_loss_clip": 0.01106366, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.03769541, + "balance_loss_mlp": 1.02133155, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 2.0699290423041727, + "language_loss": 0.80082786, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82226604, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 2.5850138664245605 + }, + { + "auxiliary_loss_clip": 0.01086931, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.03486848, + "balance_loss_mlp": 1.02584696, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.8831271309268274, + "language_loss": 0.72338355, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74469024, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 2.6355738639831543 + }, + { + "auxiliary_loss_clip": 0.01118628, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.04036832, + "balance_loss_mlp": 1.02424526, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.405551966041261, + "language_loss": 0.76211768, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78368342, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 4.083196401596069 + }, + { + "auxiliary_loss_clip": 0.01118899, + "auxiliary_loss_mlp": 0.01035565, + "balance_loss_clip": 1.03949404, + "balance_loss_mlp": 1.02083087, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.419807883954067, + "language_loss": 0.77824008, + "learning_rate": 3.220134667280476e-06, + "loss": 0.79978472, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.5984983444213867 + }, + { + "auxiliary_loss_clip": 0.01017019, + "auxiliary_loss_mlp": 0.00747068, + "balance_loss_clip": 1.01302898, + "balance_loss_mlp": 1.00004387, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7629296939245569, + "language_loss": 0.54725438, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56489527, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.162567377090454 + }, + { + "auxiliary_loss_clip": 0.01116517, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.0404985, + "balance_loss_mlp": 1.01855946, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.659000250222548, + "language_loss": 0.6649186, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68641341, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.5409669876098633 + }, + { + "auxiliary_loss_clip": 0.01072509, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.03437948, + "balance_loss_mlp": 1.0216769, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.182939752964847, + "language_loss": 0.69217521, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71327877, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 4.120787143707275 + }, + { + "auxiliary_loss_clip": 0.0110743, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.03914022, + "balance_loss_mlp": 1.02775383, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.7706406782462074, + "language_loss": 0.78598523, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.80748856, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 4.245525121688843 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.04110527, + "balance_loss_mlp": 1.01662207, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 1.960508510043382, + "language_loss": 0.84013951, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86152142, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.6050429344177246 + }, + { + "auxiliary_loss_clip": 0.01120152, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.041713, + "balance_loss_mlp": 1.02195334, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 1.9428086088632186, + "language_loss": 0.69113481, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71270674, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 2.5324783325195312 + }, + { + "auxiliary_loss_clip": 0.01117563, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.0396117, + "balance_loss_mlp": 1.02489424, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.8109996115421922, + "language_loss": 0.83795321, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.85951161, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 2.596914291381836 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.03694534, + "balance_loss_mlp": 1.01918721, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.530985112864727, + "language_loss": 0.60803258, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62899053, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 2.73396897315979 + }, + { + "auxiliary_loss_clip": 0.01078894, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.03506994, + "balance_loss_mlp": 1.01690197, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.9302244759741036, + "language_loss": 0.65950429, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68059433, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.619584560394287 + }, + { + "auxiliary_loss_clip": 0.01108676, + "auxiliary_loss_mlp": 0.01046002, + "balance_loss_clip": 1.04054034, + "balance_loss_mlp": 1.03044546, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.5118120963211465, + "language_loss": 0.76761085, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78915763, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.654587984085083 + }, + { + "auxiliary_loss_clip": 0.01114407, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.03780186, + "balance_loss_mlp": 1.01913548, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 2.9054847417593703, + "language_loss": 0.83318377, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85465842, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 2.537860155105591 + }, + { + "auxiliary_loss_clip": 0.01100286, + "auxiliary_loss_mlp": 0.01038689, + "balance_loss_clip": 1.03726399, + "balance_loss_mlp": 1.02610064, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 2.6579218362512727, + "language_loss": 0.71100092, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73239064, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.6118741035461426 + }, + { + "auxiliary_loss_clip": 0.01096996, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_clip": 1.04076147, + "balance_loss_mlp": 1.0247395, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.7618055849876895, + "language_loss": 0.7438904, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76525283, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.6317758560180664 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.037714, + "balance_loss_mlp": 1.01948237, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.6879496767995947, + "language_loss": 0.77473462, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79622281, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.5967822074890137 + }, + { + "auxiliary_loss_clip": 0.01101573, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.0386591, + "balance_loss_mlp": 1.01963282, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.7830177101413383, + "language_loss": 0.79052997, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81187522, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 2.6070504188537598 + }, + { + "auxiliary_loss_clip": 0.01104189, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.03937685, + "balance_loss_mlp": 1.02100587, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.7326161230412798, + "language_loss": 0.78995734, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81134188, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.6304290294647217 + }, + { + "auxiliary_loss_clip": 0.01099342, + "auxiliary_loss_mlp": 0.01046618, + "balance_loss_clip": 1.04085815, + "balance_loss_mlp": 1.03196096, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 3.0555635423486955, + "language_loss": 0.7096715, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73113114, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 2.7160353660583496 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.04082143, + "balance_loss_mlp": 1.02293527, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 1.891249113060944, + "language_loss": 0.77824473, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79968905, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 2.6088480949401855 + }, + { + "auxiliary_loss_clip": 0.01076629, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.04297543, + "balance_loss_mlp": 1.01786304, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.668235901781687, + "language_loss": 0.82590783, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84698594, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.7930428981781006 + }, + { + "auxiliary_loss_clip": 0.01108014, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.04106081, + "balance_loss_mlp": 1.01884127, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.185393896319885, + "language_loss": 0.79714084, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81855059, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.5598435401916504 + }, + { + "auxiliary_loss_clip": 0.01092619, + "auxiliary_loss_mlp": 0.01044114, + "balance_loss_clip": 1.03729522, + "balance_loss_mlp": 1.02812755, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 1.963378170058887, + "language_loss": 0.68404794, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70541525, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.6935720443725586 + }, + { + "auxiliary_loss_clip": 0.01094832, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.03897071, + "balance_loss_mlp": 1.01934004, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.5916159505054241, + "language_loss": 0.80787766, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82915455, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 2.6211037635803223 + }, + { + "auxiliary_loss_clip": 0.0111407, + "auxiliary_loss_mlp": 0.01036083, + "balance_loss_clip": 1.03706896, + "balance_loss_mlp": 1.02173054, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.356449855779687, + "language_loss": 0.69414127, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.71564281, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 2.5008959770202637 + }, + { + "auxiliary_loss_clip": 0.0109253, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.03713512, + "balance_loss_mlp": 1.02377152, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 2.138132172595573, + "language_loss": 0.79354417, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81484032, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 2.584683656692505 + }, + { + "auxiliary_loss_clip": 0.01105784, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.03906107, + "balance_loss_mlp": 1.02019715, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.917901337381045, + "language_loss": 0.72931623, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75070852, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.5760741233825684 + }, + { + "auxiliary_loss_clip": 0.01078007, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.03347826, + "balance_loss_mlp": 1.02046371, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.942770822182084, + "language_loss": 0.82071817, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84183651, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 2.5813987255096436 + }, + { + "auxiliary_loss_clip": 0.01107007, + "auxiliary_loss_mlp": 0.01037297, + "balance_loss_clip": 1.03835821, + "balance_loss_mlp": 1.02190137, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.910319369816248, + "language_loss": 0.70203888, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72348201, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.5377955436706543 + }, + { + "auxiliary_loss_clip": 0.01093021, + "auxiliary_loss_mlp": 0.00749936, + "balance_loss_clip": 1.03350353, + "balance_loss_mlp": 1.00056696, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.498378457222419, + "language_loss": 0.80223924, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82066882, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 2.6049914360046387 + }, + { + "auxiliary_loss_clip": 0.01113655, + "auxiliary_loss_mlp": 0.01039921, + "balance_loss_clip": 1.04233944, + "balance_loss_mlp": 1.02425086, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 1.8908878075740592, + "language_loss": 0.57297182, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.59450758, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 2.7524590492248535 + }, + { + "auxiliary_loss_clip": 0.01063163, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.03309309, + "balance_loss_mlp": 1.01707006, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.7374010638993453, + "language_loss": 0.81702405, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83795917, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 2.666807174682617 + }, + { + "auxiliary_loss_clip": 0.01098505, + "auxiliary_loss_mlp": 0.01053091, + "balance_loss_clip": 1.03759813, + "balance_loss_mlp": 1.03654516, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 2.336631241205363, + "language_loss": 0.74262387, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76413989, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 2.566920518875122 + }, + { + "auxiliary_loss_clip": 0.01103496, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.04080021, + "balance_loss_mlp": 1.02189744, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 2.2914497534572167, + "language_loss": 0.67618585, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.6975801, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 2.636162281036377 + }, + { + "auxiliary_loss_clip": 0.01094841, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.03809118, + "balance_loss_mlp": 1.02674127, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 2.3088340700517818, + "language_loss": 0.79921597, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82056808, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 2.6501283645629883 + }, + { + "auxiliary_loss_clip": 0.01092136, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.03809309, + "balance_loss_mlp": 1.02236366, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.757616408386562, + "language_loss": 0.70127475, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72256488, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 2.6337430477142334 + }, + { + "auxiliary_loss_clip": 0.01071059, + "auxiliary_loss_mlp": 0.01048825, + "balance_loss_clip": 1.03371334, + "balance_loss_mlp": 1.03219509, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.616655181489138, + "language_loss": 0.79544795, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81664681, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 4.2265465259552 + }, + { + "auxiliary_loss_clip": 0.01088148, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.03905833, + "balance_loss_mlp": 1.02032995, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.1260965350532093, + "language_loss": 0.85015869, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87139446, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.594583511352539 + }, + { + "auxiliary_loss_clip": 0.01055338, + "auxiliary_loss_mlp": 0.01061243, + "balance_loss_clip": 1.03158283, + "balance_loss_mlp": 1.04550755, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.6047238371589658, + "language_loss": 0.79856884, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.81973469, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 2.7158195972442627 + }, + { + "auxiliary_loss_clip": 0.01071872, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.03777075, + "balance_loss_mlp": 1.02046657, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 2.091474741097146, + "language_loss": 0.70753574, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.7286005, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 2.9374701976776123 + }, + { + "auxiliary_loss_clip": 0.01083172, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.04051244, + "balance_loss_mlp": 1.02103186, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8594719477009782, + "language_loss": 0.72403932, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74522579, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 4.2520904541015625 + }, + { + "auxiliary_loss_clip": 0.01105733, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.03805459, + "balance_loss_mlp": 1.01987195, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 1.7650529086408366, + "language_loss": 0.78956944, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81096745, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.5872318744659424 + }, + { + "auxiliary_loss_clip": 0.01120417, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.04029799, + "balance_loss_mlp": 1.02133203, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.5599388510073302, + "language_loss": 0.75847721, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78004539, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.6643736362457275 + }, + { + "auxiliary_loss_clip": 0.01113358, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.03923678, + "balance_loss_mlp": 1.02094734, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 1.9016412625640389, + "language_loss": 0.7995553, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82102817, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 2.522639751434326 + }, + { + "auxiliary_loss_clip": 0.0102623, + "auxiliary_loss_mlp": 0.01005062, + "balance_loss_clip": 1.01041651, + "balance_loss_mlp": 1.00333333, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8430847134352497, + "language_loss": 0.67939532, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69970822, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 3.1064391136169434 + }, + { + "auxiliary_loss_clip": 0.01101131, + "auxiliary_loss_mlp": 0.0104208, + "balance_loss_clip": 1.04044318, + "balance_loss_mlp": 1.02558136, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.459318713212475, + "language_loss": 0.82648456, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84791672, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 5.688628196716309 + }, + { + "auxiliary_loss_clip": 0.01092059, + "auxiliary_loss_mlp": 0.00749978, + "balance_loss_clip": 1.04017925, + "balance_loss_mlp": 1.00051785, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 1.5824562753965117, + "language_loss": 0.81158876, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83000916, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 2.6988561153411865 + }, + { + "auxiliary_loss_clip": 0.01118315, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.04318929, + "balance_loss_mlp": 1.02093554, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.8672358676653753, + "language_loss": 0.7399565, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76148152, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.5490341186523438 + }, + { + "auxiliary_loss_clip": 0.01080192, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.03503847, + "balance_loss_mlp": 1.02170861, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.8065468681046781, + "language_loss": 0.74020445, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.76137829, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.7555153369903564 + }, + { + "auxiliary_loss_clip": 0.01105458, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.03923094, + "balance_loss_mlp": 1.01739609, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 1.7987553585301115, + "language_loss": 0.64258516, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66395396, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.558239221572876 + }, + { + "auxiliary_loss_clip": 0.01073938, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.04030657, + "balance_loss_mlp": 1.02176654, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.6702479155779875, + "language_loss": 0.91113186, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93222666, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 2.709080219268799 + }, + { + "auxiliary_loss_clip": 0.01109463, + "auxiliary_loss_mlp": 0.01042009, + "balance_loss_clip": 1.04138243, + "balance_loss_mlp": 1.0270009, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 1.876092068341454, + "language_loss": 0.7504586, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77197337, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 2.654090166091919 + }, + { + "auxiliary_loss_clip": 0.01118594, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.040344, + "balance_loss_mlp": 1.02279758, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.8960010652610682, + "language_loss": 0.61507654, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63663, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.729088544845581 + }, + { + "auxiliary_loss_clip": 0.0110888, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.04046226, + "balance_loss_mlp": 1.02372372, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.242941394554092, + "language_loss": 0.82184494, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.8433131, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.577641248703003 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_clip": 1.04102063, + "balance_loss_mlp": 1.02822495, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8120006651757188, + "language_loss": 0.84459573, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.86603248, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.6128029823303223 + }, + { + "auxiliary_loss_clip": 0.01084884, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.0403024, + "balance_loss_mlp": 1.02040601, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.6021580034752525, + "language_loss": 0.85353529, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87473965, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 2.774150848388672 + }, + { + "auxiliary_loss_clip": 0.01092725, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.0397172, + "balance_loss_mlp": 1.02270365, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 2.220292272798786, + "language_loss": 0.68323785, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70454031, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.6364152431488037 + }, + { + "auxiliary_loss_clip": 0.01118601, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.04225755, + "balance_loss_mlp": 1.02135015, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.6714398160075963, + "language_loss": 0.78489238, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80643451, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 2.5959115028381348 + }, + { + "auxiliary_loss_clip": 0.01100215, + "auxiliary_loss_mlp": 0.0104699, + "balance_loss_clip": 1.040452, + "balance_loss_mlp": 1.03171301, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.4977047116448479, + "language_loss": 0.74085486, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76232696, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 2.705378532409668 + }, + { + "auxiliary_loss_clip": 0.01096131, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.03931499, + "balance_loss_mlp": 1.01977992, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 2.0003572408380093, + "language_loss": 0.7365303, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75783885, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.71227765083313 + }, + { + "auxiliary_loss_clip": 0.01108273, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.04083753, + "balance_loss_mlp": 1.02141619, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.734032223928161, + "language_loss": 0.7745679, + "learning_rate": 3.201847741843128e-06, + "loss": 0.79600286, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.5940845012664795 + }, + { + "auxiliary_loss_clip": 0.01096345, + "auxiliary_loss_mlp": 0.01035695, + "balance_loss_clip": 1.04019678, + "balance_loss_mlp": 1.02028108, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 1.9827720844234948, + "language_loss": 0.7788583, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80017865, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.6444523334503174 + }, + { + "auxiliary_loss_clip": 0.01078199, + "auxiliary_loss_mlp": 0.01039488, + "balance_loss_clip": 1.04041171, + "balance_loss_mlp": 1.02681053, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.6219486486311825, + "language_loss": 0.71378076, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.7349577, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 2.654893159866333 + }, + { + "auxiliary_loss_clip": 0.01109352, + "auxiliary_loss_mlp": 0.0103829, + "balance_loss_clip": 1.04092741, + "balance_loss_mlp": 1.02292943, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 4.044723411032421, + "language_loss": 0.76511246, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.78658891, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 2.5738866329193115 + }, + { + "auxiliary_loss_clip": 0.01084802, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.03647149, + "balance_loss_mlp": 1.02677679, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 1.895733836088372, + "language_loss": 0.72522599, + "learning_rate": 3.200602180731467e-06, + "loss": 0.74648809, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.5953588485717773 + }, + { + "auxiliary_loss_clip": 0.01088028, + "auxiliary_loss_mlp": 0.00749964, + "balance_loss_clip": 1.03785849, + "balance_loss_mlp": 1.00042105, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.8473137638819876, + "language_loss": 0.66661078, + "learning_rate": 3.20029067660664e-06, + "loss": 0.6849907, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.653411388397217 + }, + { + "auxiliary_loss_clip": 0.01102721, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.0350939, + "balance_loss_mlp": 1.01668262, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.8466692970192506, + "language_loss": 0.72331393, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74464357, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 2.6192126274108887 + }, + { + "auxiliary_loss_clip": 0.01027516, + "auxiliary_loss_mlp": 0.01001213, + "balance_loss_clip": 1.01200843, + "balance_loss_mlp": 0.99930537, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7412314171001816, + "language_loss": 0.50676012, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.5270474, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.1103456020355225 + }, + { + "auxiliary_loss_clip": 0.01106817, + "auxiliary_loss_mlp": 0.01038996, + "balance_loss_clip": 1.04156494, + "balance_loss_mlp": 1.0251441, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4916321000961483, + "language_loss": 0.85076982, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87222797, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 2.617553472518921 + }, + { + "auxiliary_loss_clip": 0.01086777, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_clip": 1.03599024, + "balance_loss_mlp": 1.02702916, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.5364946415434253, + "language_loss": 0.82001865, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.84129059, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 2.6358702182769775 + }, + { + "auxiliary_loss_clip": 0.01090604, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.0367837, + "balance_loss_mlp": 1.02219868, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 2.2385439945061405, + "language_loss": 0.79613322, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81740689, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 2.57563853263855 + }, + { + "auxiliary_loss_clip": 0.01108654, + "auxiliary_loss_mlp": 0.01039578, + "balance_loss_clip": 1.04013133, + "balance_loss_mlp": 1.02473617, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.7793812139629446, + "language_loss": 0.74779761, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.76927996, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 2.616335868835449 + }, + { + "auxiliary_loss_clip": 0.01081099, + "auxiliary_loss_mlp": 0.01040025, + "balance_loss_clip": 1.03550684, + "balance_loss_mlp": 1.02563024, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 1.9724864016162391, + "language_loss": 0.79492533, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.8161366, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 2.654337167739868 + }, + { + "auxiliary_loss_clip": 0.01017115, + "auxiliary_loss_mlp": 0.01006989, + "balance_loss_clip": 1.01169908, + "balance_loss_mlp": 1.00516534, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7577760535809162, + "language_loss": 0.57873613, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59897721, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 3.15690279006958 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.03975677, + "balance_loss_mlp": 1.0208267, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.4085377055942065, + "language_loss": 0.72960353, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75113833, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 2.6723928451538086 + }, + { + "auxiliary_loss_clip": 0.01081173, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_clip": 1.03596842, + "balance_loss_mlp": 1.02725542, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 1.8891948105578058, + "language_loss": 0.79905832, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82028425, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 2.707451105117798 + }, + { + "auxiliary_loss_clip": 0.01120911, + "auxiliary_loss_mlp": 0.01042124, + "balance_loss_clip": 1.03990531, + "balance_loss_mlp": 1.02638817, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 3.7649008641848, + "language_loss": 0.79448539, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81611574, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 4.0769054889678955 + }, + { + "auxiliary_loss_clip": 0.01119937, + "auxiliary_loss_mlp": 0.0103728, + "balance_loss_clip": 1.04024029, + "balance_loss_mlp": 1.02160382, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.8378498210236296, + "language_loss": 0.73172259, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75329477, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.6060187816619873 + }, + { + "auxiliary_loss_clip": 0.01095686, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.03612471, + "balance_loss_mlp": 1.02590346, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 2.010242998556614, + "language_loss": 0.69036078, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71174294, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 2.8196375370025635 + }, + { + "auxiliary_loss_clip": 0.01105779, + "auxiliary_loss_mlp": 0.00749882, + "balance_loss_clip": 1.03944898, + "balance_loss_mlp": 1.00049353, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.6437467873670983, + "language_loss": 0.67762673, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69618338, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 2.5859766006469727 + }, + { + "auxiliary_loss_clip": 0.01070069, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_clip": 1.03508449, + "balance_loss_mlp": 1.02885151, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.5857558200466675, + "language_loss": 0.80603862, + "learning_rate": 3.195612659536081e-06, + "loss": 0.8271805, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.7207040786743164 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.03749204, + "balance_loss_mlp": 1.02583528, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.6570662103677587, + "language_loss": 0.73151171, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.75299048, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 4.076708793640137 + }, + { + "auxiliary_loss_clip": 0.01098111, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04283977, + "balance_loss_mlp": 1.01787841, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.5335468563805212, + "language_loss": 0.7787869, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80008745, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 2.7530486583709717 + }, + { + "auxiliary_loss_clip": 0.01086816, + "auxiliary_loss_mlp": 0.01040104, + "balance_loss_clip": 1.03646755, + "balance_loss_mlp": 1.02257991, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.6821078369199325, + "language_loss": 0.78633332, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.80760258, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.690981388092041 + }, + { + "auxiliary_loss_clip": 0.01023147, + "auxiliary_loss_mlp": 0.01003063, + "balance_loss_clip": 1.00899458, + "balance_loss_mlp": 1.00146568, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.871841909334883, + "language_loss": 0.62858856, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64885062, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 2.9120407104492188 + }, + { + "auxiliary_loss_clip": 0.01122496, + "auxiliary_loss_mlp": 0.01042796, + "balance_loss_clip": 1.04043365, + "balance_loss_mlp": 1.02682197, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.8773407992190947, + "language_loss": 0.80880976, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83046257, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 5.634445667266846 + }, + { + "auxiliary_loss_clip": 0.01085079, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.03973663, + "balance_loss_mlp": 1.03032303, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.5791490405387503, + "language_loss": 0.78225052, + "learning_rate": 3.19373859419346e-06, + "loss": 0.8035444, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.7341439723968506 + }, + { + "auxiliary_loss_clip": 0.01094704, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.0376935, + "balance_loss_mlp": 1.02260435, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.5300844602497345, + "language_loss": 0.78144163, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80276692, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.676246404647827 + }, + { + "auxiliary_loss_clip": 0.01093596, + "auxiliary_loss_mlp": 0.01043968, + "balance_loss_clip": 1.03857923, + "balance_loss_mlp": 1.02717185, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 3.5265831109765107, + "language_loss": 0.67555559, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69693124, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.6249406337738037 + }, + { + "auxiliary_loss_clip": 0.01021819, + "auxiliary_loss_mlp": 0.01002685, + "balance_loss_clip": 1.01009846, + "balance_loss_mlp": 1.00109982, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.756643556949408, + "language_loss": 0.52781081, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54805589, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 3.0744802951812744 + }, + { + "auxiliary_loss_clip": 0.01101121, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.04261291, + "balance_loss_mlp": 1.02125263, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.8369850552182752, + "language_loss": 0.70533693, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72670078, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 2.6021063327789307 + }, + { + "auxiliary_loss_clip": 0.0103352, + "auxiliary_loss_mlp": 0.01002879, + "balance_loss_clip": 1.00855231, + "balance_loss_mlp": 1.00118625, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8132440777935883, + "language_loss": 0.60473293, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62509692, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 3.1727750301361084 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01044326, + "balance_loss_clip": 1.03998435, + "balance_loss_mlp": 1.02850676, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.7029479774380702, + "language_loss": 0.72047353, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74212849, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.6875576972961426 + }, + { + "auxiliary_loss_clip": 0.01108257, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.03860974, + "balance_loss_mlp": 1.03078246, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.8246261285854832, + "language_loss": 0.75605822, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77761042, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.6085572242736816 + }, + { + "auxiliary_loss_clip": 0.01105074, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.03804159, + "balance_loss_mlp": 1.01694965, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 1.6376512816390594, + "language_loss": 0.87545323, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89680248, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.632941246032715 + }, + { + "auxiliary_loss_clip": 0.01102309, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.04034102, + "balance_loss_mlp": 1.02009058, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.6925317871949042, + "language_loss": 0.68037254, + "learning_rate": 3.190924441478572e-06, + "loss": 0.70173311, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 2.5735840797424316 + }, + { + "auxiliary_loss_clip": 0.01097114, + "auxiliary_loss_mlp": 0.01043366, + "balance_loss_clip": 1.03831661, + "balance_loss_mlp": 1.02797556, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.833107642541439, + "language_loss": 0.79719818, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.81860304, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 2.8002631664276123 + }, + { + "auxiliary_loss_clip": 0.01083533, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.04284465, + "balance_loss_mlp": 1.02053142, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.078197823574751, + "language_loss": 0.79911929, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82031488, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 2.707913875579834 + }, + { + "auxiliary_loss_clip": 0.01098914, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.03655934, + "balance_loss_mlp": 1.02077866, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.7864301142848913, + "language_loss": 0.74760348, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.76892996, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.577796697616577 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.03915679, + "balance_loss_mlp": 1.02601838, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.8563887751563957, + "language_loss": 0.74046063, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76188618, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.620502471923828 + }, + { + "auxiliary_loss_clip": 0.01119066, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.03924406, + "balance_loss_mlp": 1.01716924, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.9608482826022349, + "language_loss": 0.76116812, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78268284, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.5886952877044678 + }, + { + "auxiliary_loss_clip": 0.01081292, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.03731596, + "balance_loss_mlp": 1.02457881, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.8020673748622398, + "language_loss": 0.69432294, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71552777, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 2.6733181476593018 + }, + { + "auxiliary_loss_clip": 0.01093401, + "auxiliary_loss_mlp": 0.01038772, + "balance_loss_clip": 1.03869581, + "balance_loss_mlp": 1.02456856, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.6519631920079332, + "language_loss": 0.77714741, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79846913, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 2.636108636856079 + }, + { + "auxiliary_loss_clip": 0.01085314, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.03958893, + "balance_loss_mlp": 1.01560521, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.7532690270264646, + "language_loss": 0.79692936, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81808847, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.6842737197875977 + }, + { + "auxiliary_loss_clip": 0.01103148, + "auxiliary_loss_mlp": 0.01042209, + "balance_loss_clip": 1.04148018, + "balance_loss_mlp": 1.02802861, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 1.8792952144682387, + "language_loss": 0.7430433, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76449692, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 2.6158909797668457 + }, + { + "auxiliary_loss_clip": 0.01109074, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.04506755, + "balance_loss_mlp": 1.03098309, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 2.0275497070248543, + "language_loss": 0.78270316, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80425322, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.769190549850464 + }, + { + "auxiliary_loss_clip": 0.01093426, + "auxiliary_loss_mlp": 0.01043361, + "balance_loss_clip": 1.03643203, + "balance_loss_mlp": 1.02825761, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 2.06161710336872, + "language_loss": 0.8378005, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.85916835, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 2.626955032348633 + }, + { + "auxiliary_loss_clip": 0.01106779, + "auxiliary_loss_mlp": 0.01044685, + "balance_loss_clip": 1.04120326, + "balance_loss_mlp": 1.02968812, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.246892479347482, + "language_loss": 0.77170146, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79321611, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 2.561891555786133 + }, + { + "auxiliary_loss_clip": 0.0111138, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.03662062, + "balance_loss_mlp": 1.01915932, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.7568586931011083, + "language_loss": 0.79540598, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81685507, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 2.6331889629364014 + }, + { + "auxiliary_loss_clip": 0.01111989, + "auxiliary_loss_mlp": 0.01042897, + "balance_loss_clip": 1.04154193, + "balance_loss_mlp": 1.02739954, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.0387378798027163, + "language_loss": 0.73364699, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75519586, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 2.5775692462921143 + }, + { + "auxiliary_loss_clip": 0.01074491, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.03278446, + "balance_loss_mlp": 1.02529693, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 2.010943677430562, + "language_loss": 0.7208581, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74199712, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 2.607882499694824 + }, + { + "auxiliary_loss_clip": 0.01095716, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.03895497, + "balance_loss_mlp": 1.02200341, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.6774080724280782, + "language_loss": 0.63719773, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.65850747, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 2.5896105766296387 + }, + { + "auxiliary_loss_clip": 0.01088263, + "auxiliary_loss_mlp": 0.01045181, + "balance_loss_clip": 1.03756905, + "balance_loss_mlp": 1.03029156, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 1.9241016294694495, + "language_loss": 0.79671955, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81805396, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 2.6411550045013428 + }, + { + "auxiliary_loss_clip": 0.01086369, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.03556049, + "balance_loss_mlp": 1.02213669, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.7683140019204888, + "language_loss": 0.77401036, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.79524034, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 2.6083688735961914 + }, + { + "auxiliary_loss_clip": 0.01116044, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.04142928, + "balance_loss_mlp": 1.02838194, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 2.170449003741195, + "language_loss": 0.74112344, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76273143, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 2.492180824279785 + }, + { + "auxiliary_loss_clip": 0.01108072, + "auxiliary_loss_mlp": 0.01034544, + "balance_loss_clip": 1.03940296, + "balance_loss_mlp": 1.02062643, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.8020140406474336, + "language_loss": 0.82984459, + "learning_rate": 3.184657685014856e-06, + "loss": 0.85127074, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.58483624458313 + }, + { + "auxiliary_loss_clip": 0.01091332, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.03654039, + "balance_loss_mlp": 1.02303708, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.526837835852511, + "language_loss": 0.78342617, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80470026, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 4.1032538414001465 + }, + { + "auxiliary_loss_clip": 0.01077063, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.03454852, + "balance_loss_mlp": 1.02091455, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 1.8226217928568027, + "language_loss": 0.84272313, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86384726, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 2.7185287475585938 + }, + { + "auxiliary_loss_clip": 0.01065347, + "auxiliary_loss_mlp": 0.01045822, + "balance_loss_clip": 1.03218544, + "balance_loss_mlp": 1.02996755, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.566974410469504, + "language_loss": 0.79016602, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.81127775, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.6931514739990234 + }, + { + "auxiliary_loss_clip": 0.01104109, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.03777945, + "balance_loss_mlp": 1.01939881, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.3555544592285216, + "language_loss": 0.86360043, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88498014, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.563845157623291 + }, + { + "auxiliary_loss_clip": 0.0108485, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.03440547, + "balance_loss_mlp": 1.03189754, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7136119523729703, + "language_loss": 0.79707932, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.81841135, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.624114751815796 + }, + { + "auxiliary_loss_clip": 0.0107846, + "auxiliary_loss_mlp": 0.01053969, + "balance_loss_clip": 1.03582811, + "balance_loss_mlp": 1.03715491, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 1.70580438441749, + "language_loss": 0.67178226, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69310653, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 4.110800504684448 + }, + { + "auxiliary_loss_clip": 0.01102705, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.03622651, + "balance_loss_mlp": 1.02314889, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.4774422236585183, + "language_loss": 0.69244635, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.7138381, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 2.6747260093688965 + }, + { + "auxiliary_loss_clip": 0.01013588, + "auxiliary_loss_mlp": 0.01001973, + "balance_loss_clip": 1.00851965, + "balance_loss_mlp": 1.00030446, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7269247761860156, + "language_loss": 0.5305149, + "learning_rate": 3.182145945801628e-06, + "loss": 0.5506705, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 3.3336565494537354 + }, + { + "auxiliary_loss_clip": 0.01115092, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.03981256, + "balance_loss_mlp": 1.02254272, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.724540293350718, + "language_loss": 0.84070998, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86222476, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 4.507545709609985 + }, + { + "auxiliary_loss_clip": 0.01098145, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.03588569, + "balance_loss_mlp": 1.02776229, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.5236808895050034, + "language_loss": 0.63333303, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65473032, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 4.258737087249756 + }, + { + "auxiliary_loss_clip": 0.01088005, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.03552866, + "balance_loss_mlp": 1.02036202, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.8624492659003193, + "language_loss": 0.70657265, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72779822, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 2.616281509399414 + }, + { + "auxiliary_loss_clip": 0.01123469, + "auxiliary_loss_mlp": 0.0075035, + "balance_loss_clip": 1.04072142, + "balance_loss_mlp": 1.00057697, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 2.5128488200699732, + "language_loss": 0.86800641, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88674462, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.532846450805664 + }, + { + "auxiliary_loss_clip": 0.01088815, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.03488207, + "balance_loss_mlp": 1.01741791, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.5829747089652324, + "language_loss": 0.83458877, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85579497, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 2.6117234230041504 + }, + { + "auxiliary_loss_clip": 0.01099891, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.03647518, + "balance_loss_mlp": 1.0179013, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 3.810359335157594, + "language_loss": 0.78352565, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.80485815, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 2.5574264526367188 + }, + { + "auxiliary_loss_clip": 0.01091013, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.03724766, + "balance_loss_mlp": 1.01637721, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.683659070421787, + "language_loss": 0.79655421, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.81777894, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 2.6079328060150146 + }, + { + "auxiliary_loss_clip": 0.01106188, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.03839254, + "balance_loss_mlp": 1.02218366, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.5651193300321165, + "language_loss": 0.74762446, + "learning_rate": 3.179631337655037e-06, + "loss": 0.76904994, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.648921251296997 + }, + { + "auxiliary_loss_clip": 0.01073594, + "auxiliary_loss_mlp": 0.01038052, + "balance_loss_clip": 1.03698826, + "balance_loss_mlp": 1.02330029, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.7684986487275922, + "language_loss": 0.80907339, + "learning_rate": 3.179316810218701e-06, + "loss": 0.83018982, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.7091829776763916 + }, + { + "auxiliary_loss_clip": 0.01090113, + "auxiliary_loss_mlp": 0.01034469, + "balance_loss_clip": 1.03976989, + "balance_loss_mlp": 1.01960361, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.4171726370261373, + "language_loss": 0.77945787, + "learning_rate": 3.179002238062554e-06, + "loss": 0.8007037, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.7170050144195557 + }, + { + "auxiliary_loss_clip": 0.01065286, + "auxiliary_loss_mlp": 0.01040871, + "balance_loss_clip": 1.03675592, + "balance_loss_mlp": 1.02407455, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.6205409815869227, + "language_loss": 0.7407577, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76181936, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 2.728433847427368 + }, + { + "auxiliary_loss_clip": 0.01082605, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.03357887, + "balance_loss_mlp": 1.01969028, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 1.6576412370748115, + "language_loss": 0.70836103, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.72951913, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 2.627368927001953 + }, + { + "auxiliary_loss_clip": 0.01065809, + "auxiliary_loss_mlp": 0.01055286, + "balance_loss_clip": 1.03813434, + "balance_loss_mlp": 1.0366652, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.6538737415211426, + "language_loss": 0.8021692, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82338017, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 2.7446787357330322 + }, + { + "auxiliary_loss_clip": 0.01018803, + "auxiliary_loss_mlp": 0.0100194, + "balance_loss_clip": 1.00747061, + "balance_loss_mlp": 1.00036645, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8359575897416844, + "language_loss": 0.57839239, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59859985, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.0403811931610107 + }, + { + "auxiliary_loss_clip": 0.01074036, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_clip": 1.03437674, + "balance_loss_mlp": 1.02278972, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.933596039612061, + "language_loss": 0.73490238, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75601959, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 2.729203462600708 + }, + { + "auxiliary_loss_clip": 0.0109424, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.03685248, + "balance_loss_mlp": 1.0239079, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.6641891677792988, + "language_loss": 0.70502985, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72636306, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 2.659846544265747 + }, + { + "auxiliary_loss_clip": 0.01076046, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.03737307, + "balance_loss_mlp": 1.02317882, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.7428317757064664, + "language_loss": 0.77056384, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79170603, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 2.683129072189331 + }, + { + "auxiliary_loss_clip": 0.01105195, + "auxiliary_loss_mlp": 0.01039421, + "balance_loss_clip": 1.03896809, + "balance_loss_mlp": 1.02488923, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.610607610450107, + "language_loss": 0.68524408, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70669025, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.7297937870025635 + }, + { + "auxiliary_loss_clip": 0.01072978, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_clip": 1.03315127, + "balance_loss_mlp": 1.03091884, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7561876382527997, + "language_loss": 0.78830481, + "learning_rate": 3.176169078234487e-06, + "loss": 0.80949569, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 2.6624600887298584 + }, + { + "auxiliary_loss_clip": 0.01096714, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.03498149, + "balance_loss_mlp": 1.02069509, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.5901257738563144, + "language_loss": 0.74227393, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76358241, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.5698277950286865 + }, + { + "auxiliary_loss_clip": 0.01093179, + "auxiliary_loss_mlp": 0.01039229, + "balance_loss_clip": 1.0347687, + "balance_loss_mlp": 1.02375567, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 1.842431614769002, + "language_loss": 0.63363671, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65496081, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 2.643054485321045 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.03819072, + "balance_loss_mlp": 1.0231607, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 2.25031363653403, + "language_loss": 0.81474262, + "learning_rate": 3.175223888387192e-06, + "loss": 0.83628273, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 2.5600738525390625 + }, + { + "auxiliary_loss_clip": 0.01083132, + "auxiliary_loss_mlp": 0.01045256, + "balance_loss_clip": 1.03586364, + "balance_loss_mlp": 1.03082538, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.7567765883313922, + "language_loss": 0.76195437, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78323829, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 2.615849733352661 + }, + { + "auxiliary_loss_clip": 0.01081912, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.03982091, + "balance_loss_mlp": 1.02456951, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.8355968873301178, + "language_loss": 0.7952618, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.8164708, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 2.665355920791626 + }, + { + "auxiliary_loss_clip": 0.01094823, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.03858352, + "balance_loss_mlp": 1.02125645, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 2.271491017136658, + "language_loss": 0.74535447, + "learning_rate": 3.174278297458438e-06, + "loss": 0.76667309, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 2.638439416885376 + }, + { + "auxiliary_loss_clip": 0.01051491, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.03238177, + "balance_loss_mlp": 1.02069795, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.7404471558552577, + "language_loss": 0.8248862, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84576797, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 2.7625198364257812 + }, + { + "auxiliary_loss_clip": 0.0106876, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.0362556, + "balance_loss_mlp": 1.02087533, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 1.821650201009104, + "language_loss": 0.79453009, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81557763, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 2.6610021591186523 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.03609276, + "balance_loss_mlp": 1.02140284, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.602393642909806, + "language_loss": 0.8310166, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85229671, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 2.61352276802063 + }, + { + "auxiliary_loss_clip": 0.01087253, + "auxiliary_loss_mlp": 0.01038904, + "balance_loss_clip": 1.0382359, + "balance_loss_mlp": 1.02296543, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.5917176437004368, + "language_loss": 0.81732047, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83858204, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.7280731201171875 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.03814101, + "balance_loss_mlp": 1.02545404, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 3.8936488153693047, + "language_loss": 0.80065835, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82209033, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 2.5731852054595947 + }, + { + "auxiliary_loss_clip": 0.01088722, + "auxiliary_loss_mlp": 0.01051406, + "balance_loss_clip": 1.036798, + "balance_loss_mlp": 1.03612947, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 2.0159804244213007, + "language_loss": 0.84997141, + "learning_rate": 3.172385913647542e-06, + "loss": 0.8713727, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.611767292022705 + }, + { + "auxiliary_loss_clip": 0.01085181, + "auxiliary_loss_mlp": 0.01039603, + "balance_loss_clip": 1.03619146, + "balance_loss_mlp": 1.02458239, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 1.5615455527150093, + "language_loss": 0.80804849, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82929623, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 4.11592435836792 + }, + { + "auxiliary_loss_clip": 0.01105744, + "auxiliary_loss_mlp": 0.0104172, + "balance_loss_clip": 1.03843594, + "balance_loss_mlp": 1.02716446, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.5944518002909633, + "language_loss": 0.79728711, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.81876171, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.6151907444000244 + }, + { + "auxiliary_loss_clip": 0.01082804, + "auxiliary_loss_mlp": 0.01041356, + "balance_loss_clip": 1.0378623, + "balance_loss_mlp": 1.02569163, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.7765416625302597, + "language_loss": 0.75554395, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77678561, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 2.6605958938598633 + }, + { + "auxiliary_loss_clip": 0.0107401, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.03797317, + "balance_loss_mlp": 1.020473, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 1.8797317721701823, + "language_loss": 0.81857193, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83967125, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.6986348628997803 + }, + { + "auxiliary_loss_clip": 0.01055226, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.04123867, + "balance_loss_mlp": 1.02152705, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.4944251330231926, + "language_loss": 0.73131651, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75224006, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.8927268981933594 + }, + { + "auxiliary_loss_clip": 0.01081882, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.03489304, + "balance_loss_mlp": 1.02217078, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 2.51372697560668, + "language_loss": 0.83362198, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.85480481, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 2.9477932453155518 + }, + { + "auxiliary_loss_clip": 0.01121155, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.04166973, + "balance_loss_mlp": 1.03125477, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 1.7887756756470545, + "language_loss": 0.70937216, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73104227, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 4.115885972976685 + }, + { + "auxiliary_loss_clip": 0.01076258, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.0390538, + "balance_loss_mlp": 1.02353954, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.4874885600337624, + "language_loss": 0.67864454, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.69980103, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.7190117835998535 + }, + { + "auxiliary_loss_clip": 0.01011369, + "auxiliary_loss_mlp": 0.01004605, + "balance_loss_clip": 1.01058578, + "balance_loss_mlp": 1.00294828, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7071637043962496, + "language_loss": 0.58301389, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60317367, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.268502712249756 + }, + { + "auxiliary_loss_clip": 0.01054837, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.0341332, + "balance_loss_mlp": 1.02328038, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 1.6659526003459362, + "language_loss": 0.83656454, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.8575027, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 4.310226917266846 + }, + { + "auxiliary_loss_clip": 0.01104617, + "auxiliary_loss_mlp": 0.01035241, + "balance_loss_clip": 1.0364188, + "balance_loss_mlp": 1.02039909, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.5599568503080141, + "language_loss": 0.79605722, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81745589, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 4.196394443511963 + }, + { + "auxiliary_loss_clip": 0.01023719, + "auxiliary_loss_mlp": 0.0101442, + "balance_loss_clip": 1.00880349, + "balance_loss_mlp": 1.01289988, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.6600320203792001, + "language_loss": 0.57008326, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59046471, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 3.0002593994140625 + }, + { + "auxiliary_loss_clip": 0.01055158, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.03372586, + "balance_loss_mlp": 1.02331233, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.9638631483639688, + "language_loss": 0.7147702, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73570442, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 2.872767448425293 + }, + { + "auxiliary_loss_clip": 0.01097676, + "auxiliary_loss_mlp": 0.01045786, + "balance_loss_clip": 1.03765106, + "balance_loss_mlp": 1.03019965, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.6968990047165138, + "language_loss": 0.73849714, + "learning_rate": 3.167964131913135e-06, + "loss": 0.7599318, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.7675609588623047 + }, + { + "auxiliary_loss_clip": 0.01109026, + "auxiliary_loss_mlp": 0.01040104, + "balance_loss_clip": 1.03900838, + "balance_loss_mlp": 1.02508426, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.372142631993466, + "language_loss": 0.76889765, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79038894, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.607436180114746 + }, + { + "auxiliary_loss_clip": 0.01096322, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.03908718, + "balance_loss_mlp": 1.02278984, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.9871583350294513, + "language_loss": 0.77192765, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79327732, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 2.5625195503234863 + }, + { + "auxiliary_loss_clip": 0.0110288, + "auxiliary_loss_mlp": 0.01040249, + "balance_loss_clip": 1.04492509, + "balance_loss_mlp": 1.02550316, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 2.244718547157026, + "language_loss": 0.76667261, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78810394, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.6905102729797363 + }, + { + "auxiliary_loss_clip": 0.0108643, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.03567457, + "balance_loss_mlp": 1.01935887, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 2.117940050385247, + "language_loss": 0.72324276, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74445379, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 2.6085100173950195 + }, + { + "auxiliary_loss_clip": 0.01112948, + "auxiliary_loss_mlp": 0.01037774, + "balance_loss_clip": 1.03798366, + "balance_loss_mlp": 1.02367151, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 4.050110796627107, + "language_loss": 0.74341011, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76491737, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 2.5668935775756836 + }, + { + "auxiliary_loss_clip": 0.01073571, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.0321362, + "balance_loss_mlp": 1.02065468, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.6170079112051494, + "language_loss": 0.78484619, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80594116, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 2.6680049896240234 + }, + { + "auxiliary_loss_clip": 0.01075409, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.03748107, + "balance_loss_mlp": 1.01585913, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.9756049247697127, + "language_loss": 0.83582652, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85687983, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 2.6719677448272705 + }, + { + "auxiliary_loss_clip": 0.01117223, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.04039037, + "balance_loss_mlp": 1.02020168, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 2.1441763503532094, + "language_loss": 0.82610798, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.84762508, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 2.5746383666992188 + }, + { + "auxiliary_loss_clip": 0.01109507, + "auxiliary_loss_mlp": 0.0075012, + "balance_loss_clip": 1.04016423, + "balance_loss_mlp": 1.00043416, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 3.7081524884835093, + "language_loss": 0.88747305, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90606934, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.6537933349609375 + }, + { + "auxiliary_loss_clip": 0.01116789, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.04111362, + "balance_loss_mlp": 1.02654505, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.9284580993608649, + "language_loss": 0.72847164, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75005019, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.5575404167175293 + }, + { + "auxiliary_loss_clip": 0.01085301, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.0372684, + "balance_loss_mlp": 1.01678562, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.735161017517109, + "language_loss": 0.81472337, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83588719, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.6157443523406982 + }, + { + "auxiliary_loss_clip": 0.01070658, + "auxiliary_loss_mlp": 0.01040703, + "balance_loss_clip": 1.03426421, + "balance_loss_mlp": 1.02637398, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 1.9600851885184736, + "language_loss": 0.87331158, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89442515, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 2.6956231594085693 + }, + { + "auxiliary_loss_clip": 0.01117116, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.03765631, + "balance_loss_mlp": 1.01907969, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 2.0817257391711435, + "language_loss": 0.75782549, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.77934933, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 2.5785045623779297 + }, + { + "auxiliary_loss_clip": 0.01077719, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.04000974, + "balance_loss_mlp": 1.01903856, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.6532303456016488, + "language_loss": 0.66950226, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.690611, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 2.6601455211639404 + }, + { + "auxiliary_loss_clip": 0.01077144, + "auxiliary_loss_mlp": 0.01044643, + "balance_loss_clip": 1.03558969, + "balance_loss_mlp": 1.0278163, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.507623691734556, + "language_loss": 0.7234329, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74465078, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 2.6831297874450684 + }, + { + "auxiliary_loss_clip": 0.01098911, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.03736639, + "balance_loss_mlp": 1.0193758, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.8696474268073275, + "language_loss": 0.82461083, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84593928, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 2.6281235218048096 + }, + { + "auxiliary_loss_clip": 0.01105515, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.03745031, + "balance_loss_mlp": 1.02103877, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.6598095547778122, + "language_loss": 0.78905535, + "learning_rate": 3.162583158454388e-06, + "loss": 0.81045854, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 2.6710124015808105 + }, + { + "auxiliary_loss_clip": 0.01103065, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.0395391, + "balance_loss_mlp": 1.02511907, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.6070309093573805, + "language_loss": 0.76701522, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.78843433, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 2.627065420150757 + }, + { + "auxiliary_loss_clip": 0.01099578, + "auxiliary_loss_mlp": 0.01035092, + "balance_loss_clip": 1.03701949, + "balance_loss_mlp": 1.02218747, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.7423953323539747, + "language_loss": 0.71373457, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.73508132, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 2.596647024154663 + }, + { + "auxiliary_loss_clip": 0.01087345, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_clip": 1.0357933, + "balance_loss_mlp": 1.02938318, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.1235192229311437, + "language_loss": 0.70002294, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72135472, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 2.636401891708374 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.03917992, + "balance_loss_mlp": 1.02492952, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 2.388459046419263, + "language_loss": 0.78807402, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80948377, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 2.586434841156006 + }, + { + "auxiliary_loss_clip": 0.01059361, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.02599967, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 3.87271744216741, + "language_loss": 0.7459873, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76699704, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.7323882579803467 + }, + { + "auxiliary_loss_clip": 0.01091095, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_clip": 1.03995705, + "balance_loss_mlp": 1.0276823, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.8141706219439424, + "language_loss": 0.71493489, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.73626161, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.686640501022339 + }, + { + "auxiliary_loss_clip": 0.01117352, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.03779829, + "balance_loss_mlp": 1.02425623, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.7644982777637903, + "language_loss": 0.9420948, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96365559, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 2.589963674545288 + }, + { + "auxiliary_loss_clip": 0.01111575, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.04176056, + "balance_loss_mlp": 1.02562714, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.001536479624172, + "language_loss": 0.77340496, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79493409, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.602935791015625 + }, + { + "auxiliary_loss_clip": 0.01086723, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.03480077, + "balance_loss_mlp": 1.01890194, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 1.782128877247561, + "language_loss": 0.71367741, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73488212, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.73557448387146 + }, + { + "auxiliary_loss_clip": 0.01081518, + "auxiliary_loss_mlp": 0.01037286, + "balance_loss_clip": 1.03967237, + "balance_loss_mlp": 1.02228999, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 3.0820697738891347, + "language_loss": 0.80565411, + "learning_rate": 3.159411924656557e-06, + "loss": 0.82684213, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 4.078688859939575 + }, + { + "auxiliary_loss_clip": 0.01086783, + "auxiliary_loss_mlp": 0.01043743, + "balance_loss_clip": 1.03846049, + "balance_loss_mlp": 1.02814484, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 1.8737967136865492, + "language_loss": 0.7303223, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75162756, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 2.6440484523773193 + }, + { + "auxiliary_loss_clip": 0.01089357, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.03611636, + "balance_loss_mlp": 1.02349949, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.5865727611417137, + "language_loss": 0.7684809, + "learning_rate": 3.158777149931855e-06, + "loss": 0.78974223, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.5730950832366943 + }, + { + "auxiliary_loss_clip": 0.01087702, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.03617847, + "balance_loss_mlp": 1.02841139, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 2.3974148350746005, + "language_loss": 0.62389845, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64522541, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 2.6970431804656982 + }, + { + "auxiliary_loss_clip": 0.01102101, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.03839576, + "balance_loss_mlp": 1.02208281, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.9720841951728358, + "language_loss": 0.82121432, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84260285, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.59031081199646 + }, + { + "auxiliary_loss_clip": 0.01088806, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.02628946, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.6912292250280927, + "language_loss": 0.8119716, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83324993, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 2.705552577972412 + }, + { + "auxiliary_loss_clip": 0.01105864, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.04261172, + "balance_loss_mlp": 1.02278471, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 1.8275787504460879, + "language_loss": 0.83502394, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85644245, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 4.133347034454346 + }, + { + "auxiliary_loss_clip": 0.01079383, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.03835905, + "balance_loss_mlp": 1.02844429, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 2.050263108422433, + "language_loss": 0.75974715, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.7809931, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.6613409519195557 + }, + { + "auxiliary_loss_clip": 0.01082615, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.0391376, + "balance_loss_mlp": 1.02274728, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.0609677804090327, + "language_loss": 0.66405392, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68525064, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.6671998500823975 + }, + { + "auxiliary_loss_clip": 0.01086539, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.03592134, + "balance_loss_mlp": 1.01305985, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 2.8940260298658758, + "language_loss": 0.72937453, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75050801, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 4.128523349761963 + }, + { + "auxiliary_loss_clip": 0.01078099, + "auxiliary_loss_mlp": 0.0103734, + "balance_loss_clip": 1.035882, + "balance_loss_mlp": 1.02233768, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.0398389978269793, + "language_loss": 0.71187127, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73302567, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 4.144531011581421 + }, + { + "auxiliary_loss_clip": 0.01107057, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.03914177, + "balance_loss_mlp": 1.02289772, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 2.4099327663036894, + "language_loss": 0.79747266, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81890488, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 2.6483957767486572 + }, + { + "auxiliary_loss_clip": 0.01084548, + "auxiliary_loss_mlp": 0.01041478, + "balance_loss_clip": 1.03577518, + "balance_loss_mlp": 1.02525377, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.3486117747253794, + "language_loss": 0.87842464, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89968485, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.624479293823242 + }, + { + "auxiliary_loss_clip": 0.01053914, + "auxiliary_loss_mlp": 0.01044144, + "balance_loss_clip": 1.03090668, + "balance_loss_mlp": 1.02832448, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.937226422689456, + "language_loss": 0.84935641, + "learning_rate": 3.155282749751332e-06, + "loss": 0.87033695, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.6696157455444336 + }, + { + "auxiliary_loss_clip": 0.01078399, + "auxiliary_loss_mlp": 0.01039869, + "balance_loss_clip": 1.03588474, + "balance_loss_mlp": 1.02713132, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.4827939721247447, + "language_loss": 0.87507832, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89626098, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 2.654857635498047 + }, + { + "auxiliary_loss_clip": 0.01103177, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.0390172, + "balance_loss_mlp": 1.01963234, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6435193019348004, + "language_loss": 0.7218349, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74320543, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 2.634108304977417 + }, + { + "auxiliary_loss_clip": 0.0107585, + "auxiliary_loss_mlp": 0.01039596, + "balance_loss_clip": 1.03890872, + "balance_loss_mlp": 1.02518964, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 2.0960080046349683, + "language_loss": 0.82599413, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.8471486, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.685990571975708 + }, + { + "auxiliary_loss_clip": 0.01112296, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.03848338, + "balance_loss_mlp": 1.01697326, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.7817538089406695, + "language_loss": 0.87668514, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.89810699, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 2.633803367614746 + }, + { + "auxiliary_loss_clip": 0.01089698, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.03637409, + "balance_loss_mlp": 1.01951289, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.388228905342812, + "language_loss": 0.69349825, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71472842, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 2.6928067207336426 + }, + { + "auxiliary_loss_clip": 0.01108749, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.03809047, + "balance_loss_mlp": 1.019822, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.7410680415192223, + "language_loss": 0.77426624, + "learning_rate": 3.153374478034841e-06, + "loss": 0.7956953, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 2.628310203552246 + }, + { + "auxiliary_loss_clip": 0.01055701, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.03118396, + "balance_loss_mlp": 1.02535677, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 1.5987453688670796, + "language_loss": 0.83090818, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85186046, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.9007880687713623 + }, + { + "auxiliary_loss_clip": 0.01062315, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.03563809, + "balance_loss_mlp": 1.02007389, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.5401066499615204, + "language_loss": 0.71115172, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73210603, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.6580734252929688 + }, + { + "auxiliary_loss_clip": 0.01059982, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.03498769, + "balance_loss_mlp": 1.02281582, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.5682080342579683, + "language_loss": 0.83104986, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85200924, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.7334048748016357 + }, + { + "auxiliary_loss_clip": 0.01084999, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.03867328, + "balance_loss_mlp": 1.01948762, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 1.6414811648310108, + "language_loss": 0.80481666, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82601011, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.753401279449463 + }, + { + "auxiliary_loss_clip": 0.01091895, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.03820729, + "balance_loss_mlp": 1.01800346, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 2.4303580039394665, + "language_loss": 0.76917326, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79041779, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 2.6557679176330566 + }, + { + "auxiliary_loss_clip": 0.01004001, + "auxiliary_loss_mlp": 0.01012585, + "balance_loss_clip": 1.00951672, + "balance_loss_mlp": 1.01122558, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9089852336724995, + "language_loss": 0.63930774, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.65947354, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.137448310852051 + }, + { + "auxiliary_loss_clip": 0.01064229, + "auxiliary_loss_mlp": 0.01037655, + "balance_loss_clip": 1.03299975, + "balance_loss_mlp": 1.02363563, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 1.7472651644211086, + "language_loss": 0.73984236, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76086116, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 2.772765874862671 + }, + { + "auxiliary_loss_clip": 0.01037295, + "auxiliary_loss_mlp": 0.01005216, + "balance_loss_clip": 1.01273179, + "balance_loss_mlp": 1.00383329, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.8391355004297449, + "language_loss": 0.57940853, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59983361, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 3.2084720134735107 + }, + { + "auxiliary_loss_clip": 0.01013982, + "auxiliary_loss_mlp": 0.01006818, + "balance_loss_clip": 1.00933421, + "balance_loss_mlp": 1.00517261, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.8252342152511627, + "language_loss": 0.63366413, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65387213, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 3.2667038440704346 + }, + { + "auxiliary_loss_clip": 0.01085667, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.03817701, + "balance_loss_mlp": 1.02310228, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 2.0842383004404583, + "language_loss": 0.69316107, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71437979, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 2.6041440963745117 + }, + { + "auxiliary_loss_clip": 0.01103571, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.03881121, + "balance_loss_mlp": 1.01996613, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.6766760194044428, + "language_loss": 0.76865125, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79003155, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 2.590878486633301 + }, + { + "auxiliary_loss_clip": 0.01099915, + "auxiliary_loss_mlp": 0.00749726, + "balance_loss_clip": 1.03581786, + "balance_loss_mlp": 1.00033581, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.5213211804337359, + "language_loss": 0.80132663, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.81982303, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 2.636601448059082 + }, + { + "auxiliary_loss_clip": 0.01110737, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.03831744, + "balance_loss_mlp": 1.01896858, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.5852689413738617, + "language_loss": 0.75369304, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77511907, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 2.595940351486206 + }, + { + "auxiliary_loss_clip": 0.01079829, + "auxiliary_loss_mlp": 0.00749609, + "balance_loss_clip": 1.03817284, + "balance_loss_mlp": 1.00028086, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.168653147361433, + "language_loss": 0.62640345, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64469779, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.621196985244751 + }, + { + "auxiliary_loss_clip": 0.01077844, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.03388357, + "balance_loss_mlp": 1.0165906, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 2.2602314419177936, + "language_loss": 0.74642479, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76748681, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.631018877029419 + }, + { + "auxiliary_loss_clip": 0.01084983, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.03701425, + "balance_loss_mlp": 1.02018881, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.5922373152278644, + "language_loss": 0.76863849, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.7898156, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.685159921646118 + }, + { + "auxiliary_loss_clip": 0.01083072, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.03596544, + "balance_loss_mlp": 1.02364922, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 2.363650075965482, + "language_loss": 0.77945918, + "learning_rate": 3.147959166423428e-06, + "loss": 0.80067873, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 2.724561929702759 + }, + { + "auxiliary_loss_clip": 0.01067938, + "auxiliary_loss_mlp": 0.01036226, + "balance_loss_clip": 1.03646052, + "balance_loss_mlp": 1.02143228, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.727056639786864, + "language_loss": 0.73738742, + "learning_rate": 3.147640226324893e-06, + "loss": 0.75842905, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.700629949569702 + }, + { + "auxiliary_loss_clip": 0.01080271, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.03517866, + "balance_loss_mlp": 1.02089906, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.608635301965707, + "language_loss": 0.79119778, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.8123523, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 2.692312240600586 + }, + { + "auxiliary_loss_clip": 0.01100081, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.0362463, + "balance_loss_mlp": 1.02461505, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.5917570384775064, + "language_loss": 0.71175373, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73313606, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.5674538612365723 + }, + { + "auxiliary_loss_clip": 0.01078451, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.03699064, + "balance_loss_mlp": 1.02262044, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.579221686085342, + "language_loss": 0.78664207, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80777693, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 4.198997974395752 + }, + { + "auxiliary_loss_clip": 0.01066106, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.03903866, + "balance_loss_mlp": 1.02412105, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 4.549903645307167, + "language_loss": 0.84356564, + "learning_rate": 3.146364030865399e-06, + "loss": 0.8646161, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 2.685986042022705 + }, + { + "auxiliary_loss_clip": 0.01096619, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.03679323, + "balance_loss_mlp": 1.01888812, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.65699155587855, + "language_loss": 0.70585525, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72714174, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 2.5858726501464844 + }, + { + "auxiliary_loss_clip": 0.01049937, + "auxiliary_loss_mlp": 0.01051397, + "balance_loss_clip": 1.0322063, + "balance_loss_mlp": 1.036901, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.5363603858043315, + "language_loss": 0.84359735, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86461073, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 2.6800761222839355 + }, + { + "auxiliary_loss_clip": 0.01089228, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.03828716, + "balance_loss_mlp": 1.01683128, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.479930924267899, + "language_loss": 0.85695422, + "learning_rate": 3.145406427790931e-06, + "loss": 0.87814355, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.60933256149292 + }, + { + "auxiliary_loss_clip": 0.01089459, + "auxiliary_loss_mlp": 0.01037804, + "balance_loss_clip": 1.03615344, + "balance_loss_mlp": 1.02370155, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.9369602976338693, + "language_loss": 0.87765408, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89892673, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 4.204620838165283 + }, + { + "auxiliary_loss_clip": 0.01112203, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.03813267, + "balance_loss_mlp": 1.017066, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.247464972903345, + "language_loss": 0.75890708, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78032559, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.5446090698242188 + }, + { + "auxiliary_loss_clip": 0.01113456, + "auxiliary_loss_mlp": 0.01031513, + "balance_loss_clip": 1.03920627, + "balance_loss_mlp": 1.01848972, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.9096236513749525, + "language_loss": 0.719024, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74047369, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.579092025756836 + }, + { + "auxiliary_loss_clip": 0.01068503, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.0334506, + "balance_loss_mlp": 1.01915836, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.5743500539098396, + "language_loss": 0.63670027, + "learning_rate": 3.144129015673189e-06, + "loss": 0.65773106, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.711817502975464 + }, + { + "auxiliary_loss_clip": 0.01104713, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.0407151, + "balance_loss_mlp": 1.02049088, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.7651773971835165, + "language_loss": 0.74440515, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76579642, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 4.241560697555542 + }, + { + "auxiliary_loss_clip": 0.01104526, + "auxiliary_loss_mlp": 0.01039655, + "balance_loss_clip": 1.03963053, + "balance_loss_mlp": 1.02502775, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.2290916166768144, + "language_loss": 0.75010365, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77154541, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 2.627246379852295 + }, + { + "auxiliary_loss_clip": 0.01101756, + "auxiliary_loss_mlp": 0.007497, + "balance_loss_clip": 1.0386771, + "balance_loss_mlp": 1.00020361, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9992799175964997, + "language_loss": 0.84488612, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.8634007, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.584047317504883 + }, + { + "auxiliary_loss_clip": 0.01097524, + "auxiliary_loss_mlp": 0.01036137, + "balance_loss_clip": 1.03511024, + "balance_loss_mlp": 1.02179646, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 2.0049033301445687, + "language_loss": 0.86188447, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88322109, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.5960710048675537 + }, + { + "auxiliary_loss_clip": 0.01078638, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.037889, + "balance_loss_mlp": 1.01755583, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.7381785319774787, + "language_loss": 0.7713871, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79250056, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 2.7024173736572266 + }, + { + "auxiliary_loss_clip": 0.01084193, + "auxiliary_loss_mlp": 0.00749709, + "balance_loss_clip": 1.03699505, + "balance_loss_mlp": 1.00025272, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.4188928353260857, + "language_loss": 0.81874478, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83708376, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 2.690106153488159 + }, + { + "auxiliary_loss_clip": 0.010714, + "auxiliary_loss_mlp": 0.0103701, + "balance_loss_clip": 1.03613174, + "balance_loss_mlp": 1.02275229, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.288993427071783, + "language_loss": 0.59143555, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61251962, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.6860811710357666 + }, + { + "auxiliary_loss_clip": 0.01107424, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.0408895, + "balance_loss_mlp": 1.02001309, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.3873680402887354, + "language_loss": 0.88797796, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90939593, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 2.6020774841308594 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.04366446, + "balance_loss_mlp": 1.01981521, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.7534098034731311, + "language_loss": 0.79207897, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81351817, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 2.6792893409729004 + }, + { + "auxiliary_loss_clip": 0.01082365, + "auxiliary_loss_mlp": 0.00749895, + "balance_loss_clip": 1.03532219, + "balance_loss_mlp": 1.00024199, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 2.9449642894942665, + "language_loss": 0.73011231, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.7484349, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.6792237758636475 + }, + { + "auxiliary_loss_clip": 0.01114531, + "auxiliary_loss_mlp": 0.01039277, + "balance_loss_clip": 1.03904521, + "balance_loss_mlp": 1.02539492, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.4251487563709493, + "language_loss": 0.66905147, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69058955, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.687713623046875 + }, + { + "auxiliary_loss_clip": 0.01083908, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.03909993, + "balance_loss_mlp": 1.01711893, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.497758739898532, + "language_loss": 0.6528486, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67399359, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.7081751823425293 + }, + { + "auxiliary_loss_clip": 0.01102983, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.03890145, + "balance_loss_mlp": 1.02422309, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.4930286676636542, + "language_loss": 0.77107662, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79248226, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.613290786743164 + }, + { + "auxiliary_loss_clip": 0.01108, + "auxiliary_loss_mlp": 0.01039402, + "balance_loss_clip": 1.04060543, + "balance_loss_mlp": 1.02447081, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.4812069799514505, + "language_loss": 0.70663875, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72811282, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 2.6729378700256348 + }, + { + "auxiliary_loss_clip": 0.01087997, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.03636932, + "balance_loss_mlp": 1.02008653, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.5885546360403315, + "language_loss": 0.78961682, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.81083119, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 2.647279977798462 + }, + { + "auxiliary_loss_clip": 0.01107258, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.03940094, + "balance_loss_mlp": 1.01863992, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 1.9719567559805329, + "language_loss": 0.75603151, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77742773, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.7106504440307617 + }, + { + "auxiliary_loss_clip": 0.01042651, + "auxiliary_loss_mlp": 0.0104278, + "balance_loss_clip": 1.02888083, + "balance_loss_mlp": 1.02952361, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.1301496903287203, + "language_loss": 0.76885587, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.78971016, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 2.6385130882263184 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.0104109, + "balance_loss_clip": 1.03951454, + "balance_loss_mlp": 1.02591443, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.7522129176255306, + "language_loss": 0.73912048, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76058865, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.7236287593841553 + }, + { + "auxiliary_loss_clip": 0.01115968, + "auxiliary_loss_mlp": 0.01041466, + "balance_loss_clip": 1.03915501, + "balance_loss_mlp": 1.02778053, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.396585210800322, + "language_loss": 0.78233588, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80391026, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 2.656022787094116 + }, + { + "auxiliary_loss_clip": 0.01080911, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.03491139, + "balance_loss_mlp": 1.01915574, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.4828242329358368, + "language_loss": 0.78707474, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.80821526, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 2.756098747253418 + }, + { + "auxiliary_loss_clip": 0.01096736, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.03792691, + "balance_loss_mlp": 1.02330112, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 1.9301638423666683, + "language_loss": 0.72935516, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.7506935, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 2.608564853668213 + }, + { + "auxiliary_loss_clip": 0.01096112, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.03996086, + "balance_loss_mlp": 1.02419662, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8545048438484226, + "language_loss": 0.8409431, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86228341, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 2.725297451019287 + }, + { + "auxiliary_loss_clip": 0.01113173, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.03682065, + "balance_loss_mlp": 1.01759136, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.9688521078903318, + "language_loss": 0.76593345, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78737092, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 2.583357810974121 + }, + { + "auxiliary_loss_clip": 0.01093934, + "auxiliary_loss_mlp": 0.01036742, + "balance_loss_clip": 1.03662229, + "balance_loss_mlp": 1.02125096, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.7960580310231165, + "language_loss": 0.62708676, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.64839351, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.7406928539276123 + }, + { + "auxiliary_loss_clip": 0.01111533, + "auxiliary_loss_mlp": 0.00749656, + "balance_loss_clip": 1.03856635, + "balance_loss_mlp": 1.00022435, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.4168681872714564, + "language_loss": 0.78340954, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80202138, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.583028554916382 + }, + { + "auxiliary_loss_clip": 0.01083841, + "auxiliary_loss_mlp": 0.01034835, + "balance_loss_clip": 1.0377171, + "balance_loss_mlp": 1.02043414, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.572506538453671, + "language_loss": 0.69750118, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.71868795, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.630862236022949 + }, + { + "auxiliary_loss_clip": 0.01101711, + "auxiliary_loss_mlp": 0.01035098, + "balance_loss_clip": 1.03864086, + "balance_loss_mlp": 1.02157927, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.7413795921350668, + "language_loss": 0.72387433, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74524248, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 2.57302188873291 + }, + { + "auxiliary_loss_clip": 0.01084685, + "auxiliary_loss_mlp": 0.0104681, + "balance_loss_clip": 1.0364933, + "balance_loss_mlp": 1.03192663, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.7934266422553278, + "language_loss": 0.83126521, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85258013, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 2.64509654045105 + }, + { + "auxiliary_loss_clip": 0.01092158, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.03656769, + "balance_loss_mlp": 1.01886368, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.8623510955583502, + "language_loss": 0.79102588, + "learning_rate": 3.134847066213879e-06, + "loss": 0.8122713, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.6571459770202637 + }, + { + "auxiliary_loss_clip": 0.01091915, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.0365634, + "balance_loss_mlp": 1.01811361, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.6676426090111578, + "language_loss": 0.74346519, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76470542, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.7033824920654297 + }, + { + "auxiliary_loss_clip": 0.01087725, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.03740036, + "balance_loss_mlp": 1.02596438, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.7651381691889316, + "language_loss": 0.78262681, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80391461, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 2.6774120330810547 + }, + { + "auxiliary_loss_clip": 0.01077951, + "auxiliary_loss_mlp": 0.0103146, + "balance_loss_clip": 1.03475356, + "balance_loss_mlp": 1.01801956, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.7175705157633165, + "language_loss": 0.81774127, + "learning_rate": 3.133884793883107e-06, + "loss": 0.83883536, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 4.244718074798584 + }, + { + "auxiliary_loss_clip": 0.01114782, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.03754878, + "balance_loss_mlp": 1.02155137, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.75502272290315, + "language_loss": 0.67647588, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69798064, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 2.7691924571990967 + }, + { + "auxiliary_loss_clip": 0.01119286, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_clip": 1.03955591, + "balance_loss_mlp": 1.0275445, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 2.2003966863909383, + "language_loss": 0.64850533, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67013466, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 2.723322629928589 + }, + { + "auxiliary_loss_clip": 0.01104427, + "auxiliary_loss_mlp": 0.01042293, + "balance_loss_clip": 1.04071116, + "balance_loss_mlp": 1.0272491, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6939157426394527, + "language_loss": 0.88065946, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90212667, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 2.586672067642212 + }, + { + "auxiliary_loss_clip": 0.01074759, + "auxiliary_loss_mlp": 0.01037559, + "balance_loss_clip": 1.03450966, + "balance_loss_mlp": 1.0219183, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 1.851884108956157, + "language_loss": 0.78595269, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80707586, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 2.717046022415161 + }, + { + "auxiliary_loss_clip": 0.01013828, + "auxiliary_loss_mlp": 0.01022789, + "balance_loss_clip": 1.00903678, + "balance_loss_mlp": 1.0209527, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.8162686891204645, + "language_loss": 0.60246301, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62282914, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.0942094326019287 + }, + { + "auxiliary_loss_clip": 0.01061011, + "auxiliary_loss_mlp": 0.01057758, + "balance_loss_clip": 1.03070557, + "balance_loss_mlp": 1.03900611, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.8688664726984974, + "language_loss": 0.77288073, + "learning_rate": 3.131959088630455e-06, + "loss": 0.79406846, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 4.17413592338562 + }, + { + "auxiliary_loss_clip": 0.01074188, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.03557014, + "balance_loss_mlp": 1.02613115, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.817093130214778, + "language_loss": 0.74469507, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76583791, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.636976718902588 + }, + { + "auxiliary_loss_clip": 0.01109485, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.03784454, + "balance_loss_mlp": 1.02437806, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 1.9482196227798452, + "language_loss": 0.75710791, + "learning_rate": 3.131316843357713e-06, + "loss": 0.77857196, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 4.13489842414856 + }, + { + "auxiliary_loss_clip": 0.01099351, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.03599644, + "balance_loss_mlp": 1.0208168, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.8387309940569645, + "language_loss": 0.80010593, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.8214376, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 4.0471954345703125 + }, + { + "auxiliary_loss_clip": 0.01016194, + "auxiliary_loss_mlp": 0.01005654, + "balance_loss_clip": 1.01155019, + "balance_loss_mlp": 1.00436616, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7445725678767633, + "language_loss": 0.56542146, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58563995, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.2293269634246826 + }, + { + "auxiliary_loss_clip": 0.01096547, + "auxiliary_loss_mlp": 0.00749778, + "balance_loss_clip": 1.03520072, + "balance_loss_mlp": 1.0003494, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.7689009534534381, + "language_loss": 0.77507424, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79353756, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.597136974334717 + }, + { + "auxiliary_loss_clip": 0.01094811, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.03880572, + "balance_loss_mlp": 1.02600372, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 2.046364614540114, + "language_loss": 0.787233, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80857515, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 2.741565227508545 + }, + { + "auxiliary_loss_clip": 0.01102943, + "auxiliary_loss_mlp": 0.01039717, + "balance_loss_clip": 1.03625584, + "balance_loss_mlp": 1.02527475, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.7114746796208782, + "language_loss": 0.73995382, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76138043, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 2.6043925285339355 + }, + { + "auxiliary_loss_clip": 0.01094595, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.03628361, + "balance_loss_mlp": 1.02985525, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.8762605983905583, + "language_loss": 0.75823855, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77962315, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 2.662236452102661 + }, + { + "auxiliary_loss_clip": 0.01114109, + "auxiliary_loss_mlp": 0.01044776, + "balance_loss_clip": 1.03991115, + "balance_loss_mlp": 1.03098941, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 1.8867010276793283, + "language_loss": 0.7166391, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73822796, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.5998175144195557 + }, + { + "auxiliary_loss_clip": 0.01058989, + "auxiliary_loss_mlp": 0.01039061, + "balance_loss_clip": 1.03760147, + "balance_loss_mlp": 1.02582836, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.9126247243675853, + "language_loss": 0.80547971, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82646018, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 2.8760673999786377 + }, + { + "auxiliary_loss_clip": 0.01079944, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_clip": 1.03360438, + "balance_loss_mlp": 1.02754021, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 1.945535943110254, + "language_loss": 0.84177285, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86299908, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.6429483890533447 + }, + { + "auxiliary_loss_clip": 0.01056994, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.03109443, + "balance_loss_mlp": 1.02524734, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.0616850252981354, + "language_loss": 0.7404635, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76144218, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.7055017948150635 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.03919578, + "balance_loss_mlp": 1.02450824, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.1382915629953754, + "language_loss": 0.72082007, + "learning_rate": 3.127781429646098e-06, + "loss": 0.7423352, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.547595500946045 + }, + { + "auxiliary_loss_clip": 0.0111026, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.03553689, + "balance_loss_mlp": 1.02212465, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.2676437261318427, + "language_loss": 0.8827821, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90423763, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 2.5365705490112305 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.03663445, + "balance_loss_mlp": 1.0192728, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 1.988318708428386, + "language_loss": 0.83234793, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85369158, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 2.6278791427612305 + }, + { + "auxiliary_loss_clip": 0.01086957, + "auxiliary_loss_mlp": 0.01039923, + "balance_loss_clip": 1.03707576, + "balance_loss_mlp": 1.02582014, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 1.7937571007981152, + "language_loss": 0.77691585, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79818469, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 2.748567819595337 + }, + { + "auxiliary_loss_clip": 0.01118244, + "auxiliary_loss_mlp": 0.01038111, + "balance_loss_clip": 1.04122782, + "balance_loss_mlp": 1.02331078, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.215952927892349, + "language_loss": 0.74429756, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76586115, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 2.4977219104766846 + }, + { + "auxiliary_loss_clip": 0.00988808, + "auxiliary_loss_mlp": 0.01012287, + "balance_loss_clip": 1.01010251, + "balance_loss_mlp": 1.01084459, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7895377852357841, + "language_loss": 0.53952253, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55953348, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.242835283279419 + }, + { + "auxiliary_loss_clip": 0.01090281, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.03605354, + "balance_loss_mlp": 1.01789105, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.534546013377965, + "language_loss": 0.8706547, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89187628, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 2.963074207305908 + }, + { + "auxiliary_loss_clip": 0.0108129, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.03861177, + "balance_loss_mlp": 1.02731788, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 1.8829715224301946, + "language_loss": 0.73604172, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75727946, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.7688331604003906 + }, + { + "auxiliary_loss_clip": 0.01078656, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.03455126, + "balance_loss_mlp": 1.01714706, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.0609600472906027, + "language_loss": 0.72586191, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74695766, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 2.6635169982910156 + }, + { + "auxiliary_loss_clip": 0.01086748, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.03651881, + "balance_loss_mlp": 1.02046132, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.0945719379863563, + "language_loss": 0.80166543, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82287133, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.7093565464019775 + }, + { + "auxiliary_loss_clip": 0.01092004, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.03256178, + "balance_loss_mlp": 1.02303255, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 1.9744383056188997, + "language_loss": 0.76393545, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78522718, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 2.6703970432281494 + }, + { + "auxiliary_loss_clip": 0.01096436, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.04019868, + "balance_loss_mlp": 1.02039027, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.572916550073699, + "language_loss": 0.79320574, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81451404, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.6810598373413086 + }, + { + "auxiliary_loss_clip": 0.01104149, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.03874302, + "balance_loss_mlp": 1.0171237, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 1.9568244425195813, + "language_loss": 0.66517735, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68653727, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 2.7753102779388428 + }, + { + "auxiliary_loss_clip": 0.01104615, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.03857255, + "balance_loss_mlp": 1.02655458, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 1.989547021374238, + "language_loss": 0.77656496, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79802632, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.602289915084839 + }, + { + "auxiliary_loss_clip": 0.01091882, + "auxiliary_loss_mlp": 0.01038885, + "balance_loss_clip": 1.04113364, + "balance_loss_mlp": 1.02437735, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.5679710995291334, + "language_loss": 0.72389358, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74520129, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.7455227375030518 + }, + { + "auxiliary_loss_clip": 0.01081696, + "auxiliary_loss_mlp": 0.01041831, + "balance_loss_clip": 1.03324175, + "balance_loss_mlp": 1.02580905, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.636727594858001, + "language_loss": 0.75152469, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77275997, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.65643310546875 + }, + { + "auxiliary_loss_clip": 0.0108931, + "auxiliary_loss_mlp": 0.01037864, + "balance_loss_clip": 1.03695655, + "balance_loss_mlp": 1.02426267, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.991803580267003, + "language_loss": 0.7023918, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72366357, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 2.701698064804077 + }, + { + "auxiliary_loss_clip": 0.01094322, + "auxiliary_loss_mlp": 0.0104483, + "balance_loss_clip": 1.03715801, + "balance_loss_mlp": 1.0309068, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.7872646415281037, + "language_loss": 0.81682611, + "learning_rate": 3.122307436058899e-06, + "loss": 0.83821762, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.731370687484741 + }, + { + "auxiliary_loss_clip": 0.01098471, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.03721595, + "balance_loss_mlp": 1.01728725, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.8024039072339286, + "language_loss": 0.79288429, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81418526, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 2.6708662509918213 + }, + { + "auxiliary_loss_clip": 0.01091512, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.03743601, + "balance_loss_mlp": 1.02461231, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.7766019062113978, + "language_loss": 0.71681881, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73812759, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 2.6551198959350586 + }, + { + "auxiliary_loss_clip": 0.01079964, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.03714943, + "balance_loss_mlp": 1.02220356, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 1.8623035250366637, + "language_loss": 0.71786839, + "learning_rate": 3.12134015873989e-06, + "loss": 0.739025, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 4.115631341934204 + }, + { + "auxiliary_loss_clip": 0.0109683, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.03901434, + "balance_loss_mlp": 1.01475406, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.706668157192298, + "language_loss": 0.73079491, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75204599, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 2.6644785404205322 + }, + { + "auxiliary_loss_clip": 0.01066822, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.03384471, + "balance_loss_mlp": 1.02188957, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.582701307242334, + "language_loss": 0.87932914, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90035331, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 2.715348482131958 + }, + { + "auxiliary_loss_clip": 0.01051836, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.03252029, + "balance_loss_mlp": 1.02421987, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.7195731390854017, + "language_loss": 0.72964287, + "learning_rate": 3.12037249872891e-06, + "loss": 0.75054348, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.750460624694824 + }, + { + "auxiliary_loss_clip": 0.01068267, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.03370738, + "balance_loss_mlp": 1.02142668, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.6773152108345477, + "language_loss": 0.72207832, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.74310863, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.808773994445801 + }, + { + "auxiliary_loss_clip": 0.0107882, + "auxiliary_loss_mlp": 0.01031221, + "balance_loss_clip": 1.03554225, + "balance_loss_mlp": 1.01677251, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.9773323595614767, + "language_loss": 0.68009079, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70119125, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 2.647287368774414 + }, + { + "auxiliary_loss_clip": 0.01083406, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_clip": 1.03516233, + "balance_loss_mlp": 1.02831793, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.1700520955088165, + "language_loss": 0.65913105, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68041235, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 4.2044336795806885 + }, + { + "auxiliary_loss_clip": 0.01098673, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.0369885, + "balance_loss_mlp": 1.01704252, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.6584483260269012, + "language_loss": 0.68991184, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.711209, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.645876407623291 + }, + { + "auxiliary_loss_clip": 0.01103956, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.03687251, + "balance_loss_mlp": 1.02135718, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.6313141020812862, + "language_loss": 0.8048228, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82621574, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.5979573726654053 + }, + { + "auxiliary_loss_clip": 0.01088051, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.03260851, + "balance_loss_mlp": 1.02075815, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.7320519086865258, + "language_loss": 0.74367172, + "learning_rate": 3.118436031952143e-06, + "loss": 0.7649042, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 4.165754556655884 + }, + { + "auxiliary_loss_clip": 0.01012407, + "auxiliary_loss_mlp": 0.01010567, + "balance_loss_clip": 1.0080986, + "balance_loss_mlp": 1.00920188, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6143469585781736, + "language_loss": 0.54373968, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56396943, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.307448148727417 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.03808331, + "balance_loss_mlp": 1.02044046, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 11.571334021272726, + "language_loss": 0.78207028, + "learning_rate": 3.117790203606336e-06, + "loss": 0.80344653, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.599358320236206 + }, + { + "auxiliary_loss_clip": 0.01084322, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.03599453, + "balance_loss_mlp": 1.01817369, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 1.8615066006089767, + "language_loss": 0.76481116, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.7859689, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 2.715487480163574 + }, + { + "auxiliary_loss_clip": 0.01100803, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.03539491, + "balance_loss_mlp": 1.0235796, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 2.1846363744026096, + "language_loss": 0.70295221, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72433871, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 2.6488969326019287 + }, + { + "auxiliary_loss_clip": 0.01087188, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.03560424, + "balance_loss_mlp": 1.01765704, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7154602897324929, + "language_loss": 0.73560423, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.7567876, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 2.6772444248199463 + }, + { + "auxiliary_loss_clip": 0.01078175, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03114915, + "balance_loss_mlp": 1.01607084, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.8823805018210518, + "language_loss": 0.81469387, + "learning_rate": 3.116498038372114e-06, + "loss": 0.83577824, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.578943967819214 + }, + { + "auxiliary_loss_clip": 0.01071637, + "auxiliary_loss_mlp": 0.00749654, + "balance_loss_clip": 1.03457558, + "balance_loss_mlp": 1.00026333, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6308782388818768, + "language_loss": 0.82954919, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84776211, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 2.7245287895202637 + }, + { + "auxiliary_loss_clip": 0.01032917, + "auxiliary_loss_mlp": 0.01004244, + "balance_loss_clip": 1.00851452, + "balance_loss_mlp": 1.00274777, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7609591661909578, + "language_loss": 0.52558887, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54596043, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.132889986038208 + }, + { + "auxiliary_loss_clip": 0.010713, + "auxiliary_loss_mlp": 0.00749808, + "balance_loss_clip": 1.03765857, + "balance_loss_mlp": 1.00030708, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.0399907554092365, + "language_loss": 0.78076196, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79897302, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 2.6754958629608154 + }, + { + "auxiliary_loss_clip": 0.01072277, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.04102719, + "balance_loss_mlp": 1.02871776, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 1.7170560528592942, + "language_loss": 0.72014558, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.7413007, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.7879459857940674 + }, + { + "auxiliary_loss_clip": 0.01088199, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.03586316, + "balance_loss_mlp": 1.01521921, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 2.3858299830134255, + "language_loss": 0.8311618, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.85232651, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 2.614203691482544 + }, + { + "auxiliary_loss_clip": 0.01085408, + "auxiliary_loss_mlp": 0.00749885, + "balance_loss_clip": 1.03584373, + "balance_loss_mlp": 1.00025082, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.7847447797072162, + "language_loss": 0.69782001, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71617293, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.653123378753662 + }, + { + "auxiliary_loss_clip": 0.01098719, + "auxiliary_loss_mlp": 0.01038967, + "balance_loss_clip": 1.03618121, + "balance_loss_mlp": 1.02443528, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.780789932936989, + "language_loss": 0.76969177, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.79106867, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 2.6292760372161865 + }, + { + "auxiliary_loss_clip": 0.01092684, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.02148342, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 4.575235725133065, + "language_loss": 0.7335186, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75480139, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 2.656937599182129 + }, + { + "auxiliary_loss_clip": 0.01089888, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.03889239, + "balance_loss_mlp": 1.01636291, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 1.9038582919653642, + "language_loss": 0.65755141, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67874008, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 2.695664405822754 + }, + { + "auxiliary_loss_clip": 0.0105673, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.0343349, + "balance_loss_mlp": 1.02104831, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 2.264816448512641, + "language_loss": 0.70885956, + "learning_rate": 3.113264663362451e-06, + "loss": 0.72977591, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 2.7244961261749268 + }, + { + "auxiliary_loss_clip": 0.01060228, + "auxiliary_loss_mlp": 0.01032998, + "balance_loss_clip": 1.03443313, + "balance_loss_mlp": 1.01937199, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.6226449243582113, + "language_loss": 0.67014265, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69107485, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.67816162109375 + }, + { + "auxiliary_loss_clip": 0.01102993, + "auxiliary_loss_mlp": 0.00749759, + "balance_loss_clip": 1.03731835, + "balance_loss_mlp": 1.00031745, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.217853629531698, + "language_loss": 0.7326507, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75117821, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 2.6251485347747803 + }, + { + "auxiliary_loss_clip": 0.01099833, + "auxiliary_loss_mlp": 0.01037161, + "balance_loss_clip": 1.03655076, + "balance_loss_mlp": 1.02391124, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.657004622772784, + "language_loss": 0.81757569, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83894563, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.67262864112854 + }, + { + "auxiliary_loss_clip": 0.01106122, + "auxiliary_loss_mlp": 0.01039677, + "balance_loss_clip": 1.04006183, + "balance_loss_mlp": 1.02589011, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 2.1538902518280048, + "language_loss": 0.71456897, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73602694, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 2.7026479244232178 + }, + { + "auxiliary_loss_clip": 0.01098136, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.03615069, + "balance_loss_mlp": 1.02149546, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 2.8571517341442405, + "language_loss": 0.74397117, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76529992, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.655973434448242 + }, + { + "auxiliary_loss_clip": 0.01117525, + "auxiliary_loss_mlp": 0.01043334, + "balance_loss_clip": 1.03828156, + "balance_loss_mlp": 1.02901733, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.8989045408307121, + "language_loss": 0.71064448, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73225313, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 2.5770764350891113 + }, + { + "auxiliary_loss_clip": 0.01097215, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.0335753, + "balance_loss_mlp": 1.02152407, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 1.6832858061073779, + "language_loss": 0.6075933, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62891167, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 2.7658557891845703 + }, + { + "auxiliary_loss_clip": 0.01092804, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.03648877, + "balance_loss_mlp": 1.02556419, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.9437587992094432, + "language_loss": 0.68209916, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70342636, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.69586443901062 + }, + { + "auxiliary_loss_clip": 0.01099114, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.03562033, + "balance_loss_mlp": 1.0214541, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6812187306516106, + "language_loss": 0.75110334, + "learning_rate": 3.110351016113414e-06, + "loss": 0.772439, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.579533815383911 + }, + { + "auxiliary_loss_clip": 0.01030787, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.03319573, + "balance_loss_mlp": 1.02545416, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 2.4244770266182885, + "language_loss": 0.75367773, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77439475, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 2.9096877574920654 + }, + { + "auxiliary_loss_clip": 0.01108585, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.03607619, + "balance_loss_mlp": 1.01832271, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 6.237214895881504, + "language_loss": 0.70829833, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.72969532, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 2.8880248069763184 + }, + { + "auxiliary_loss_clip": 0.01070541, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.03711748, + "balance_loss_mlp": 1.02013087, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.600241563991568, + "language_loss": 0.69348598, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71452278, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 2.6862030029296875 + }, + { + "auxiliary_loss_clip": 0.0107632, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.03372943, + "balance_loss_mlp": 1.02132034, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.7573770522251417, + "language_loss": 0.65062332, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.67173672, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 2.6613261699676514 + }, + { + "auxiliary_loss_clip": 0.01087858, + "auxiliary_loss_mlp": 0.01027952, + "balance_loss_clip": 1.03718281, + "balance_loss_mlp": 1.01600122, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.613469288636742, + "language_loss": 0.85405499, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87521303, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.6600050926208496 + }, + { + "auxiliary_loss_clip": 0.01102574, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.03620207, + "balance_loss_mlp": 1.01564741, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 1.977802746314491, + "language_loss": 0.74773461, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76905829, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 4.346672534942627 + }, + { + "auxiliary_loss_clip": 0.01103669, + "auxiliary_loss_mlp": 0.01031009, + "balance_loss_clip": 1.03698897, + "balance_loss_mlp": 1.01650095, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.932912111084524, + "language_loss": 0.68281704, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70416385, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.7569079399108887 + }, + { + "auxiliary_loss_clip": 0.01066027, + "auxiliary_loss_mlp": 0.01044697, + "balance_loss_clip": 1.03397882, + "balance_loss_mlp": 1.03026676, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 2.4575204357325995, + "language_loss": 0.60851789, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62962514, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.7152016162872314 + }, + { + "auxiliary_loss_clip": 0.010659, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.03369498, + "balance_loss_mlp": 1.02055049, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.6771496416406457, + "language_loss": 0.71278369, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.73379123, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.6679112911224365 + }, + { + "auxiliary_loss_clip": 0.01071842, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.03168714, + "balance_loss_mlp": 1.01928496, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.607890409401421, + "language_loss": 0.82744461, + "learning_rate": 3.107109630732192e-06, + "loss": 0.84848958, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.6997172832489014 + }, + { + "auxiliary_loss_clip": 0.01091021, + "auxiliary_loss_mlp": 0.00749874, + "balance_loss_clip": 1.03739631, + "balance_loss_mlp": 1.00034118, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 1.8716407775846553, + "language_loss": 0.80896705, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.82737607, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.652402400970459 + }, + { + "auxiliary_loss_clip": 0.01100656, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.03712487, + "balance_loss_mlp": 1.02149987, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.5751593765651932, + "language_loss": 0.81429106, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83564764, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 4.23065447807312 + }, + { + "auxiliary_loss_clip": 0.01093924, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.03362513, + "balance_loss_mlp": 1.02148986, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.8823595281676246, + "language_loss": 0.73985958, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76114464, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 2.688793182373047 + }, + { + "auxiliary_loss_clip": 0.01100897, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.03848231, + "balance_loss_mlp": 1.01961458, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.556700967607976, + "language_loss": 0.82296789, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84430432, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.6703827381134033 + }, + { + "auxiliary_loss_clip": 0.01089758, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.03575397, + "balance_loss_mlp": 1.02154446, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.5640224672826266, + "language_loss": 0.80305433, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82430363, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 4.210628271102905 + }, + { + "auxiliary_loss_clip": 0.01080008, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.03426623, + "balance_loss_mlp": 1.01501393, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.6554471969232056, + "language_loss": 0.81329638, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83438104, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 2.698728561401367 + }, + { + "auxiliary_loss_clip": 0.0107073, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.03279567, + "balance_loss_mlp": 1.02234185, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 1.8643090538135019, + "language_loss": 0.7175563, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.73861933, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 2.647244453430176 + }, + { + "auxiliary_loss_clip": 0.01092027, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.0361129, + "balance_loss_mlp": 1.02386057, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 1.6292803149219537, + "language_loss": 0.75011528, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77141774, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.6689257621765137 + }, + { + "auxiliary_loss_clip": 0.01091442, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.03992677, + "balance_loss_mlp": 1.01905739, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 1.7441992980906873, + "language_loss": 0.6970315, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.7182703, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 2.607454538345337 + }, + { + "auxiliary_loss_clip": 0.01097651, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.03523993, + "balance_loss_mlp": 1.0188818, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.7465952534585938, + "language_loss": 0.65147072, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.672759, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 2.6204488277435303 + }, + { + "auxiliary_loss_clip": 0.01056134, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.03527498, + "balance_loss_mlp": 1.02328658, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 1.3838372523711011, + "language_loss": 0.741202, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76214218, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 3.019350290298462 + }, + { + "auxiliary_loss_clip": 0.01011699, + "auxiliary_loss_mlp": 0.01011988, + "balance_loss_clip": 1.0156827, + "balance_loss_mlp": 1.01065242, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.783210165514663, + "language_loss": 0.55488849, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57512534, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 3.1656837463378906 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.03956938, + "balance_loss_mlp": 1.01721334, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7162914166632237, + "language_loss": 0.65088093, + "learning_rate": 3.102889555312721e-06, + "loss": 0.67229736, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 2.6837077140808105 + }, + { + "auxiliary_loss_clip": 0.01092014, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.03850508, + "balance_loss_mlp": 1.02078152, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.8745106132666294, + "language_loss": 0.77287984, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79414326, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.6683859825134277 + }, + { + "auxiliary_loss_clip": 0.01089348, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.03709483, + "balance_loss_mlp": 1.01749814, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.7616736820547159, + "language_loss": 0.76276469, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78397262, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 2.6563913822174072 + }, + { + "auxiliary_loss_clip": 0.01076156, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.03769684, + "balance_loss_mlp": 1.02244639, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.296343925222523, + "language_loss": 0.70846021, + "learning_rate": 3.101914687048842e-06, + "loss": 0.72958672, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 2.715832471847534 + }, + { + "auxiliary_loss_clip": 0.01075269, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.03557301, + "balance_loss_mlp": 1.01675487, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.9440615841388191, + "language_loss": 0.90262705, + "learning_rate": 3.10158964737502e-06, + "loss": 0.92369789, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 2.7202718257904053 + }, + { + "auxiliary_loss_clip": 0.01074194, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.03393161, + "balance_loss_mlp": 1.0177449, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.820035639256729, + "language_loss": 0.79862082, + "learning_rate": 3.101264565928808e-06, + "loss": 0.81967711, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 2.7337613105773926 + }, + { + "auxiliary_loss_clip": 0.01038563, + "auxiliary_loss_mlp": 0.00747381, + "balance_loss_clip": 1.0140233, + "balance_loss_mlp": 1.00016618, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.8951508253807736, + "language_loss": 0.5602479, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.57810736, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 3.1048736572265625 + }, + { + "auxiliary_loss_clip": 0.01114569, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.04105496, + "balance_loss_mlp": 1.02377033, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 1.9095243074864152, + "language_loss": 0.77951372, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80103076, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 2.6170403957366943 + }, + { + "auxiliary_loss_clip": 0.01082557, + "auxiliary_loss_mlp": 0.01040927, + "balance_loss_clip": 1.03847659, + "balance_loss_mlp": 1.02643108, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.236126136939607, + "language_loss": 0.72527003, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.74650484, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 2.7488300800323486 + }, + { + "auxiliary_loss_clip": 0.01098341, + "auxiliary_loss_mlp": 0.01028148, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.01523781, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.6960609390549595, + "language_loss": 0.87675816, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.89802301, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.6171135902404785 + }, + { + "auxiliary_loss_clip": 0.010948, + "auxiliary_loss_mlp": 0.01038049, + "balance_loss_clip": 1.03782129, + "balance_loss_mlp": 1.02268279, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.420086383101803, + "language_loss": 0.82727641, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84860486, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 2.6226301193237305 + }, + { + "auxiliary_loss_clip": 0.01100491, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.03611386, + "balance_loss_mlp": 1.02052224, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.1298600816019624, + "language_loss": 0.73524982, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75660276, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 2.608238935470581 + }, + { + "auxiliary_loss_clip": 0.01081175, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.04254913, + "balance_loss_mlp": 1.01827788, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.6738713857243943, + "language_loss": 0.81355399, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.83469474, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.6057841777801514 + }, + { + "auxiliary_loss_clip": 0.01049346, + "auxiliary_loss_mlp": 0.0074988, + "balance_loss_clip": 1.03584373, + "balance_loss_mlp": 1.00021625, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.6419720545006116, + "language_loss": 0.71366608, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.7316584, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.650665760040283 + }, + { + "auxiliary_loss_clip": 0.01061212, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.03687191, + "balance_loss_mlp": 1.02559447, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 1.872613571689303, + "language_loss": 0.81379569, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83481216, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 2.6731503009796143 + }, + { + "auxiliary_loss_clip": 0.01090787, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.0381043, + "balance_loss_mlp": 1.01429522, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.7404592276209374, + "language_loss": 0.77757591, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.79876447, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.6588778495788574 + }, + { + "auxiliary_loss_clip": 0.01081676, + "auxiliary_loss_mlp": 0.01043477, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.0271337, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 2.265966836422003, + "language_loss": 0.74678063, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76803219, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.604412078857422 + }, + { + "auxiliary_loss_clip": 0.01087848, + "auxiliary_loss_mlp": 0.01038259, + "balance_loss_clip": 1.03558922, + "balance_loss_mlp": 1.02365601, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.5314197817676793, + "language_loss": 0.82470036, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84596139, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 2.58933424949646 + }, + { + "auxiliary_loss_clip": 0.01090883, + "auxiliary_loss_mlp": 0.01039435, + "balance_loss_clip": 1.036623, + "balance_loss_mlp": 1.02560055, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.8675541173812165, + "language_loss": 0.76967573, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79097891, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.746462345123291 + }, + { + "auxiliary_loss_clip": 0.0109475, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.03838241, + "balance_loss_mlp": 1.02328396, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.5157467389944046, + "language_loss": 0.76324308, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78456783, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 2.605966091156006 + }, + { + "auxiliary_loss_clip": 0.01097935, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.03438258, + "balance_loss_mlp": 1.01841187, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.4938157545648019, + "language_loss": 0.77568495, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79699433, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.5781941413879395 + }, + { + "auxiliary_loss_clip": 0.01076883, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_clip": 1.03867078, + "balance_loss_mlp": 1.03161299, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.6291650548010406, + "language_loss": 0.80101341, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.8222779, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.6484971046447754 + }, + { + "auxiliary_loss_clip": 0.01110338, + "auxiliary_loss_mlp": 0.01035195, + "balance_loss_clip": 1.03831816, + "balance_loss_mlp": 1.02245784, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.8753091068675432, + "language_loss": 0.67495513, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69641042, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.529350519180298 + }, + { + "auxiliary_loss_clip": 0.01089626, + "auxiliary_loss_mlp": 0.00750009, + "balance_loss_clip": 1.0360291, + "balance_loss_mlp": 1.00018644, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 2.2342964791081332, + "language_loss": 0.69880801, + "learning_rate": 3.095405970878919e-06, + "loss": 0.71720439, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 4.030624151229858 + }, + { + "auxiliary_loss_clip": 0.01087113, + "auxiliary_loss_mlp": 0.01040889, + "balance_loss_clip": 1.03405941, + "balance_loss_mlp": 1.02535653, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 1.7269255413324136, + "language_loss": 0.67316544, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69444549, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.5435638427734375 + }, + { + "auxiliary_loss_clip": 0.01085961, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.04133546, + "balance_loss_mlp": 1.02424741, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 2.2211616987409606, + "language_loss": 0.73276138, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75401306, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.6190648078918457 + }, + { + "auxiliary_loss_clip": 0.01111664, + "auxiliary_loss_mlp": 0.01037335, + "balance_loss_clip": 1.03746963, + "balance_loss_mlp": 1.02345932, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.95615075865173, + "language_loss": 0.69887924, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72036922, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.5286526679992676 + }, + { + "auxiliary_loss_clip": 0.0109158, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.03875697, + "balance_loss_mlp": 1.02067971, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.4885355758613343, + "language_loss": 0.7630502, + "learning_rate": 3.094102230664423e-06, + "loss": 0.7843082, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.6041784286499023 + }, + { + "auxiliary_loss_clip": 0.01076789, + "auxiliary_loss_mlp": 0.00750193, + "balance_loss_clip": 1.03195548, + "balance_loss_mlp": 1.00018227, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.1716162675598003, + "language_loss": 0.72156948, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73983932, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.593036413192749 + }, + { + "auxiliary_loss_clip": 0.01057228, + "auxiliary_loss_mlp": 0.00750526, + "balance_loss_clip": 1.03279901, + "balance_loss_mlp": 1.00026035, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.9246607944799858, + "language_loss": 0.80125481, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.81933236, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 4.254218101501465 + }, + { + "auxiliary_loss_clip": 0.01091782, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.03779376, + "balance_loss_mlp": 1.02088618, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.765172103998099, + "language_loss": 0.81487936, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83613658, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.5812807083129883 + }, + { + "auxiliary_loss_clip": 0.01091651, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.03725648, + "balance_loss_mlp": 1.02575576, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.6121033101169522, + "language_loss": 0.75711524, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.77841902, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 2.6001622676849365 + }, + { + "auxiliary_loss_clip": 0.01099622, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.03637195, + "balance_loss_mlp": 1.02037549, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.8504004969060568, + "language_loss": 0.78679293, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.80812746, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 5.768392324447632 + }, + { + "auxiliary_loss_clip": 0.01118929, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.03878582, + "balance_loss_mlp": 1.02253449, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4022023106543953, + "language_loss": 0.6446346, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66620123, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.776437759399414 + }, + { + "auxiliary_loss_clip": 0.01080354, + "auxiliary_loss_mlp": 0.01043925, + "balance_loss_clip": 1.03637314, + "balance_loss_mlp": 1.02622235, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 3.7525139661122986, + "language_loss": 0.82336235, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84460515, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 2.5577332973480225 + }, + { + "auxiliary_loss_clip": 0.0110848, + "auxiliary_loss_mlp": 0.01044597, + "balance_loss_clip": 1.03861606, + "balance_loss_mlp": 1.02815819, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 2.334276450065787, + "language_loss": 0.82621455, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.8477453, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.5084054470062256 + }, + { + "auxiliary_loss_clip": 0.01104829, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.04209602, + "balance_loss_mlp": 1.01908898, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 2.0400595607454792, + "language_loss": 0.83407056, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85544389, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 2.563786745071411 + }, + { + "auxiliary_loss_clip": 0.01116727, + "auxiliary_loss_mlp": 0.01045711, + "balance_loss_clip": 1.03935254, + "balance_loss_mlp": 1.03140628, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.8261379977600607, + "language_loss": 0.69646347, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.71808785, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 2.521629571914673 + }, + { + "auxiliary_loss_clip": 0.01095718, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.0393815, + "balance_loss_mlp": 1.02224028, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.3729386970131332, + "language_loss": 0.83100492, + "learning_rate": 3.090513524656898e-06, + "loss": 0.8523379, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.6362497806549072 + }, + { + "auxiliary_loss_clip": 0.01075701, + "auxiliary_loss_mlp": 0.0104043, + "balance_loss_clip": 1.0354085, + "balance_loss_mlp": 1.02576733, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 1.4157697465017955, + "language_loss": 0.73653543, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75769675, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 2.666351795196533 + }, + { + "auxiliary_loss_clip": 0.01097418, + "auxiliary_loss_mlp": 0.0103928, + "balance_loss_clip": 1.03781366, + "balance_loss_mlp": 1.02446222, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.2233752701023493, + "language_loss": 0.83419454, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85556149, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.5593509674072266 + }, + { + "auxiliary_loss_clip": 0.01087266, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.03412819, + "balance_loss_mlp": 1.02380681, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.5664581303225804, + "language_loss": 0.67806315, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69931275, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 2.572279691696167 + }, + { + "auxiliary_loss_clip": 0.01098006, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_clip": 1.03626001, + "balance_loss_mlp": 1.02700043, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 1.6830609203897284, + "language_loss": 0.7050193, + "learning_rate": 3.089207299216464e-06, + "loss": 0.72643906, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 2.5908894538879395 + }, + { + "auxiliary_loss_clip": 0.01031317, + "auxiliary_loss_mlp": 0.01036595, + "balance_loss_clip": 1.03156507, + "balance_loss_mlp": 1.02226591, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 2.1788347991586883, + "language_loss": 0.79269373, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81337285, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 2.701481580734253 + }, + { + "auxiliary_loss_clip": 0.0110569, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.0395416, + "balance_loss_mlp": 1.02167583, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.602973746067799, + "language_loss": 0.82424772, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84567392, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 2.598402976989746 + }, + { + "auxiliary_loss_clip": 0.01102813, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.03831887, + "balance_loss_mlp": 1.02016783, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 2.240628953234444, + "language_loss": 0.81834513, + "learning_rate": 3.088227196412879e-06, + "loss": 0.83972859, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 2.5690643787384033 + }, + { + "auxiliary_loss_clip": 0.01099784, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.04181588, + "balance_loss_mlp": 1.01942801, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.6702332861643838, + "language_loss": 0.79532862, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81668234, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.6338980197906494 + }, + { + "auxiliary_loss_clip": 0.01056037, + "auxiliary_loss_mlp": 0.01034385, + "balance_loss_clip": 1.03114176, + "balance_loss_mlp": 1.02019906, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.373861392439859, + "language_loss": 0.70335782, + "learning_rate": 3.087573588194753e-06, + "loss": 0.724262, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 2.78730845451355 + }, + { + "auxiliary_loss_clip": 0.01097066, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.03883862, + "balance_loss_mlp": 1.01813757, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.7375775267266236, + "language_loss": 0.79466373, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81596226, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.604374885559082 + }, + { + "auxiliary_loss_clip": 0.0108293, + "auxiliary_loss_mlp": 0.01039985, + "balance_loss_clip": 1.03509736, + "balance_loss_mlp": 1.02334344, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.674572876147899, + "language_loss": 0.90993351, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93116271, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 2.6541428565979004 + }, + { + "auxiliary_loss_clip": 0.01101089, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.03681278, + "balance_loss_mlp": 1.022542, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6375693264164732, + "language_loss": 0.80752999, + "learning_rate": 3.086592866591809e-06, + "loss": 0.82890737, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 2.5897409915924072 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.00750256, + "balance_loss_clip": 1.04167068, + "balance_loss_mlp": 1.0002569, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 1.6929431626475908, + "language_loss": 0.83851159, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.85715234, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 2.635451078414917 + }, + { + "auxiliary_loss_clip": 0.01041238, + "auxiliary_loss_mlp": 0.01042788, + "balance_loss_clip": 1.03370953, + "balance_loss_mlp": 1.02683747, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.4986892935511718, + "language_loss": 0.79974973, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.8205899, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.863144874572754 + }, + { + "auxiliary_loss_clip": 0.01073336, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.03714728, + "balance_loss_mlp": 1.02252126, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.6142437089436865, + "language_loss": 0.70926344, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73036849, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 2.6994011402130127 + }, + { + "auxiliary_loss_clip": 0.01093055, + "auxiliary_loss_mlp": 0.01042301, + "balance_loss_clip": 1.03541863, + "balance_loss_mlp": 1.0282048, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 3.6923729926679614, + "language_loss": 0.70115143, + "learning_rate": 3.085284660993821e-06, + "loss": 0.72250497, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.5817525386810303 + }, + { + "auxiliary_loss_clip": 0.01116305, + "auxiliary_loss_mlp": 0.01037604, + "balance_loss_clip": 1.04041028, + "balance_loss_mlp": 1.02335203, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 1.6351368779497941, + "language_loss": 0.67851615, + "learning_rate": 3.084957506678058e-06, + "loss": 0.7000553, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.615821361541748 + }, + { + "auxiliary_loss_clip": 0.01089005, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.03869092, + "balance_loss_mlp": 1.02607179, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.638778940900647, + "language_loss": 0.83051562, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.85180116, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 2.6066880226135254 + }, + { + "auxiliary_loss_clip": 0.01078458, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.03450406, + "balance_loss_mlp": 1.02025604, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4311434891164219, + "language_loss": 0.73608202, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75720954, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 2.708376407623291 + }, + { + "auxiliary_loss_clip": 0.01022051, + "auxiliary_loss_mlp": 0.0100105, + "balance_loss_clip": 1.01632309, + "balance_loss_mlp": 0.99954814, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7768551702528133, + "language_loss": 0.54934764, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56957865, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.3988037109375 + }, + { + "auxiliary_loss_clip": 0.01072551, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.0369668, + "balance_loss_mlp": 1.03550828, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 2.2494923844878802, + "language_loss": 0.73187399, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75312817, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.650651693344116 + }, + { + "auxiliary_loss_clip": 0.01106806, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_clip": 1.03881037, + "balance_loss_mlp": 1.02709568, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 1.8172263030159166, + "language_loss": 0.70638525, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72787613, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.5993666648864746 + }, + { + "auxiliary_loss_clip": 0.01086884, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.03519583, + "balance_loss_mlp": 1.02227283, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.6250047246864623, + "language_loss": 0.81042123, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83165687, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.607663154602051 + }, + { + "auxiliary_loss_clip": 0.01109871, + "auxiliary_loss_mlp": 0.00750101, + "balance_loss_clip": 1.04117048, + "balance_loss_mlp": 1.00024009, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.0139629979900393, + "language_loss": 0.80520254, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82380223, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 2.5856270790100098 + }, + { + "auxiliary_loss_clip": 0.01059531, + "auxiliary_loss_mlp": 0.01046659, + "balance_loss_clip": 1.03407574, + "balance_loss_mlp": 1.02985024, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.8151377125174757, + "language_loss": 0.77204514, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79310703, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 4.1533496379852295 + }, + { + "auxiliary_loss_clip": 0.01096636, + "auxiliary_loss_mlp": 0.01040457, + "balance_loss_clip": 1.03737628, + "balance_loss_mlp": 1.02394664, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.911026247190456, + "language_loss": 0.85080343, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87217438, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.6859610080718994 + }, + { + "auxiliary_loss_clip": 0.01064223, + "auxiliary_loss_mlp": 0.01052407, + "balance_loss_clip": 1.03547657, + "balance_loss_mlp": 1.03682637, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 3.6895465487230235, + "language_loss": 0.72136831, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.74253458, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.7052996158599854 + }, + { + "auxiliary_loss_clip": 0.01014515, + "auxiliary_loss_mlp": 0.01008342, + "balance_loss_clip": 1.00917053, + "balance_loss_mlp": 1.00675702, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8538237518275568, + "language_loss": 0.5614866, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58171523, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.191741704940796 + }, + { + "auxiliary_loss_clip": 0.01101736, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.03892374, + "balance_loss_mlp": 1.01749027, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.8801760292635608, + "language_loss": 0.80188274, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.823219, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.606593132019043 + }, + { + "auxiliary_loss_clip": 0.01075161, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.03367805, + "balance_loss_mlp": 1.01942468, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.1075785374316434, + "language_loss": 0.58973932, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61083543, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 2.640759229660034 + }, + { + "auxiliary_loss_clip": 0.01083494, + "auxiliary_loss_mlp": 0.01034748, + "balance_loss_clip": 1.03734565, + "balance_loss_mlp": 1.02081239, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.8023149263517204, + "language_loss": 0.9279415, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94912386, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 4.13959527015686 + }, + { + "auxiliary_loss_clip": 0.01065142, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.03660524, + "balance_loss_mlp": 1.01828575, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.736106441865153, + "language_loss": 0.75003624, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.77101409, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.608668804168701 + }, + { + "auxiliary_loss_clip": 0.0110033, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.03688943, + "balance_loss_mlp": 1.02135456, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.8421852343373897, + "language_loss": 0.83453768, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85590059, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 4.171520233154297 + }, + { + "auxiliary_loss_clip": 0.01063633, + "auxiliary_loss_mlp": 0.01044953, + "balance_loss_clip": 1.03474212, + "balance_loss_mlp": 1.02809668, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.5538999314725659, + "language_loss": 0.69638729, + "learning_rate": 3.079389598759495e-06, + "loss": 0.71747315, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 4.092868328094482 + }, + { + "auxiliary_loss_clip": 0.01081074, + "auxiliary_loss_mlp": 0.01048049, + "balance_loss_clip": 1.03572357, + "balance_loss_mlp": 1.03290939, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 4.09113670142663, + "language_loss": 0.81115866, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83244991, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.638641119003296 + }, + { + "auxiliary_loss_clip": 0.0111731, + "auxiliary_loss_mlp": 0.01040599, + "balance_loss_clip": 1.03856719, + "balance_loss_mlp": 1.02524471, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.1645906172648495, + "language_loss": 0.68125403, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70283306, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.5025720596313477 + }, + { + "auxiliary_loss_clip": 0.01088725, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.03529656, + "balance_loss_mlp": 1.01936555, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 2.293463438577167, + "language_loss": 0.6990152, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72024071, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 2.554490804672241 + }, + { + "auxiliary_loss_clip": 0.01117667, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.03996754, + "balance_loss_mlp": 1.02538824, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.7923043499000502, + "language_loss": 0.87261182, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89418411, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 2.560666084289551 + }, + { + "auxiliary_loss_clip": 0.01095939, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.03638172, + "balance_loss_mlp": 1.01637781, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.793075685766496, + "language_loss": 0.84050429, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86175168, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.521118402481079 + }, + { + "auxiliary_loss_clip": 0.01082335, + "auxiliary_loss_mlp": 0.01040995, + "balance_loss_clip": 1.03422248, + "balance_loss_mlp": 1.02711368, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.517546327581479, + "language_loss": 0.76803505, + "learning_rate": 3.077421627435922e-06, + "loss": 0.78926837, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 2.60373592376709 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.01043726, + "balance_loss_clip": 1.03722787, + "balance_loss_mlp": 1.02948081, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 6.384881532163198, + "language_loss": 0.63465226, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65609682, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 2.6161653995513916 + }, + { + "auxiliary_loss_clip": 0.01099227, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.03615022, + "balance_loss_mlp": 1.02014232, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.2352011566587198, + "language_loss": 0.76471198, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78603864, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 2.6218719482421875 + }, + { + "auxiliary_loss_clip": 0.01104628, + "auxiliary_loss_mlp": 0.01041234, + "balance_loss_clip": 1.03925872, + "balance_loss_mlp": 1.02682137, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 2.399972701011342, + "language_loss": 0.79318571, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81464434, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.574563503265381 + }, + { + "auxiliary_loss_clip": 0.01090316, + "auxiliary_loss_mlp": 0.0074976, + "balance_loss_clip": 1.03974009, + "balance_loss_mlp": 1.00014925, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 1.8368136751715898, + "language_loss": 0.77233601, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79073679, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.742189884185791 + }, + { + "auxiliary_loss_clip": 0.00966156, + "auxiliary_loss_mlp": 0.01005738, + "balance_loss_clip": 1.01150739, + "balance_loss_mlp": 1.00431335, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7813659260729013, + "language_loss": 0.56351507, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58323395, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 3.510594606399536 + }, + { + "auxiliary_loss_clip": 0.01081574, + "auxiliary_loss_mlp": 0.00750043, + "balance_loss_clip": 1.03304982, + "balance_loss_mlp": 1.00026798, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 2.156384966365808, + "language_loss": 0.85208833, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87040454, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 3.0999579429626465 + }, + { + "auxiliary_loss_clip": 0.01100743, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.036098, + "balance_loss_mlp": 1.01411164, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.7766496497567634, + "language_loss": 0.7095964, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73088264, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 2.6967527866363525 + }, + { + "auxiliary_loss_clip": 0.01060238, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.03268433, + "balance_loss_mlp": 1.0209024, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 2.3009889164866295, + "language_loss": 0.80617958, + "learning_rate": 3.074795378203616e-06, + "loss": 0.8271296, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 2.6660828590393066 + }, + { + "auxiliary_loss_clip": 0.01116693, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.03937912, + "balance_loss_mlp": 1.02426577, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.7859634886388533, + "language_loss": 0.76849771, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79005039, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 2.5982792377471924 + }, + { + "auxiliary_loss_clip": 0.01091693, + "auxiliary_loss_mlp": 0.01041831, + "balance_loss_clip": 1.03309453, + "balance_loss_mlp": 1.02716231, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 2.6161008906458947, + "language_loss": 0.85598588, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87732112, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 2.5060079097747803 + }, + { + "auxiliary_loss_clip": 0.01097614, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.03490233, + "balance_loss_mlp": 1.02118027, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 2.3177654394723026, + "language_loss": 0.65350068, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67482114, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 2.593693256378174 + }, + { + "auxiliary_loss_clip": 0.01102499, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.03803933, + "balance_loss_mlp": 1.02525449, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4076631524946457, + "language_loss": 0.76247507, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78388542, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.6020753383636475 + }, + { + "auxiliary_loss_clip": 0.01076595, + "auxiliary_loss_mlp": 0.01032951, + "balance_loss_clip": 1.03365469, + "balance_loss_mlp": 1.01837802, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 3.1840111917110683, + "language_loss": 0.83257145, + "learning_rate": 3.073152647447525e-06, + "loss": 0.8536669, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 2.656925678253174 + }, + { + "auxiliary_loss_clip": 0.01087281, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.03653979, + "balance_loss_mlp": 1.02556944, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.8003323070955803, + "language_loss": 0.85543156, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87668437, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.6481363773345947 + }, + { + "auxiliary_loss_clip": 0.01024637, + "auxiliary_loss_mlp": 0.01023472, + "balance_loss_clip": 1.0098331, + "balance_loss_mlp": 1.02219677, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8220407267503845, + "language_loss": 0.60036355, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62084466, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 3.0904970169067383 + }, + { + "auxiliary_loss_clip": 0.01108855, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0376699, + "balance_loss_mlp": 1.02021348, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.9179681257446648, + "language_loss": 0.67795861, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.69937855, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 2.6204774379730225 + }, + { + "auxiliary_loss_clip": 0.01115137, + "auxiliary_loss_mlp": 0.0104109, + "balance_loss_clip": 1.04077101, + "balance_loss_mlp": 1.02699947, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.9903975136242944, + "language_loss": 0.67162085, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69318318, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 2.628592014312744 + }, + { + "auxiliary_loss_clip": 0.01093777, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.03996348, + "balance_loss_mlp": 1.02293956, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.8859228105586328, + "language_loss": 0.78980422, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81110495, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.6094653606414795 + }, + { + "auxiliary_loss_clip": 0.01080744, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.03892732, + "balance_loss_mlp": 1.02215683, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 2.222864147665553, + "language_loss": 0.73085749, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75203413, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 2.675128936767578 + }, + { + "auxiliary_loss_clip": 0.01066619, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.03306961, + "balance_loss_mlp": 1.02095008, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 1.9057399568370768, + "language_loss": 0.86093187, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88192916, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 2.7701916694641113 + }, + { + "auxiliary_loss_clip": 0.01114838, + "auxiliary_loss_mlp": 0.01032946, + "balance_loss_clip": 1.03951335, + "balance_loss_mlp": 1.0200417, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.7914228950038629, + "language_loss": 0.69124156, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71271932, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.5194718837738037 + }, + { + "auxiliary_loss_clip": 0.01112877, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.03766966, + "balance_loss_mlp": 1.02013445, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.4219831090764075, + "language_loss": 0.72838432, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.74985468, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.4877970218658447 + }, + { + "auxiliary_loss_clip": 0.01104578, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.03820395, + "balance_loss_mlp": 1.02092874, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.6083826732114401, + "language_loss": 0.73155916, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75295234, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 2.5751211643218994 + }, + { + "auxiliary_loss_clip": 0.01022866, + "auxiliary_loss_mlp": 0.0100116, + "balance_loss_clip": 1.00961578, + "balance_loss_mlp": 0.99983674, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8448651192539228, + "language_loss": 0.63259435, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65283459, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.2411887645721436 + }, + { + "auxiliary_loss_clip": 0.01013472, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.03079855, + "balance_loss_mlp": 1.02469146, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 1.9553655097461058, + "language_loss": 0.71595323, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.73648155, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 2.863062620162964 + }, + { + "auxiliary_loss_clip": 0.01081798, + "auxiliary_loss_mlp": 0.00749631, + "balance_loss_clip": 1.038167, + "balance_loss_mlp": 1.0001868, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.6767101510256441, + "language_loss": 0.80907738, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82739174, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 4.8810014724731445 + }, + { + "auxiliary_loss_clip": 0.01062844, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.03345942, + "balance_loss_mlp": 1.02080178, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.6189069549757356, + "language_loss": 0.77008796, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79106665, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.7107832431793213 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.00749829, + "balance_loss_clip": 1.0401063, + "balance_loss_mlp": 1.0001905, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 2.0416118386956676, + "language_loss": 0.74056613, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.75922334, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 2.5757694244384766 + }, + { + "auxiliary_loss_clip": 0.01095485, + "auxiliary_loss_mlp": 0.01038189, + "balance_loss_clip": 1.03590977, + "balance_loss_mlp": 1.02394319, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.9270008772543183, + "language_loss": 0.7356751, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75701189, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 2.5953354835510254 + }, + { + "auxiliary_loss_clip": 0.01101909, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.03754878, + "balance_loss_mlp": 1.01868081, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 2.343877505184434, + "language_loss": 0.79509288, + "learning_rate": 3.067559762415682e-06, + "loss": 0.81643474, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 4.1774983406066895 + }, + { + "auxiliary_loss_clip": 0.01033006, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00815547, + "balance_loss_mlp": 1.00291848, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7918241286328298, + "language_loss": 0.56065166, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58102596, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.2936623096466064 + }, + { + "auxiliary_loss_clip": 0.01089943, + "auxiliary_loss_mlp": 0.00749542, + "balance_loss_clip": 1.03637648, + "balance_loss_mlp": 1.00011349, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.5787825283852304, + "language_loss": 0.78848845, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.80688328, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 2.650049924850464 + }, + { + "auxiliary_loss_clip": 0.01094196, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.03315234, + "balance_loss_mlp": 1.01648462, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8147434258289128, + "language_loss": 0.8545872, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87583768, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 2.6516623497009277 + }, + { + "auxiliary_loss_clip": 0.01088417, + "auxiliary_loss_mlp": 0.01038591, + "balance_loss_clip": 1.03708887, + "balance_loss_mlp": 1.02404761, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 1.999563006600783, + "language_loss": 0.79084837, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81211841, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 4.323576211929321 + }, + { + "auxiliary_loss_clip": 0.01099319, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.03467345, + "balance_loss_mlp": 1.01693821, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.666228449270346, + "language_loss": 0.74961525, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.77090997, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 4.099848031997681 + }, + { + "auxiliary_loss_clip": 0.01021943, + "auxiliary_loss_mlp": 0.01003245, + "balance_loss_clip": 1.0078882, + "balance_loss_mlp": 1.00178456, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7385296933615482, + "language_loss": 0.59442961, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.6146816, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 3.1644911766052246 + }, + { + "auxiliary_loss_clip": 0.01085628, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.03390181, + "balance_loss_mlp": 1.01574278, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 1.794642634168884, + "language_loss": 0.71923625, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74037665, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.6081700325012207 + }, + { + "auxiliary_loss_clip": 0.01087435, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.03544235, + "balance_loss_mlp": 1.02435327, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.1804246055700265, + "language_loss": 0.71741819, + "learning_rate": 3.064923764577233e-06, + "loss": 0.73866308, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 2.6039764881134033 + }, + { + "auxiliary_loss_clip": 0.01110045, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.0353725, + "balance_loss_mlp": 1.02237272, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 4.986016437913344, + "language_loss": 0.84210533, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86356765, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 2.597156524658203 + }, + { + "auxiliary_loss_clip": 0.01090374, + "auxiliary_loss_mlp": 0.01041921, + "balance_loss_clip": 1.03572142, + "balance_loss_mlp": 1.02726412, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 3.012223947687143, + "language_loss": 0.70865035, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72997332, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 2.6267433166503906 + }, + { + "auxiliary_loss_clip": 0.01109389, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.03705204, + "balance_loss_mlp": 1.01737761, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.2948592329714312, + "language_loss": 0.7503413, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77173674, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.5522119998931885 + }, + { + "auxiliary_loss_clip": 0.01090152, + "auxiliary_loss_mlp": 0.01040152, + "balance_loss_clip": 1.03380132, + "balance_loss_mlp": 1.0266515, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.720400434519952, + "language_loss": 0.70482218, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72612524, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.639183282852173 + }, + { + "auxiliary_loss_clip": 0.01100772, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.03508735, + "balance_loss_mlp": 1.02318716, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 3.39511757948832, + "language_loss": 0.77788472, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79926836, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 2.5796549320220947 + }, + { + "auxiliary_loss_clip": 0.01084674, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.03669894, + "balance_loss_mlp": 1.02100039, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.799039645775078, + "language_loss": 0.87003821, + "learning_rate": 3.062945069803981e-06, + "loss": 0.89123118, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.6370248794555664 + }, + { + "auxiliary_loss_clip": 0.01098052, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.03930545, + "balance_loss_mlp": 1.02037406, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.877878646252651, + "language_loss": 0.79651034, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.81784636, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 2.684635639190674 + }, + { + "auxiliary_loss_clip": 0.01104139, + "auxiliary_loss_mlp": 0.01034439, + "balance_loss_clip": 1.03693867, + "balance_loss_mlp": 1.01963329, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 1.7887167074170984, + "language_loss": 0.73542535, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75681108, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.5265121459960938 + }, + { + "auxiliary_loss_clip": 0.0109196, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.03343034, + "balance_loss_mlp": 1.02438176, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 2.249078215025824, + "language_loss": 0.75163203, + "learning_rate": 3.061955178104237e-06, + "loss": 0.77293831, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.6209964752197266 + }, + { + "auxiliary_loss_clip": 0.01098471, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.03622675, + "balance_loss_mlp": 1.0194639, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.6429052774264028, + "language_loss": 0.68198061, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.7032876, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.5956780910491943 + }, + { + "auxiliary_loss_clip": 0.01100821, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.0355829, + "balance_loss_mlp": 1.02166522, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 1.944204478021242, + "language_loss": 0.72178334, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74315912, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 2.519019365310669 + }, + { + "auxiliary_loss_clip": 0.0106669, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.03303885, + "balance_loss_mlp": 1.02315128, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.6467682255093814, + "language_loss": 0.7491945, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.77022237, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.6368649005889893 + }, + { + "auxiliary_loss_clip": 0.01073106, + "auxiliary_loss_mlp": 0.01038055, + "balance_loss_clip": 1.03639996, + "balance_loss_mlp": 1.0252223, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.933400979255033, + "language_loss": 0.80006814, + "learning_rate": 3.060634758790747e-06, + "loss": 0.82117975, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.5999977588653564 + }, + { + "auxiliary_loss_clip": 0.01054765, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.0316304, + "balance_loss_mlp": 1.02328312, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 2.835358183864482, + "language_loss": 0.73390317, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75482452, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 2.6708855628967285 + }, + { + "auxiliary_loss_clip": 0.01061002, + "auxiliary_loss_mlp": 0.0104717, + "balance_loss_clip": 1.03237689, + "balance_loss_mlp": 1.03167307, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 2.162332888067655, + "language_loss": 0.71339762, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73447931, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 2.7050230503082275 + }, + { + "auxiliary_loss_clip": 0.01083822, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.03532863, + "balance_loss_mlp": 1.01570153, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.8485426319965323, + "language_loss": 0.82147735, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84260583, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 2.625619649887085 + }, + { + "auxiliary_loss_clip": 0.01051309, + "auxiliary_loss_mlp": 0.01052039, + "balance_loss_clip": 1.03342962, + "balance_loss_mlp": 1.03514767, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 2.388612849486696, + "language_loss": 0.69145268, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71248615, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 2.724802255630493 + }, + { + "auxiliary_loss_clip": 0.0108853, + "auxiliary_loss_mlp": 0.01030502, + "balance_loss_clip": 1.03588545, + "balance_loss_mlp": 1.01758599, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.2433599433242826, + "language_loss": 0.72594976, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74714005, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.667820453643799 + }, + { + "auxiliary_loss_clip": 0.01085385, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.03845584, + "balance_loss_mlp": 1.01895523, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.6492032078081658, + "language_loss": 0.81476021, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83593172, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 2.5995218753814697 + }, + { + "auxiliary_loss_clip": 0.01096737, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.03786314, + "balance_loss_mlp": 1.01840734, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 4.005774899219587, + "language_loss": 0.71390343, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73518717, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.6436190605163574 + }, + { + "auxiliary_loss_clip": 0.01009976, + "auxiliary_loss_mlp": 0.0100357, + "balance_loss_clip": 1.00615549, + "balance_loss_mlp": 1.00213909, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.7932924796267754, + "language_loss": 0.57490379, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59503925, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 3.094723701477051 + }, + { + "auxiliary_loss_clip": 0.01097062, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.03568339, + "balance_loss_mlp": 1.01966774, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 14.626269119543908, + "language_loss": 0.7467683, + "learning_rate": 3.057661463723086e-06, + "loss": 0.76808292, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.7017738819122314 + }, + { + "auxiliary_loss_clip": 0.01075746, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.03578353, + "balance_loss_mlp": 1.02105117, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 1.9533171469276727, + "language_loss": 0.7277441, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.74884069, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.7264862060546875 + }, + { + "auxiliary_loss_clip": 0.01070993, + "auxiliary_loss_mlp": 0.0103044, + "balance_loss_clip": 1.03762221, + "balance_loss_mlp": 1.0169034, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 2.1912593823596134, + "language_loss": 0.79479754, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81581187, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.6812524795532227 + }, + { + "auxiliary_loss_clip": 0.01097399, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.04020798, + "balance_loss_mlp": 1.01867437, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 2.127703639228485, + "language_loss": 0.83192855, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85323334, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 2.611091136932373 + }, + { + "auxiliary_loss_clip": 0.01102849, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.03959095, + "balance_loss_mlp": 1.02066648, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.9839260386413253, + "language_loss": 0.75060284, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77196825, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.566420316696167 + }, + { + "auxiliary_loss_clip": 0.01076992, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.03481889, + "balance_loss_mlp": 1.01780641, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.63546101621579, + "language_loss": 0.80710971, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.82819241, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 4.180939674377441 + }, + { + "auxiliary_loss_clip": 0.01091817, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.03796911, + "balance_loss_mlp": 1.0242883, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.889258938771206, + "language_loss": 0.78823048, + "learning_rate": 3.055677461649329e-06, + "loss": 0.809542, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 2.7171902656555176 + }, + { + "auxiliary_loss_clip": 0.01102182, + "auxiliary_loss_mlp": 0.01034788, + "balance_loss_clip": 1.03619528, + "balance_loss_mlp": 1.02006555, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 2.452224039793694, + "language_loss": 0.69928765, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72065735, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 2.555907726287842 + }, + { + "auxiliary_loss_clip": 0.0107493, + "auxiliary_loss_mlp": 0.00749974, + "balance_loss_clip": 1.03443754, + "balance_loss_mlp": 1.00024962, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.8497089157636613, + "language_loss": 0.67305565, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69130462, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.774653911590576 + }, + { + "auxiliary_loss_clip": 0.01001297, + "auxiliary_loss_mlp": 0.01008154, + "balance_loss_clip": 1.00739086, + "balance_loss_mlp": 1.00675893, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8481892757903379, + "language_loss": 0.58149374, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60158825, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.15527606010437 + }, + { + "auxiliary_loss_clip": 0.01114046, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.03898406, + "balance_loss_mlp": 1.02242947, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.654260384448691, + "language_loss": 0.80773377, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82923007, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 2.535797595977783 + }, + { + "auxiliary_loss_clip": 0.01112279, + "auxiliary_loss_mlp": 0.01041061, + "balance_loss_clip": 1.03816545, + "balance_loss_mlp": 1.02734649, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.7960562414954657, + "language_loss": 0.71720123, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7387346, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 4.23236608505249 + }, + { + "auxiliary_loss_clip": 0.01029425, + "auxiliary_loss_mlp": 0.01022612, + "balance_loss_clip": 1.02096879, + "balance_loss_mlp": 1.02094293, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.9109470183471748, + "language_loss": 0.65881604, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67933643, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 3.1518969535827637 + }, + { + "auxiliary_loss_clip": 0.01093397, + "auxiliary_loss_mlp": 0.01038574, + "balance_loss_clip": 1.03591597, + "balance_loss_mlp": 1.02502024, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 1.821934158051351, + "language_loss": 0.74213707, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76345682, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 2.620361566543579 + }, + { + "auxiliary_loss_clip": 0.01061492, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.03661489, + "balance_loss_mlp": 1.02308786, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.955088455398753, + "language_loss": 0.75246239, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77344424, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 4.312153577804565 + }, + { + "auxiliary_loss_clip": 0.01079013, + "auxiliary_loss_mlp": 0.01042749, + "balance_loss_clip": 1.03771353, + "balance_loss_mlp": 1.02812219, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.8151952984501074, + "language_loss": 0.63611829, + "learning_rate": 3.052698757266734e-06, + "loss": 0.65733594, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 4.255802869796753 + }, + { + "auxiliary_loss_clip": 0.01065724, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.0330447, + "balance_loss_mlp": 1.02234769, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.0497733542079013, + "language_loss": 0.73711431, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.7581476, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 2.7496213912963867 + }, + { + "auxiliary_loss_clip": 0.01094534, + "auxiliary_loss_mlp": 0.01044911, + "balance_loss_clip": 1.03553009, + "balance_loss_mlp": 1.02873456, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.703891436233653, + "language_loss": 0.74331665, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76471114, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 2.5392262935638428 + }, + { + "auxiliary_loss_clip": 0.01085583, + "auxiliary_loss_mlp": 0.00750042, + "balance_loss_clip": 1.03592026, + "balance_loss_mlp": 1.00026393, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 2.380352524647911, + "language_loss": 0.80156541, + "learning_rate": 3.051705136821992e-06, + "loss": 0.81992161, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 2.5955371856689453 + }, + { + "auxiliary_loss_clip": 0.01062492, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.03643918, + "balance_loss_mlp": 1.0196197, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.6335097364129032, + "language_loss": 0.81347418, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83442408, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.7287137508392334 + }, + { + "auxiliary_loss_clip": 0.01071344, + "auxiliary_loss_mlp": 0.01049275, + "balance_loss_clip": 1.03292561, + "balance_loss_mlp": 1.0336113, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.823327299012281, + "language_loss": 0.80933809, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83054423, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.6044106483459473 + }, + { + "auxiliary_loss_clip": 0.01082798, + "auxiliary_loss_mlp": 0.01045309, + "balance_loss_clip": 1.03447509, + "balance_loss_mlp": 1.02984178, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.7081438783155987, + "language_loss": 0.69210744, + "learning_rate": 3.05071115745038e-06, + "loss": 0.7133885, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 2.757805585861206 + }, + { + "auxiliary_loss_clip": 0.01106927, + "auxiliary_loss_mlp": 0.01042823, + "balance_loss_clip": 1.04006147, + "balance_loss_mlp": 1.02682495, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.6302872587854977, + "language_loss": 0.68960774, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71110523, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 2.5798072814941406 + }, + { + "auxiliary_loss_clip": 0.01087229, + "auxiliary_loss_mlp": 0.01039875, + "balance_loss_clip": 1.04182053, + "balance_loss_mlp": 1.02680945, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.85423533321815, + "language_loss": 0.73042053, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75169158, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.6581637859344482 + }, + { + "auxiliary_loss_clip": 0.0106917, + "auxiliary_loss_mlp": 0.01041512, + "balance_loss_clip": 1.0339216, + "balance_loss_mlp": 1.0264678, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.9402552939576314, + "language_loss": 0.88267267, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90377951, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.590111017227173 + }, + { + "auxiliary_loss_clip": 0.01059534, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.03236389, + "balance_loss_mlp": 1.03025246, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.3934557303396566, + "language_loss": 0.70396376, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72500294, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 2.6721088886260986 + }, + { + "auxiliary_loss_clip": 0.01099039, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.03669143, + "balance_loss_mlp": 1.01848185, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 2.452587621167111, + "language_loss": 0.74146819, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76278234, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.493971586227417 + }, + { + "auxiliary_loss_clip": 0.01068213, + "auxiliary_loss_mlp": 0.01045808, + "balance_loss_clip": 1.03089833, + "balance_loss_mlp": 1.03056145, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.372838267308666, + "language_loss": 0.79706943, + "learning_rate": 3.048722123283578e-06, + "loss": 0.81820965, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 2.6262171268463135 + }, + { + "auxiliary_loss_clip": 0.01102062, + "auxiliary_loss_mlp": 0.01039446, + "balance_loss_clip": 1.03689122, + "balance_loss_mlp": 1.02548027, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 1.868514483258595, + "language_loss": 0.78038424, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.8017993, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 2.5511984825134277 + }, + { + "auxiliary_loss_clip": 0.01000568, + "auxiliary_loss_mlp": 0.01019923, + "balance_loss_clip": 1.00617957, + "balance_loss_mlp": 1.01849294, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.738731169990045, + "language_loss": 0.53538156, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55558646, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.2349894046783447 + }, + { + "auxiliary_loss_clip": 0.01092093, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.0366894, + "balance_loss_mlp": 1.02442515, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.9143014773329905, + "language_loss": 0.83499908, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85630763, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.6686408519744873 + }, + { + "auxiliary_loss_clip": 0.01091472, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.03741598, + "balance_loss_mlp": 1.0182513, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 1.890385152784545, + "language_loss": 0.92510808, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94634402, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 2.67193865776062 + }, + { + "auxiliary_loss_clip": 0.01079715, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.04281831, + "balance_loss_mlp": 1.02091241, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.6495173224850077, + "language_loss": 0.7636317, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78478909, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 2.7305877208709717 + }, + { + "auxiliary_loss_clip": 0.01098602, + "auxiliary_loss_mlp": 0.01037089, + "balance_loss_clip": 1.04111981, + "balance_loss_mlp": 1.0225935, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.845333341663151, + "language_loss": 0.78600657, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.80736339, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.7003042697906494 + }, + { + "auxiliary_loss_clip": 0.01054696, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.03056931, + "balance_loss_mlp": 1.02982712, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.11820751435303, + "language_loss": 0.71589124, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73690903, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 2.720747232437134 + }, + { + "auxiliary_loss_clip": 0.01072707, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.03390336, + "balance_loss_mlp": 1.02224326, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 3.044371823628288, + "language_loss": 0.81850696, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83961374, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.72659969329834 + }, + { + "auxiliary_loss_clip": 0.01083047, + "auxiliary_loss_mlp": 0.01037793, + "balance_loss_clip": 1.0381459, + "balance_loss_mlp": 1.0231781, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 1.8638217772617898, + "language_loss": 0.82831353, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.84952193, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 2.6285316944122314 + }, + { + "auxiliary_loss_clip": 0.01105917, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.04018307, + "balance_loss_mlp": 1.01942968, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.0888393717968117, + "language_loss": 0.76357615, + "learning_rate": 3.045403886269181e-06, + "loss": 0.7849797, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.6033875942230225 + }, + { + "auxiliary_loss_clip": 0.01090796, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.03667927, + "balance_loss_mlp": 1.01702356, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.5254182018428761, + "language_loss": 0.77439249, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79561222, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.65130352973938 + }, + { + "auxiliary_loss_clip": 0.01104606, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.03783238, + "balance_loss_mlp": 1.02096033, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 7.3180893270820295, + "language_loss": 0.76063007, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78203142, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.5689334869384766 + }, + { + "auxiliary_loss_clip": 0.01101828, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.03760552, + "balance_loss_mlp": 1.02207184, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 2.810896680009017, + "language_loss": 0.70263088, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72400379, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.6373376846313477 + }, + { + "auxiliary_loss_clip": 0.01113715, + "auxiliary_loss_mlp": 0.01033154, + "balance_loss_clip": 1.03931689, + "balance_loss_mlp": 1.01813936, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.8312117043051117, + "language_loss": 0.79857314, + "learning_rate": 3.044075480787665e-06, + "loss": 0.82004178, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.500950336456299 + }, + { + "auxiliary_loss_clip": 0.0107271, + "auxiliary_loss_mlp": 0.01040128, + "balance_loss_clip": 1.03670788, + "balance_loss_mlp": 1.02484488, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.901748134231937, + "language_loss": 0.89612854, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91725683, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 2.6506121158599854 + }, + { + "auxiliary_loss_clip": 0.01107483, + "auxiliary_loss_mlp": 0.01037088, + "balance_loss_clip": 1.03862691, + "balance_loss_mlp": 1.02207923, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 2.0060890879384767, + "language_loss": 0.64514345, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66658914, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.587214231491089 + }, + { + "auxiliary_loss_clip": 0.0109438, + "auxiliary_loss_mlp": 0.01035335, + "balance_loss_clip": 1.03794169, + "balance_loss_mlp": 1.02142286, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.8852627852034396, + "language_loss": 0.72897208, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75026923, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 2.715204954147339 + }, + { + "auxiliary_loss_clip": 0.01057869, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.03642702, + "balance_loss_mlp": 1.01964021, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.657136939612145, + "language_loss": 0.75216907, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77307057, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 4.278322219848633 + }, + { + "auxiliary_loss_clip": 0.01027399, + "auxiliary_loss_mlp": 0.00999676, + "balance_loss_clip": 1.02016091, + "balance_loss_mlp": 0.99822754, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8943189414162949, + "language_loss": 0.62672615, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64699686, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 3.0469095706939697 + }, + { + "auxiliary_loss_clip": 0.01087702, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.03686357, + "balance_loss_mlp": 1.01770878, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 1.6366973512008378, + "language_loss": 0.80484784, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82602978, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.6608989238739014 + }, + { + "auxiliary_loss_clip": 0.01112191, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.03857827, + "balance_loss_mlp": 1.02655458, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 3.3674686688162283, + "language_loss": 0.83646721, + "learning_rate": 3.041749247409439e-06, + "loss": 0.85798955, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.451491117477417 + }, + { + "auxiliary_loss_clip": 0.01015015, + "auxiliary_loss_mlp": 0.00747348, + "balance_loss_clip": 1.01059937, + "balance_loss_mlp": 1.00012362, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7393579377437823, + "language_loss": 0.63145274, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.64907634, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 3.042379140853882 + }, + { + "auxiliary_loss_clip": 0.0108913, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.03717899, + "balance_loss_mlp": 1.02039671, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.781625160263284, + "language_loss": 0.70790279, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.72914243, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 2.5393619537353516 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.0387733, + "balance_loss_mlp": 1.01918161, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 2.669165269284746, + "language_loss": 0.72787529, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.7492559, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 4.060820579528809 + }, + { + "auxiliary_loss_clip": 0.01097786, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.03488362, + "balance_loss_mlp": 1.01908851, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6927103805139276, + "language_loss": 0.72340912, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74471629, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 2.7652761936187744 + }, + { + "auxiliary_loss_clip": 0.01023705, + "auxiliary_loss_mlp": 0.01002273, + "balance_loss_clip": 1.00850964, + "balance_loss_mlp": 1.00043726, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7234240550874045, + "language_loss": 0.62603515, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64629501, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 4.6558287143707275 + }, + { + "auxiliary_loss_clip": 0.01003494, + "auxiliary_loss_mlp": 0.00747264, + "balance_loss_clip": 1.00908065, + "balance_loss_mlp": 0.99999064, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8276881225110276, + "language_loss": 0.5928179, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61032552, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 3.1519253253936768 + }, + { + "auxiliary_loss_clip": 0.0110124, + "auxiliary_loss_mlp": 0.01035393, + "balance_loss_clip": 1.04475307, + "balance_loss_mlp": 1.02235162, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.69924805481116, + "language_loss": 0.71652132, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73788768, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 4.187386512756348 + }, + { + "auxiliary_loss_clip": 0.01058267, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.03195262, + "balance_loss_mlp": 1.0354147, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 2.010927463019192, + "language_loss": 0.83401346, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85511172, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 2.6885392665863037 + }, + { + "auxiliary_loss_clip": 0.01004666, + "auxiliary_loss_mlp": 0.01005916, + "balance_loss_clip": 1.0102272, + "balance_loss_mlp": 1.00435472, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8203771459171278, + "language_loss": 0.56559277, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.5856986, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 3.218818426132202 + }, + { + "auxiliary_loss_clip": 0.01098564, + "auxiliary_loss_mlp": 0.00749889, + "balance_loss_clip": 1.03501797, + "balance_loss_mlp": 1.00035906, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.2408941275192658, + "language_loss": 0.94571328, + "learning_rate": 3.038422700166474e-06, + "loss": 0.96419775, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 2.5846798419952393 + }, + { + "auxiliary_loss_clip": 0.01079762, + "auxiliary_loss_mlp": 0.01036838, + "balance_loss_clip": 1.03395033, + "balance_loss_mlp": 1.02238345, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.8107338680212675, + "language_loss": 0.69457859, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71574455, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 2.724384307861328 + }, + { + "auxiliary_loss_clip": 0.01103464, + "auxiliary_loss_mlp": 0.01042168, + "balance_loss_clip": 1.03918052, + "balance_loss_mlp": 1.02589643, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 1.724889093372087, + "language_loss": 0.83952767, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86098397, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.5676565170288086 + }, + { + "auxiliary_loss_clip": 0.01077889, + "auxiliary_loss_mlp": 0.01036381, + "balance_loss_clip": 1.03486741, + "balance_loss_mlp": 1.02236819, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.317521137868419, + "language_loss": 0.67831659, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69945925, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.600085496902466 + }, + { + "auxiliary_loss_clip": 0.01085978, + "auxiliary_loss_mlp": 0.01051626, + "balance_loss_clip": 1.03863835, + "balance_loss_mlp": 1.03634322, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 1.7353311918010021, + "language_loss": 0.76997828, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79135436, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.5947628021240234 + }, + { + "auxiliary_loss_clip": 0.01056027, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.03344727, + "balance_loss_mlp": 1.02197456, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.7345828680795077, + "language_loss": 0.73591751, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75682968, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.646514415740967 + }, + { + "auxiliary_loss_clip": 0.01087781, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.03896713, + "balance_loss_mlp": 1.02532125, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.1028442464521597, + "language_loss": 0.78195602, + "learning_rate": 3.036424880912893e-06, + "loss": 0.80323505, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 2.6464180946350098 + }, + { + "auxiliary_loss_clip": 0.01023077, + "auxiliary_loss_mlp": 0.01001156, + "balance_loss_clip": 1.00887012, + "balance_loss_mlp": 0.9996714, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7911568512882804, + "language_loss": 0.57434291, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59458524, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.155480146408081 + }, + { + "auxiliary_loss_clip": 0.01088846, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.03907895, + "balance_loss_mlp": 1.02306938, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.5867453084961607, + "language_loss": 0.857714, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87899667, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.6597208976745605 + }, + { + "auxiliary_loss_clip": 0.01009996, + "auxiliary_loss_mlp": 0.01001852, + "balance_loss_clip": 1.00929248, + "balance_loss_mlp": 1.00001574, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7659925714830051, + "language_loss": 0.59797657, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61809504, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 2.9505722522735596 + }, + { + "auxiliary_loss_clip": 0.01104865, + "auxiliary_loss_mlp": 0.0104339, + "balance_loss_clip": 1.03882611, + "balance_loss_mlp": 1.02887583, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 2.179994145374994, + "language_loss": 0.71665883, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73814142, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.01088651, + "auxiliary_loss_mlp": 0.00750003, + "balance_loss_clip": 1.03992462, + "balance_loss_mlp": 1.0003562, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4551220201904493, + "language_loss": 0.76408219, + "learning_rate": 3.034758950632507e-06, + "loss": 0.7824688, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.7517735958099365 + }, + { + "auxiliary_loss_clip": 0.01105459, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.03725505, + "balance_loss_mlp": 1.0252068, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.0205668205287193, + "language_loss": 0.70575482, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72721106, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 2.6195507049560547 + }, + { + "auxiliary_loss_clip": 0.01090356, + "auxiliary_loss_mlp": 0.00749821, + "balance_loss_clip": 1.03808177, + "balance_loss_mlp": 1.00031018, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 5.04141830730347, + "language_loss": 0.76284474, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78124654, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 2.655996799468994 + }, + { + "auxiliary_loss_clip": 0.01089923, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.03525972, + "balance_loss_mlp": 1.02142727, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.117735421654802, + "language_loss": 0.77711022, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79837704, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 2.642807960510254 + }, + { + "auxiliary_loss_clip": 0.01013511, + "auxiliary_loss_mlp": 0.01001389, + "balance_loss_clip": 1.00882149, + "balance_loss_mlp": 0.9998396, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8404691934478506, + "language_loss": 0.63339597, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65354496, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.222583293914795 + }, + { + "auxiliary_loss_clip": 0.01076355, + "auxiliary_loss_mlp": 0.01037757, + "balance_loss_clip": 1.03707802, + "balance_loss_mlp": 1.02306986, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 1.9056594096793213, + "language_loss": 0.64720237, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66834342, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.715327262878418 + }, + { + "auxiliary_loss_clip": 0.01093873, + "auxiliary_loss_mlp": 0.01043564, + "balance_loss_clip": 1.03700066, + "balance_loss_mlp": 1.02952147, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 2.038026506079195, + "language_loss": 0.71491027, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73628461, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 2.8292958736419678 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.01045184, + "balance_loss_clip": 1.03995991, + "balance_loss_mlp": 1.03084302, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 2.2260699709981178, + "language_loss": 0.62578189, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.647434, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.6415815353393555 + }, + { + "auxiliary_loss_clip": 0.01070339, + "auxiliary_loss_mlp": 0.01039346, + "balance_loss_clip": 1.03369999, + "balance_loss_mlp": 1.02501667, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.9791484738979093, + "language_loss": 0.72377515, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74487197, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.6122946739196777 + }, + { + "auxiliary_loss_clip": 0.01049716, + "auxiliary_loss_mlp": 0.01044633, + "balance_loss_clip": 1.03106952, + "balance_loss_mlp": 1.02887392, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 1.917498917526948, + "language_loss": 0.77005821, + "learning_rate": 3.031757805185612e-06, + "loss": 0.79100168, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.7784814834594727 + }, + { + "auxiliary_loss_clip": 0.0108688, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.03732085, + "balance_loss_mlp": 1.02167726, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 2.3929927471507493, + "language_loss": 0.62822199, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64944464, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.598604679107666 + }, + { + "auxiliary_loss_clip": 0.01074305, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.04091644, + "balance_loss_mlp": 1.01926529, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.640843948935342, + "language_loss": 0.88315141, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90421909, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.7193763256073 + }, + { + "auxiliary_loss_clip": 0.01067264, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.03913033, + "balance_loss_mlp": 1.02165127, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 2.166880524319608, + "language_loss": 0.81737512, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83840406, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 2.6529293060302734 + }, + { + "auxiliary_loss_clip": 0.01085562, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.03946376, + "balance_loss_mlp": 1.02549434, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.6606445196984538, + "language_loss": 0.8053537, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82659906, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 2.6236560344696045 + }, + { + "auxiliary_loss_clip": 0.01113223, + "auxiliary_loss_mlp": 0.00749925, + "balance_loss_clip": 1.03998518, + "balance_loss_mlp": 1.00029373, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.8423806190827448, + "language_loss": 0.74897552, + "learning_rate": 3.030089132216836e-06, + "loss": 0.76760697, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.5125789642333984 + }, + { + "auxiliary_loss_clip": 0.01082104, + "auxiliary_loss_mlp": 0.0075009, + "balance_loss_clip": 1.03511858, + "balance_loss_mlp": 1.00040579, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.532019849808783, + "language_loss": 0.81340849, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83173043, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.6720569133758545 + }, + { + "auxiliary_loss_clip": 0.01122579, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.04215074, + "balance_loss_mlp": 1.02245438, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.8218807646904263, + "language_loss": 0.85454011, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87613797, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 3.978050708770752 + }, + { + "auxiliary_loss_clip": 0.01107183, + "auxiliary_loss_mlp": 0.01052937, + "balance_loss_clip": 1.04014003, + "balance_loss_mlp": 1.03864992, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.918921689522223, + "language_loss": 0.85030341, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87190467, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.583946943283081 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01042247, + "balance_loss_clip": 1.04185569, + "balance_loss_mlp": 1.02805519, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.0931021020479004, + "language_loss": 0.81361377, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.83512938, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.6554806232452393 + }, + { + "auxiliary_loss_clip": 0.01106069, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.03847647, + "balance_loss_mlp": 1.02237439, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 2.0308393519369137, + "language_loss": 0.77629924, + "learning_rate": 3.028419482721056e-06, + "loss": 0.7977277, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 2.6465401649475098 + }, + { + "auxiliary_loss_clip": 0.01088151, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.03456688, + "balance_loss_mlp": 1.01734996, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.6404077643776636, + "language_loss": 0.81474942, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83594197, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 2.668036937713623 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01047362, + "balance_loss_clip": 1.04007041, + "balance_loss_mlp": 1.03319979, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7726541515374834, + "language_loss": 0.76030421, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78182232, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 3.960643768310547 + }, + { + "auxiliary_loss_clip": 0.01101788, + "auxiliary_loss_mlp": 0.01040812, + "balance_loss_clip": 1.03730071, + "balance_loss_mlp": 1.02668023, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 1.8119277761330193, + "language_loss": 0.57430923, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59573531, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 2.6525068283081055 + }, + { + "auxiliary_loss_clip": 0.01087328, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.0369575, + "balance_loss_mlp": 1.02234125, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.35732153991418, + "language_loss": 0.82424086, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84547293, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 4.111630439758301 + }, + { + "auxiliary_loss_clip": 0.01100607, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.03840303, + "balance_loss_mlp": 1.0190028, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 3.354381574434263, + "language_loss": 0.83319587, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85451972, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 2.5501582622528076 + }, + { + "auxiliary_loss_clip": 0.01110402, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.037709, + "balance_loss_mlp": 1.0192399, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.6879119147144055, + "language_loss": 0.73335457, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75479484, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.5506603717803955 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01038114, + "balance_loss_clip": 1.03839409, + "balance_loss_mlp": 1.02351105, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 3.703422404944322, + "language_loss": 0.76221216, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78373861, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 4.007215976715088 + }, + { + "auxiliary_loss_clip": 0.01055209, + "auxiliary_loss_mlp": 0.01037783, + "balance_loss_clip": 1.04470277, + "balance_loss_mlp": 1.02432394, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.973408955939906, + "language_loss": 0.75757074, + "learning_rate": 3.025746016302734e-06, + "loss": 0.77850068, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 2.831042766571045 + }, + { + "auxiliary_loss_clip": 0.01091035, + "auxiliary_loss_mlp": 0.00749805, + "balance_loss_clip": 1.03664351, + "balance_loss_mlp": 1.00030494, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 1.907508728874712, + "language_loss": 0.67344499, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69185334, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 3.1532723903656006 + }, + { + "auxiliary_loss_clip": 0.01077736, + "auxiliary_loss_mlp": 0.01045478, + "balance_loss_clip": 1.03339589, + "balance_loss_mlp": 1.03136992, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 2.106509571306193, + "language_loss": 0.76501864, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78625077, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.6792826652526855 + }, + { + "auxiliary_loss_clip": 0.01034339, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.0320071, + "balance_loss_mlp": 1.01912546, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 3.069849514151237, + "language_loss": 0.78944993, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81012428, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.755685806274414 + }, + { + "auxiliary_loss_clip": 0.01088158, + "auxiliary_loss_mlp": 0.0075009, + "balance_loss_clip": 1.03382754, + "balance_loss_mlp": 1.0004487, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.9730038002021122, + "language_loss": 0.67984289, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69822544, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.749727725982666 + }, + { + "auxiliary_loss_clip": 0.0108684, + "auxiliary_loss_mlp": 0.01039133, + "balance_loss_clip": 1.0368613, + "balance_loss_mlp": 1.02538848, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 1.9656457240224086, + "language_loss": 0.76145786, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78271759, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 2.603344678878784 + }, + { + "auxiliary_loss_clip": 0.01064631, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.0340488, + "balance_loss_mlp": 1.02412629, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 2.334681891475525, + "language_loss": 0.6701327, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69116282, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.779881477355957 + }, + { + "auxiliary_loss_clip": 0.01100474, + "auxiliary_loss_mlp": 0.01039311, + "balance_loss_clip": 1.03923035, + "balance_loss_mlp": 1.02520871, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.4862912414317868, + "language_loss": 0.71833253, + "learning_rate": 3.023404690904629e-06, + "loss": 0.73973036, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.6580917835235596 + }, + { + "auxiliary_loss_clip": 0.0111336, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.03610563, + "balance_loss_mlp": 1.01885867, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.7295211547842448, + "language_loss": 0.73996603, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76143456, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 2.673413038253784 + }, + { + "auxiliary_loss_clip": 0.01108807, + "auxiliary_loss_mlp": 0.01038242, + "balance_loss_clip": 1.03735518, + "balance_loss_mlp": 1.02520609, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 2.0267285281766174, + "language_loss": 0.8439008, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86537129, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.569692850112915 + }, + { + "auxiliary_loss_clip": 0.01083432, + "auxiliary_loss_mlp": 0.01033469, + "balance_loss_clip": 1.03477669, + "balance_loss_mlp": 1.02114248, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 2.1158427788467895, + "language_loss": 0.8106668, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.83183587, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.6030352115631104 + }, + { + "auxiliary_loss_clip": 0.0110998, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.03620887, + "balance_loss_mlp": 1.02140999, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.8962615931560027, + "language_loss": 0.75546455, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.7769087, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 2.545283317565918 + }, + { + "auxiliary_loss_clip": 0.01095288, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.03716087, + "balance_loss_mlp": 1.02232099, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 3.1581366605971732, + "language_loss": 0.80019826, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82151377, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 2.660163164138794 + }, + { + "auxiliary_loss_clip": 0.01048257, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.03134632, + "balance_loss_mlp": 1.02039075, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 2.306246104317711, + "language_loss": 0.69057429, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71140426, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 2.6589303016662598 + }, + { + "auxiliary_loss_clip": 0.0107678, + "auxiliary_loss_mlp": 0.00749774, + "balance_loss_clip": 1.03103614, + "balance_loss_mlp": 1.00030601, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 1.9680208632936496, + "language_loss": 0.76463664, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78290212, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 2.5820424556732178 + }, + { + "auxiliary_loss_clip": 0.0109104, + "auxiliary_loss_mlp": 0.00749982, + "balance_loss_clip": 1.03689623, + "balance_loss_mlp": 1.00041914, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5846246797744385, + "language_loss": 0.84554458, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86395478, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.651822328567505 + }, + { + "auxiliary_loss_clip": 0.0109589, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.0359236, + "balance_loss_mlp": 1.01636672, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 3.522738372958155, + "language_loss": 0.77317083, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79441774, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.5186309814453125 + }, + { + "auxiliary_loss_clip": 0.01100247, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.03806639, + "balance_loss_mlp": 1.02304697, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 1.9155775130702482, + "language_loss": 0.59078175, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.61215115, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.575542449951172 + }, + { + "auxiliary_loss_clip": 0.01034296, + "auxiliary_loss_mlp": 0.01005637, + "balance_loss_clip": 1.01013517, + "balance_loss_mlp": 1.00431335, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.871950834253383, + "language_loss": 0.59934199, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61974132, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.142717123031616 + }, + { + "auxiliary_loss_clip": 0.01070517, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.03513145, + "balance_loss_mlp": 1.01882339, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 1.811020016419983, + "language_loss": 0.83335209, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85438484, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 2.615586757659912 + }, + { + "auxiliary_loss_clip": 0.01083707, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.03382158, + "balance_loss_mlp": 1.01526141, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 1.71285445464975, + "language_loss": 0.7070384, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.72816026, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.588831901550293 + }, + { + "auxiliary_loss_clip": 0.01099894, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.03552651, + "balance_loss_mlp": 1.02100706, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 1.857874803075804, + "language_loss": 0.70293927, + "learning_rate": 3.018716339744759e-06, + "loss": 0.72427493, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.7142927646636963 + }, + { + "auxiliary_loss_clip": 0.0110806, + "auxiliary_loss_mlp": 0.01042856, + "balance_loss_clip": 1.04032052, + "balance_loss_mlp": 1.02823448, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.9827024627896335, + "language_loss": 0.73824358, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75975275, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.5869548320770264 + }, + { + "auxiliary_loss_clip": 0.01092838, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.03840244, + "balance_loss_mlp": 1.01662254, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 2.0583727075507734, + "language_loss": 0.78114837, + "learning_rate": 3.018045956403094e-06, + "loss": 0.8023895, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.598242998123169 + }, + { + "auxiliary_loss_clip": 0.01023075, + "auxiliary_loss_mlp": 0.01004925, + "balance_loss_clip": 1.00862479, + "balance_loss_mlp": 1.00372696, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7132224888891222, + "language_loss": 0.59315991, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61343992, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 3.1632323265075684 + }, + { + "auxiliary_loss_clip": 0.01090016, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.03673983, + "balance_loss_mlp": 1.0193994, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 6.753200796044953, + "language_loss": 0.84684789, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86808562, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.606217861175537 + }, + { + "auxiliary_loss_clip": 0.01102592, + "auxiliary_loss_mlp": 0.00749746, + "balance_loss_clip": 1.03836, + "balance_loss_mlp": 1.00053358, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 3.8162029360309693, + "language_loss": 0.83002639, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.84854978, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.5370824337005615 + }, + { + "auxiliary_loss_clip": 0.01091196, + "auxiliary_loss_mlp": 0.0103782, + "balance_loss_clip": 1.03884518, + "balance_loss_mlp": 1.02392077, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.8381131413475522, + "language_loss": 0.80561185, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.82690203, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.685337543487549 + }, + { + "auxiliary_loss_clip": 0.01055469, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.03247094, + "balance_loss_mlp": 1.0209527, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.1538409537588628, + "language_loss": 0.70929307, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73019028, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 2.646556854248047 + }, + { + "auxiliary_loss_clip": 0.01108407, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_clip": 1.04164064, + "balance_loss_mlp": 1.02874517, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.8492179201916723, + "language_loss": 0.79727495, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81880403, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 4.1411988735198975 + }, + { + "auxiliary_loss_clip": 0.01077651, + "auxiliary_loss_mlp": 0.01044801, + "balance_loss_clip": 1.03999031, + "balance_loss_mlp": 1.02973843, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.9399601383157137, + "language_loss": 0.72605109, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74727565, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 2.721651554107666 + }, + { + "auxiliary_loss_clip": 0.01065004, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.03224087, + "balance_loss_mlp": 1.01749349, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.134294619985686, + "language_loss": 0.88538885, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90635073, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 2.6834089756011963 + }, + { + "auxiliary_loss_clip": 0.01054153, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.03387856, + "balance_loss_mlp": 1.02706242, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 2.1157879042307295, + "language_loss": 0.78717577, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80813092, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 2.685434579849243 + }, + { + "auxiliary_loss_clip": 0.01073423, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.03457355, + "balance_loss_mlp": 1.02670431, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.747847448222357, + "language_loss": 0.70645547, + "learning_rate": 3.014691725465008e-06, + "loss": 0.72762156, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 2.712782621383667 + }, + { + "auxiliary_loss_clip": 0.01099003, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03682375, + "balance_loss_mlp": 1.01929665, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.4992678445793204, + "language_loss": 0.80961961, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83093238, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 4.125170469284058 + }, + { + "auxiliary_loss_clip": 0.01067005, + "auxiliary_loss_mlp": 0.01037445, + "balance_loss_clip": 1.04171419, + "balance_loss_mlp": 1.0229609, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 5.5889497525384915, + "language_loss": 0.83963376, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86067826, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 2.7282168865203857 + }, + { + "auxiliary_loss_clip": 0.01047869, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.03344488, + "balance_loss_mlp": 1.02329826, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 1.4985394786495165, + "language_loss": 0.76843536, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78928125, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.7643327713012695 + }, + { + "auxiliary_loss_clip": 0.01074676, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.03966236, + "balance_loss_mlp": 1.02924407, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 1.8483697707128768, + "language_loss": 0.77185053, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79305053, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 4.140688180923462 + }, + { + "auxiliary_loss_clip": 0.01098434, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.03712702, + "balance_loss_mlp": 1.02307892, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.884461745646361, + "language_loss": 0.68041462, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70176125, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.598644733428955 + }, + { + "auxiliary_loss_clip": 0.01111705, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.03797388, + "balance_loss_mlp": 1.0183692, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.023839508016096, + "language_loss": 0.83409572, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.8555364, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 4.024778842926025 + }, + { + "auxiliary_loss_clip": 0.01100668, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.03671873, + "balance_loss_mlp": 1.02176118, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.8920873822489657, + "language_loss": 0.59057307, + "learning_rate": 3.012341473657572e-06, + "loss": 0.61194015, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.5868451595306396 + }, + { + "auxiliary_loss_clip": 0.01071801, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.03620028, + "balance_loss_mlp": 1.02207637, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.4109716096913054, + "language_loss": 0.87209463, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89317417, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 2.6933481693267822 + }, + { + "auxiliary_loss_clip": 0.01091513, + "auxiliary_loss_mlp": 0.01036456, + "balance_loss_clip": 1.03832889, + "balance_loss_mlp": 1.02111411, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.9215578881634214, + "language_loss": 0.75010782, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77138752, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.5876288414001465 + }, + { + "auxiliary_loss_clip": 0.01099996, + "auxiliary_loss_mlp": 0.01042784, + "balance_loss_clip": 1.0371573, + "balance_loss_mlp": 1.02920556, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.4680924121838923, + "language_loss": 0.68805766, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70948541, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.5155954360961914 + }, + { + "auxiliary_loss_clip": 0.01112076, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.03836179, + "balance_loss_mlp": 1.02294064, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 6.569722026542977, + "language_loss": 0.65372348, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67521238, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 2.562361001968384 + }, + { + "auxiliary_loss_clip": 0.01099373, + "auxiliary_loss_mlp": 0.01040086, + "balance_loss_clip": 1.03852153, + "balance_loss_mlp": 1.02534556, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.118244058655288, + "language_loss": 0.75631851, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77771312, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.54072904586792 + }, + { + "auxiliary_loss_clip": 0.01098885, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.03734052, + "balance_loss_mlp": 1.02277803, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 3.2759204236853723, + "language_loss": 0.72874761, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75010067, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.5447449684143066 + }, + { + "auxiliary_loss_clip": 0.01067713, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0351063, + "balance_loss_mlp": 1.02142251, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.562930701494813, + "language_loss": 0.75156772, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77259862, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.6988987922668457 + }, + { + "auxiliary_loss_clip": 0.01087628, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.03778803, + "balance_loss_mlp": 1.02057815, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.392491152875582, + "language_loss": 0.72369659, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74490613, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 2.7002434730529785 + }, + { + "auxiliary_loss_clip": 0.01097839, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.04082108, + "balance_loss_mlp": 1.02829134, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 7.771693854524137, + "language_loss": 0.8956036, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91700572, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.6938135623931885 + }, + { + "auxiliary_loss_clip": 0.01092536, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.03835654, + "balance_loss_mlp": 1.01816428, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 2.1534188326228474, + "language_loss": 0.74453133, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76577455, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 2.6342382431030273 + }, + { + "auxiliary_loss_clip": 0.01099604, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.03792787, + "balance_loss_mlp": 1.01780939, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.4461251757285467, + "language_loss": 0.75560522, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77691746, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 2.6060891151428223 + }, + { + "auxiliary_loss_clip": 0.01083186, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.03620481, + "balance_loss_mlp": 1.02130353, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.7183818900994177, + "language_loss": 0.87580621, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.8969962, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 2.5906307697296143 + }, + { + "auxiliary_loss_clip": 0.01107877, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.03612971, + "balance_loss_mlp": 1.01865697, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.2211358939246706, + "language_loss": 0.67276335, + "learning_rate": 3.007971733162737e-06, + "loss": 0.69415504, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.5423669815063477 + }, + { + "auxiliary_loss_clip": 0.01086214, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.03533244, + "balance_loss_mlp": 1.01968527, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.7149414607223956, + "language_loss": 0.81152201, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83271945, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 2.5702953338623047 + }, + { + "auxiliary_loss_clip": 0.0109711, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.04446197, + "balance_loss_mlp": 1.01781011, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.5756429447760218, + "language_loss": 0.73341823, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75469196, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.638963460922241 + }, + { + "auxiliary_loss_clip": 0.01107578, + "auxiliary_loss_mlp": 0.01033875, + "balance_loss_clip": 1.03591776, + "balance_loss_mlp": 1.02132189, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 2.175638520695298, + "language_loss": 0.71201241, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73342693, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.553257942199707 + }, + { + "auxiliary_loss_clip": 0.01095896, + "auxiliary_loss_mlp": 0.01042114, + "balance_loss_clip": 1.03703523, + "balance_loss_mlp": 1.0274272, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.8022713700993491, + "language_loss": 0.61138159, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63276166, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.781053066253662 + }, + { + "auxiliary_loss_clip": 0.01102276, + "auxiliary_loss_mlp": 0.01038582, + "balance_loss_clip": 1.03908229, + "balance_loss_mlp": 1.02493894, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 2.114746490880685, + "language_loss": 0.73380381, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75521231, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 2.6072278022766113 + }, + { + "auxiliary_loss_clip": 0.01112024, + "auxiliary_loss_mlp": 0.01035564, + "balance_loss_clip": 1.03692067, + "balance_loss_mlp": 1.02233768, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.7050592926896713, + "language_loss": 0.76148152, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78295743, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.5815649032592773 + }, + { + "auxiliary_loss_clip": 0.0109872, + "auxiliary_loss_mlp": 0.0103641, + "balance_loss_clip": 1.04208589, + "balance_loss_mlp": 1.02187848, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.7560266208255215, + "language_loss": 0.72008157, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.74143291, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.602827310562134 + }, + { + "auxiliary_loss_clip": 0.01083769, + "auxiliary_loss_mlp": 0.01041061, + "balance_loss_clip": 1.03521991, + "balance_loss_mlp": 1.02573061, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.1142882081493704, + "language_loss": 0.65543306, + "learning_rate": 3.005279449623811e-06, + "loss": 0.67668128, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 2.647913694381714 + }, + { + "auxiliary_loss_clip": 0.01089453, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.03704906, + "balance_loss_mlp": 1.01848137, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 1.8437968838157917, + "language_loss": 0.66887325, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.69008195, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.608412742614746 + }, + { + "auxiliary_loss_clip": 0.01088637, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.03826499, + "balance_loss_mlp": 1.02112532, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.050931914985472, + "language_loss": 0.7709797, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79223114, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.627570152282715 + }, + { + "auxiliary_loss_clip": 0.01100403, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.03929806, + "balance_loss_mlp": 1.02488589, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.5239197697923155, + "language_loss": 0.75253594, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77392256, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 2.6630570888519287 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_clip": 1.03597069, + "balance_loss_mlp": 1.02888799, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.317222922251298, + "language_loss": 0.7945981, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81602049, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.6202480792999268 + }, + { + "auxiliary_loss_clip": 0.01104472, + "auxiliary_loss_mlp": 0.01039706, + "balance_loss_clip": 1.0388546, + "balance_loss_mlp": 1.02453029, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 3.7039764804525848, + "language_loss": 0.81144941, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83289111, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 2.5863850116729736 + }, + { + "auxiliary_loss_clip": 0.01071492, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.03702688, + "balance_loss_mlp": 1.01934278, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.584786563611322, + "language_loss": 0.8385725, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.85963631, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 2.619429588317871 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.0103952, + "balance_loss_clip": 1.03901482, + "balance_loss_mlp": 1.02490497, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 2.649979781054137, + "language_loss": 0.74236834, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76390791, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 3.9889259338378906 + }, + { + "auxiliary_loss_clip": 0.01103905, + "auxiliary_loss_mlp": 0.0103908, + "balance_loss_clip": 1.03835297, + "balance_loss_mlp": 1.02459002, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.024160328309211, + "language_loss": 0.61624265, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63767248, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 2.561319351196289 + }, + { + "auxiliary_loss_clip": 0.011027, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.03734994, + "balance_loss_mlp": 1.02523851, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 1.9563998649945435, + "language_loss": 0.74305779, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76447898, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 2.611684799194336 + }, + { + "auxiliary_loss_clip": 0.01101536, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.03775978, + "balance_loss_mlp": 1.01856041, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.592144836043939, + "language_loss": 0.71729755, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73863989, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 2.687952995300293 + }, + { + "auxiliary_loss_clip": 0.0108976, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.0328815, + "balance_loss_mlp": 1.01723504, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.141078329011797, + "language_loss": 0.73533237, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.75653124, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 2.5232458114624023 + }, + { + "auxiliary_loss_clip": 0.01090017, + "auxiliary_loss_mlp": 0.00749804, + "balance_loss_clip": 1.03689623, + "balance_loss_mlp": 1.00050449, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.7367440306317365, + "language_loss": 0.820593, + "learning_rate": 3.001236451924089e-06, + "loss": 0.83899117, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 4.14081335067749 + }, + { + "auxiliary_loss_clip": 0.01090847, + "auxiliary_loss_mlp": 0.01044406, + "balance_loss_clip": 1.03509557, + "balance_loss_mlp": 1.02938604, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.9243112441691064, + "language_loss": 0.66105783, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68241036, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.603907346725464 + }, + { + "auxiliary_loss_clip": 0.01023429, + "auxiliary_loss_mlp": 0.01004121, + "balance_loss_clip": 1.00881147, + "balance_loss_mlp": 1.00264323, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.76883190982687, + "language_loss": 0.61531258, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63558805, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 2.9975101947784424 + }, + { + "auxiliary_loss_clip": 0.01043058, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.03237128, + "balance_loss_mlp": 1.026847, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 2.0943292816357157, + "language_loss": 0.80037642, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82121396, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.7068231105804443 + }, + { + "auxiliary_loss_clip": 0.01003315, + "auxiliary_loss_mlp": 0.00747035, + "balance_loss_clip": 1.00956297, + "balance_loss_mlp": 1.00009763, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6781342134954453, + "language_loss": 0.56765157, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58515507, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 4.7537477016448975 + }, + { + "auxiliary_loss_clip": 0.0107769, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.0344317, + "balance_loss_mlp": 1.01553345, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.8905722296053715, + "language_loss": 0.71979618, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74086392, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 2.6274731159210205 + }, + { + "auxiliary_loss_clip": 0.01084817, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.03443265, + "balance_loss_mlp": 1.02091646, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.7798630215030038, + "language_loss": 0.77946222, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80066133, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.669005870819092 + }, + { + "auxiliary_loss_clip": 0.01086196, + "auxiliary_loss_mlp": 0.01040209, + "balance_loss_clip": 1.03940713, + "balance_loss_mlp": 1.02381754, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.2117467900450594, + "language_loss": 0.63013536, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65139937, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 4.159051418304443 + }, + { + "auxiliary_loss_clip": 0.01089083, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.03617644, + "balance_loss_mlp": 1.01621723, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 2.537560899085533, + "language_loss": 0.65848023, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67967856, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.6518874168395996 + }, + { + "auxiliary_loss_clip": 0.01092392, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.03714967, + "balance_loss_mlp": 1.01699543, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.5381303593357218, + "language_loss": 0.75591087, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77713352, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.6084704399108887 + }, + { + "auxiliary_loss_clip": 0.01083473, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.03571773, + "balance_loss_mlp": 1.02233648, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.168155841290193, + "language_loss": 0.70432425, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.7255469, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.6677629947662354 + }, + { + "auxiliary_loss_clip": 0.01083835, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.0394665, + "balance_loss_mlp": 1.02339387, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 3.36535763687269, + "language_loss": 0.77831376, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.79953349, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.6210713386535645 + }, + { + "auxiliary_loss_clip": 0.01087112, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.03792119, + "balance_loss_mlp": 1.0198791, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 2.0702188381194566, + "language_loss": 0.7519387, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77314162, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.627751111984253 + }, + { + "auxiliary_loss_clip": 0.01059398, + "auxiliary_loss_mlp": 0.0103885, + "balance_loss_clip": 1.03195179, + "balance_loss_mlp": 1.02375805, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.9762181590803833, + "language_loss": 0.83147025, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85245275, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.662980794906616 + }, + { + "auxiliary_loss_clip": 0.01110984, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.03810692, + "balance_loss_mlp": 1.01726127, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.7617379164196216, + "language_loss": 0.78406847, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80549788, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 2.516227960586548 + }, + { + "auxiliary_loss_clip": 0.01053192, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.03200078, + "balance_loss_mlp": 1.02404022, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 2.060843930846578, + "language_loss": 0.65267974, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67359006, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 2.6711955070495605 + }, + { + "auxiliary_loss_clip": 0.01079855, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.03536451, + "balance_loss_mlp": 1.02105141, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 2.2641037122293755, + "language_loss": 0.77046025, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79160339, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 2.6761417388916016 + }, + { + "auxiliary_loss_clip": 0.0108057, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.04063845, + "balance_loss_mlp": 1.02295077, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.8682103322181198, + "language_loss": 0.80471289, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82588351, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 2.6769134998321533 + }, + { + "auxiliary_loss_clip": 0.01084899, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.03502226, + "balance_loss_mlp": 1.02031898, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 2.0540006042810144, + "language_loss": 0.79734242, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81851602, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 2.661344051361084 + }, + { + "auxiliary_loss_clip": 0.0107125, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.03462541, + "balance_loss_mlp": 1.02363682, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.1086736786914932, + "language_loss": 0.73588228, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75698161, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.6526939868927 + }, + { + "auxiliary_loss_clip": 0.01087573, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_clip": 1.03668642, + "balance_loss_mlp": 1.0243814, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 2.0707764177511234, + "language_loss": 0.66957593, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69083881, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 2.66373872756958 + }, + { + "auxiliary_loss_clip": 0.01059543, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.03511322, + "balance_loss_mlp": 1.02192175, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.7914962456489436, + "language_loss": 0.69867885, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71964037, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.7517478466033936 + }, + { + "auxiliary_loss_clip": 0.01085199, + "auxiliary_loss_mlp": 0.00749777, + "balance_loss_clip": 1.03807783, + "balance_loss_mlp": 1.00052547, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.7861438625882198, + "language_loss": 0.74196386, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76031357, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.8035519123077393 + }, + { + "auxiliary_loss_clip": 0.01082888, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.03608596, + "balance_loss_mlp": 1.02328718, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 1.894865205251377, + "language_loss": 0.83453643, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85573459, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.6011815071105957 + }, + { + "auxiliary_loss_clip": 0.0108643, + "auxiliary_loss_mlp": 0.00749682, + "balance_loss_clip": 1.03512621, + "balance_loss_mlp": 1.00047874, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.730389839731277, + "language_loss": 0.69520509, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.71356618, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.675901174545288 + }, + { + "auxiliary_loss_clip": 0.01084249, + "auxiliary_loss_mlp": 0.01039186, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.02401686, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 3.3019670026010743, + "language_loss": 0.81811136, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83934569, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 2.700423240661621 + }, + { + "auxiliary_loss_clip": 0.01108676, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.03698421, + "balance_loss_mlp": 1.02184165, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 2.955036689627011, + "language_loss": 0.74263072, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76406342, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 2.6273796558380127 + }, + { + "auxiliary_loss_clip": 0.0111233, + "auxiliary_loss_mlp": 0.00749746, + "balance_loss_clip": 1.03809547, + "balance_loss_mlp": 1.00059175, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.8181648873428269, + "language_loss": 0.7946642, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81328499, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 2.592832088470459 + }, + { + "auxiliary_loss_clip": 0.01085187, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.034899, + "balance_loss_mlp": 1.02483547, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 2.143880367711983, + "language_loss": 0.81351358, + "learning_rate": 2.991781567335093e-06, + "loss": 0.8347559, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.629896879196167 + }, + { + "auxiliary_loss_clip": 0.01100684, + "auxiliary_loss_mlp": 0.00749843, + "balance_loss_clip": 1.03977203, + "balance_loss_mlp": 1.00048614, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 2.091277497435324, + "language_loss": 0.75841594, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77692115, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.5353071689605713 + }, + { + "auxiliary_loss_clip": 0.01101395, + "auxiliary_loss_mlp": 0.01036461, + "balance_loss_clip": 1.03813374, + "balance_loss_mlp": 1.02316904, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.9041721263377427, + "language_loss": 0.70376301, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72514153, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 2.5389585494995117 + }, + { + "auxiliary_loss_clip": 0.01102914, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.03762019, + "balance_loss_mlp": 1.01981187, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 3.429801060602783, + "language_loss": 0.73841727, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.75978035, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 2.52451229095459 + }, + { + "auxiliary_loss_clip": 0.01092618, + "auxiliary_loss_mlp": 0.00749784, + "balance_loss_clip": 1.03960752, + "balance_loss_mlp": 1.00058413, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 3.153719733819237, + "language_loss": 0.78551221, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.80393624, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 2.614481210708618 + }, + { + "auxiliary_loss_clip": 0.01074361, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.03626108, + "balance_loss_mlp": 1.02091026, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 2.235321508299716, + "language_loss": 0.72824389, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74931628, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.6498146057128906 + }, + { + "auxiliary_loss_clip": 0.01081634, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.03673005, + "balance_loss_mlp": 1.01762164, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 1.9647582088883266, + "language_loss": 0.74757367, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.76871932, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.644523859024048 + }, + { + "auxiliary_loss_clip": 0.0103508, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.0284065, + "balance_loss_mlp": 1.02319658, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.815350403573859, + "language_loss": 0.75308645, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77382833, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 4.3883538246154785 + }, + { + "auxiliary_loss_clip": 0.01091043, + "auxiliary_loss_mlp": 0.0103522, + "balance_loss_clip": 1.03952539, + "balance_loss_mlp": 1.02188647, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.8686432991191, + "language_loss": 0.68101597, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70227861, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 2.7917075157165527 + }, + { + "auxiliary_loss_clip": 0.01097412, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.01727176, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 1.9558722856523771, + "language_loss": 0.78323615, + "learning_rate": 2.988736221969144e-06, + "loss": 0.80450892, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 2.589808225631714 + }, + { + "auxiliary_loss_clip": 0.01086429, + "auxiliary_loss_mlp": 0.01037258, + "balance_loss_clip": 1.03532338, + "balance_loss_mlp": 1.0225122, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 3.0266820523361724, + "language_loss": 0.70540154, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72663832, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 2.594109535217285 + }, + { + "auxiliary_loss_clip": 0.01095946, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.03547931, + "balance_loss_mlp": 1.01935434, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 2.0244884946739896, + "language_loss": 0.86662877, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.8879143, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 2.569636583328247 + }, + { + "auxiliary_loss_clip": 0.01090595, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.0376246, + "balance_loss_mlp": 1.01756477, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.780986873355931, + "language_loss": 0.77275634, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79396987, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 4.0437633991241455 + }, + { + "auxiliary_loss_clip": 0.01074661, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.03861022, + "balance_loss_mlp": 1.01961935, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.5295195047976666, + "language_loss": 0.82496482, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84604836, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.7899813652038574 + }, + { + "auxiliary_loss_clip": 0.01113053, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.03922939, + "balance_loss_mlp": 1.01833463, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.609592918098287, + "language_loss": 0.70228398, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72373599, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.6760568618774414 + }, + { + "auxiliary_loss_clip": 0.01098541, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.03584647, + "balance_loss_mlp": 1.02038169, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 2.154565819825776, + "language_loss": 0.7619313, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.7832545, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 2.612900733947754 + }, + { + "auxiliary_loss_clip": 0.01074636, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.03484035, + "balance_loss_mlp": 1.02121806, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.9915030107961131, + "language_loss": 0.88523513, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90631974, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 4.225741624832153 + }, + { + "auxiliary_loss_clip": 0.0102734, + "auxiliary_loss_mlp": 0.01040061, + "balance_loss_clip": 1.02867246, + "balance_loss_mlp": 1.02447438, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.038819192008289, + "language_loss": 0.74944496, + "learning_rate": 2.98602669849771e-06, + "loss": 0.77011901, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.7252399921417236 + }, + { + "auxiliary_loss_clip": 0.01032402, + "auxiliary_loss_mlp": 0.01007501, + "balance_loss_clip": 1.02381968, + "balance_loss_mlp": 1.00618958, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 1.0334789488011025, + "language_loss": 0.63808239, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65848142, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 2.9234671592712402 + }, + { + "auxiliary_loss_clip": 0.01101977, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.03850889, + "balance_loss_mlp": 1.01743066, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 1.9529840604820925, + "language_loss": 0.73620778, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.75753832, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 4.0814208984375 + }, + { + "auxiliary_loss_clip": 0.01066678, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.03421092, + "balance_loss_mlp": 1.02018929, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 2.5734305913518503, + "language_loss": 0.76921767, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79022002, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.660627841949463 + }, + { + "auxiliary_loss_clip": 0.01086971, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.03656411, + "balance_loss_mlp": 1.0193671, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 2.6711991009076024, + "language_loss": 0.67540926, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69660383, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.660978317260742 + }, + { + "auxiliary_loss_clip": 0.01096371, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.03776014, + "balance_loss_mlp": 1.02330065, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 15.950333453656649, + "language_loss": 0.78719473, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.80852944, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.588815450668335 + }, + { + "auxiliary_loss_clip": 0.01092715, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.04073715, + "balance_loss_mlp": 1.02555215, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.7908598665294988, + "language_loss": 0.85447919, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87578845, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.6031577587127686 + }, + { + "auxiliary_loss_clip": 0.01084916, + "auxiliary_loss_mlp": 0.01041974, + "balance_loss_clip": 1.03701353, + "balance_loss_mlp": 1.02779448, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 2.0332883625231273, + "language_loss": 0.77983654, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.80110538, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.7320380210876465 + }, + { + "auxiliary_loss_clip": 0.01055239, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.03607154, + "balance_loss_mlp": 1.02941322, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.880619869294058, + "language_loss": 0.75766349, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.77864528, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 2.703112840652466 + }, + { + "auxiliary_loss_clip": 0.01086211, + "auxiliary_loss_mlp": 0.00750001, + "balance_loss_clip": 1.03917515, + "balance_loss_mlp": 1.00055635, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.438588467316401, + "language_loss": 0.69413507, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71249717, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 2.700058937072754 + }, + { + "auxiliary_loss_clip": 0.01110458, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.038715, + "balance_loss_mlp": 1.02487588, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 2.095159663671497, + "language_loss": 0.78905469, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81053239, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 2.549145221710205 + }, + { + "auxiliary_loss_clip": 0.01111705, + "auxiliary_loss_mlp": 0.01038541, + "balance_loss_clip": 1.03938746, + "balance_loss_mlp": 1.02533817, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.5842701394948784, + "language_loss": 0.82058871, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84209114, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 2.566274881362915 + }, + { + "auxiliary_loss_clip": 0.01096726, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.03562069, + "balance_loss_mlp": 1.02300501, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.7868529125833748, + "language_loss": 0.70431483, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72563398, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.4967010021209717 + }, + { + "auxiliary_loss_clip": 0.0110332, + "auxiliary_loss_mlp": 0.01044398, + "balance_loss_clip": 1.04112315, + "balance_loss_mlp": 1.0298425, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 2.0970866936318733, + "language_loss": 0.67880327, + "learning_rate": 2.981618622015244e-06, + "loss": 0.70028043, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 2.5952141284942627 + }, + { + "auxiliary_loss_clip": 0.01100425, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.03790951, + "balance_loss_mlp": 1.02340782, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.8434329855864542, + "language_loss": 0.68082595, + "learning_rate": 2.981279278287211e-06, + "loss": 0.70219582, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 2.5874552726745605 + }, + { + "auxiliary_loss_clip": 0.01060504, + "auxiliary_loss_mlp": 0.01027558, + "balance_loss_clip": 1.03740716, + "balance_loss_mlp": 1.01508236, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.1980065767711765, + "language_loss": 0.77960479, + "learning_rate": 2.980939897348969e-06, + "loss": 0.80048537, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.636443853378296 + }, + { + "auxiliary_loss_clip": 0.01093417, + "auxiliary_loss_mlp": 0.01052023, + "balance_loss_clip": 1.03569615, + "balance_loss_mlp": 1.03708625, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.3781655542942386, + "language_loss": 0.69555867, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71701312, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.6536271572113037 + }, + { + "auxiliary_loss_clip": 0.01088771, + "auxiliary_loss_mlp": 0.00750356, + "balance_loss_clip": 1.03720737, + "balance_loss_mlp": 1.00053048, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.9291672910495228, + "language_loss": 0.71200585, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73039716, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 2.629058599472046 + }, + { + "auxiliary_loss_clip": 0.01081337, + "auxiliary_loss_mlp": 0.0103342, + "balance_loss_clip": 1.03773785, + "balance_loss_mlp": 1.0194726, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.252952322315921, + "language_loss": 0.78196472, + "learning_rate": 2.979921531401692e-06, + "loss": 0.80311227, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 2.6428303718566895 + }, + { + "auxiliary_loss_clip": 0.01102059, + "auxiliary_loss_mlp": 0.00749656, + "balance_loss_clip": 1.0387311, + "balance_loss_mlp": 1.00039911, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 2.231676890169416, + "language_loss": 0.64247131, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66098851, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 2.6163220405578613 + }, + { + "auxiliary_loss_clip": 0.01113333, + "auxiliary_loss_mlp": 0.00749594, + "balance_loss_clip": 1.0383203, + "balance_loss_mlp": 1.00040972, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5865601894867982, + "language_loss": 0.78534627, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80397552, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 2.5005877017974854 + }, + { + "auxiliary_loss_clip": 0.01074713, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.0385114, + "balance_loss_mlp": 1.02576137, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 2.809810878452657, + "language_loss": 0.80326593, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82440507, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.681777238845825 + }, + { + "auxiliary_loss_clip": 0.01089053, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.03668845, + "balance_loss_mlp": 1.02123189, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.9826396912447308, + "language_loss": 0.79151034, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81275135, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.6071367263793945 + }, + { + "auxiliary_loss_clip": 0.01093805, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.03854704, + "balance_loss_mlp": 1.0175128, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.381029400148792, + "language_loss": 0.72215056, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74340731, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.6789302825927734 + }, + { + "auxiliary_loss_clip": 0.01108411, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.04273486, + "balance_loss_mlp": 1.02125907, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 2.3765033712276025, + "language_loss": 0.63729167, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.65873665, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 2.7204337120056152 + }, + { + "auxiliary_loss_clip": 0.01100842, + "auxiliary_loss_mlp": 0.01036424, + "balance_loss_clip": 1.03756583, + "balance_loss_mlp": 1.02251804, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 3.347587445086264, + "language_loss": 0.74101514, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76238775, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 2.5813865661621094 + }, + { + "auxiliary_loss_clip": 0.01040568, + "auxiliary_loss_mlp": 0.01009713, + "balance_loss_clip": 1.01542044, + "balance_loss_mlp": 1.00836563, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7958007705453978, + "language_loss": 0.6071074, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62761021, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.204622745513916 + }, + { + "auxiliary_loss_clip": 0.01089329, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.03776562, + "balance_loss_mlp": 1.01672125, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.2350570087443735, + "language_loss": 0.72614485, + "learning_rate": 2.976864428379655e-06, + "loss": 0.74733752, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.5862419605255127 + }, + { + "auxiliary_loss_clip": 0.01085362, + "auxiliary_loss_mlp": 0.00749597, + "balance_loss_clip": 1.03328943, + "balance_loss_mlp": 1.00034809, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.873619445864525, + "language_loss": 0.81396013, + "learning_rate": 2.976524564880326e-06, + "loss": 0.83230972, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 2.594933271408081 + }, + { + "auxiliary_loss_clip": 0.01112574, + "auxiliary_loss_mlp": 0.0103983, + "balance_loss_clip": 1.03929329, + "balance_loss_mlp": 1.025805, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.3875818159133557, + "language_loss": 0.68786031, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.70938432, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 2.534878969192505 + }, + { + "auxiliary_loss_clip": 0.01079396, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.03375483, + "balance_loss_mlp": 1.01903582, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.7114894720259675, + "language_loss": 0.75451803, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77563357, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 4.2192816734313965 + }, + { + "auxiliary_loss_clip": 0.01062907, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.04168797, + "balance_loss_mlp": 1.02373683, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.1728112289383006, + "language_loss": 0.70170081, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72270155, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 2.837071180343628 + }, + { + "auxiliary_loss_clip": 0.01091213, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.03902113, + "balance_loss_mlp": 1.02243686, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.9624948297641056, + "language_loss": 0.77419078, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79546034, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.704167366027832 + }, + { + "auxiliary_loss_clip": 0.01102149, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.03717339, + "balance_loss_mlp": 1.01919246, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.8035487471127942, + "language_loss": 0.72815275, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74950993, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.6188135147094727 + }, + { + "auxiliary_loss_clip": 0.01105016, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.03895915, + "balance_loss_mlp": 1.02281237, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 2.1593175289667155, + "language_loss": 0.70139223, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.72281164, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 4.224388837814331 + }, + { + "auxiliary_loss_clip": 0.01057268, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.03343594, + "balance_loss_mlp": 1.02282286, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 1.926902869950236, + "language_loss": 0.69870484, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71964848, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.8029236793518066 + }, + { + "auxiliary_loss_clip": 0.010893, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.03668785, + "balance_loss_mlp": 1.016366, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.9905178819478027, + "language_loss": 0.66699028, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68817687, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 2.6017231941223145 + }, + { + "auxiliary_loss_clip": 0.01087154, + "auxiliary_loss_mlp": 0.01035507, + "balance_loss_clip": 1.03674614, + "balance_loss_mlp": 1.02300191, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 2.421168233333146, + "language_loss": 0.74770319, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76892978, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.582070827484131 + }, + { + "auxiliary_loss_clip": 0.01089667, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.03659964, + "balance_loss_mlp": 1.01706982, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 2.568159553688958, + "language_loss": 0.75719738, + "learning_rate": 2.973123895369182e-06, + "loss": 0.77838796, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.6198315620422363 + }, + { + "auxiliary_loss_clip": 0.01105377, + "auxiliary_loss_mlp": 0.01028169, + "balance_loss_clip": 1.03549314, + "balance_loss_mlp": 1.01531792, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.6922734868342746, + "language_loss": 0.72804338, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.7493788, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 4.057060241699219 + }, + { + "auxiliary_loss_clip": 0.01089568, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.03930926, + "balance_loss_mlp": 1.01768577, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 5.493435375599461, + "language_loss": 0.70808131, + "learning_rate": 2.972443318242726e-06, + "loss": 0.72928333, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.628378391265869 + }, + { + "auxiliary_loss_clip": 0.01071403, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.03404164, + "balance_loss_mlp": 1.019274, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 2.210505453245153, + "language_loss": 0.8855629, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90658987, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.627718687057495 + }, + { + "auxiliary_loss_clip": 0.01109284, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.03773713, + "balance_loss_mlp": 1.02134442, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.7900143870110619, + "language_loss": 0.58266532, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60410017, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 4.4267003536224365 + }, + { + "auxiliary_loss_clip": 0.01110275, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.03796828, + "balance_loss_mlp": 1.020859, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 2.086313316725316, + "language_loss": 0.7622633, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78371513, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.5435731410980225 + }, + { + "auxiliary_loss_clip": 0.01078836, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.03841531, + "balance_loss_mlp": 1.0192672, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.867117714926948, + "language_loss": 0.70388436, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72499299, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.7667465209960938 + }, + { + "auxiliary_loss_clip": 0.01093723, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.04236245, + "balance_loss_mlp": 1.02354443, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.6768812575290193, + "language_loss": 0.74456513, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76585186, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.8685171604156494 + }, + { + "auxiliary_loss_clip": 0.01111332, + "auxiliary_loss_mlp": 0.01038719, + "balance_loss_clip": 1.04033613, + "balance_loss_mlp": 1.02558184, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.7796374616983819, + "language_loss": 0.78535706, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80685759, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 2.5721359252929688 + }, + { + "auxiliary_loss_clip": 0.01086346, + "auxiliary_loss_mlp": 0.01035626, + "balance_loss_clip": 1.03810847, + "balance_loss_mlp": 1.02201855, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 2.2252720678845743, + "language_loss": 0.66355926, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68477893, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 2.6176693439483643 + }, + { + "auxiliary_loss_clip": 0.01108791, + "auxiliary_loss_mlp": 0.00749695, + "balance_loss_clip": 1.03688192, + "balance_loss_mlp": 1.00037837, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 1.6772688679798164, + "language_loss": 0.78834677, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.80693161, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 2.6123480796813965 + }, + { + "auxiliary_loss_clip": 0.01049259, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.03177798, + "balance_loss_mlp": 1.02594995, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 5.385327090048191, + "language_loss": 0.90968603, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93058968, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 2.63627552986145 + }, + { + "auxiliary_loss_clip": 0.01072993, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.03606033, + "balance_loss_mlp": 1.02621818, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 2.9128163905007374, + "language_loss": 0.80155575, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82270569, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 2.68727445602417 + }, + { + "auxiliary_loss_clip": 0.01086048, + "auxiliary_loss_mlp": 0.0104826, + "balance_loss_clip": 1.0367589, + "balance_loss_mlp": 1.03371692, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.82119540670724, + "language_loss": 0.83744967, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.85879278, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 2.6109230518341064 + }, + { + "auxiliary_loss_clip": 0.01064452, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.03401649, + "balance_loss_mlp": 1.01735675, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.944573361670845, + "language_loss": 0.72186208, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74280584, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.741971969604492 + }, + { + "auxiliary_loss_clip": 0.01085258, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.03462231, + "balance_loss_mlp": 1.01960135, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.6787617563110324, + "language_loss": 0.7947849, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81596142, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 2.586155652999878 + }, + { + "auxiliary_loss_clip": 0.01078817, + "auxiliary_loss_mlp": 0.01038357, + "balance_loss_clip": 1.04308033, + "balance_loss_mlp": 1.02448738, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 3.8254061517211526, + "language_loss": 0.78832638, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80949813, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 2.7114551067352295 + }, + { + "auxiliary_loss_clip": 0.01070348, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.03527737, + "balance_loss_mlp": 1.02202082, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 1.8475923122423676, + "language_loss": 0.81445062, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83550638, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.632110834121704 + }, + { + "auxiliary_loss_clip": 0.01030429, + "auxiliary_loss_mlp": 0.0100311, + "balance_loss_clip": 1.02536964, + "balance_loss_mlp": 1.00147653, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9141708226020032, + "language_loss": 0.567294, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58762944, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 3.0518832206726074 + }, + { + "auxiliary_loss_clip": 0.01099522, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.03693378, + "balance_loss_mlp": 1.02486849, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.8487456819237924, + "language_loss": 0.68645728, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.70783138, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.585329055786133 + }, + { + "auxiliary_loss_clip": 0.01108917, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.03752017, + "balance_loss_mlp": 1.0221777, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 3.371368397615337, + "language_loss": 0.80357844, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82501334, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 2.626227378845215 + }, + { + "auxiliary_loss_clip": 0.01046118, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.03336489, + "balance_loss_mlp": 1.01940608, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.7312648426975707, + "language_loss": 0.78799576, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.80879545, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.7065629959106445 + }, + { + "auxiliary_loss_clip": 0.01073046, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.03614545, + "balance_loss_mlp": 1.02194667, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 2.3826660562179063, + "language_loss": 0.79745346, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.81852931, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.659898281097412 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.00749907, + "balance_loss_clip": 1.03722739, + "balance_loss_mlp": 1.00040162, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 2.0184133351147064, + "language_loss": 0.68168354, + "learning_rate": 2.965288372816436e-06, + "loss": 0.70028603, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.636930465698242 + }, + { + "auxiliary_loss_clip": 0.0107811, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.03388107, + "balance_loss_mlp": 1.02188897, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 1.8371181582461127, + "language_loss": 0.6725719, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69371223, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.653777599334717 + }, + { + "auxiliary_loss_clip": 0.01088742, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.03604054, + "balance_loss_mlp": 1.02285945, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 2.077285119429459, + "language_loss": 0.71149051, + "learning_rate": 2.964606105671327e-06, + "loss": 0.7327593, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 2.661893367767334 + }, + { + "auxiliary_loss_clip": 0.01088088, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.03868198, + "balance_loss_mlp": 1.02416921, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.699600194620021, + "language_loss": 0.70936674, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73063838, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 2.6616456508636475 + }, + { + "auxiliary_loss_clip": 0.01088073, + "auxiliary_loss_mlp": 0.01043687, + "balance_loss_clip": 1.03437614, + "balance_loss_mlp": 1.02927494, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 2.4036860022019795, + "language_loss": 0.76059401, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78191161, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 2.607586145401001 + }, + { + "auxiliary_loss_clip": 0.01118044, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_clip": 1.04062641, + "balance_loss_mlp": 1.02620471, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.6654124758793625, + "language_loss": 0.76301211, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78460717, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 2.4883522987365723 + }, + { + "auxiliary_loss_clip": 0.01096649, + "auxiliary_loss_mlp": 0.00749796, + "balance_loss_clip": 1.03681457, + "balance_loss_mlp": 1.00040698, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 2.3289585869959057, + "language_loss": 0.86624622, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88471067, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 2.6155049800872803 + }, + { + "auxiliary_loss_clip": 0.01097328, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.03629863, + "balance_loss_mlp": 1.02662957, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.4768716905727464, + "language_loss": 0.72453272, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74591374, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 2.6443841457366943 + }, + { + "auxiliary_loss_clip": 0.01076649, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.03407788, + "balance_loss_mlp": 1.02216387, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 2.3981051049794875, + "language_loss": 0.73676062, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75789684, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 2.7257232666015625 + }, + { + "auxiliary_loss_clip": 0.01115424, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.03988206, + "balance_loss_mlp": 1.02348828, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 3.807565167716935, + "language_loss": 0.6894908, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71102822, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 4.112979888916016 + }, + { + "auxiliary_loss_clip": 0.01105654, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.03974795, + "balance_loss_mlp": 1.02154541, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.6871628974233466, + "language_loss": 0.73119092, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75260395, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.5457921028137207 + }, + { + "auxiliary_loss_clip": 0.01062984, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.03204548, + "balance_loss_mlp": 1.01825798, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.6708332884282828, + "language_loss": 0.80005908, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82101345, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.666095018386841 + }, + { + "auxiliary_loss_clip": 0.01101855, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.03843951, + "balance_loss_mlp": 1.01854515, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 2.1386523424121386, + "language_loss": 0.84057552, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86192155, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.5605602264404297 + }, + { + "auxiliary_loss_clip": 0.01085642, + "auxiliary_loss_mlp": 0.01038976, + "balance_loss_clip": 1.03782308, + "balance_loss_mlp": 1.02479005, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.9417381070809965, + "language_loss": 0.7558341, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77708024, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 4.09413480758667 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.03940761, + "balance_loss_mlp": 1.01922083, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 6.2073805329360985, + "language_loss": 0.77306902, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79453146, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 2.568643808364868 + }, + { + "auxiliary_loss_clip": 0.01085519, + "auxiliary_loss_mlp": 0.01041118, + "balance_loss_clip": 1.03602147, + "balance_loss_mlp": 1.02500701, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.7694777114674811, + "language_loss": 0.74291408, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.76418042, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.570549488067627 + }, + { + "auxiliary_loss_clip": 0.01074262, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.03796339, + "balance_loss_mlp": 1.0163064, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.823654388150607, + "language_loss": 0.68972212, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71076685, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 2.6268317699432373 + }, + { + "auxiliary_loss_clip": 0.01080381, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.03297043, + "balance_loss_mlp": 1.02643681, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 2.4077429736160854, + "language_loss": 0.82733744, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84855795, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 2.5888795852661133 + }, + { + "auxiliary_loss_clip": 0.01112285, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.03829575, + "balance_loss_mlp": 1.02091491, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.8001307300558638, + "language_loss": 0.73820001, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75967759, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 4.168315172195435 + }, + { + "auxiliary_loss_clip": 0.01095719, + "auxiliary_loss_mlp": 0.01035073, + "balance_loss_clip": 1.03651428, + "balance_loss_mlp": 1.02188826, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.209365264302219, + "language_loss": 0.69100302, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71231091, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.5469255447387695 + }, + { + "auxiliary_loss_clip": 0.01063419, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.03537846, + "balance_loss_mlp": 1.02262533, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.400976344619552, + "language_loss": 0.76804829, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.78906858, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 4.168020009994507 + }, + { + "auxiliary_loss_clip": 0.01074248, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.03594172, + "balance_loss_mlp": 1.02051008, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 1.7858663220047084, + "language_loss": 0.77983248, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80092072, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.6409554481506348 + }, + { + "auxiliary_loss_clip": 0.01073123, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.03735566, + "balance_loss_mlp": 1.01915324, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.8712559594846903, + "language_loss": 0.78172016, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80277586, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 2.671555280685425 + }, + { + "auxiliary_loss_clip": 0.01106337, + "auxiliary_loss_mlp": 0.00749643, + "balance_loss_clip": 1.03557944, + "balance_loss_mlp": 1.00033557, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 2.4973015080645498, + "language_loss": 0.83345771, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.8520174, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 2.591350793838501 + }, + { + "auxiliary_loss_clip": 0.01077991, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.03217876, + "balance_loss_mlp": 1.0183053, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.1497728644942926, + "language_loss": 0.90557039, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92665529, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.6288259029388428 + }, + { + "auxiliary_loss_clip": 0.01004444, + "auxiliary_loss_mlp": 0.01014134, + "balance_loss_clip": 1.01029217, + "balance_loss_mlp": 1.01270986, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.9466984947352194, + "language_loss": 0.53440928, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55459511, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.1407101154327393 + }, + { + "auxiliary_loss_clip": 0.01089599, + "auxiliary_loss_mlp": 0.00749976, + "balance_loss_clip": 1.035622, + "balance_loss_mlp": 1.00042331, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.7396543174286672, + "language_loss": 0.77671033, + "learning_rate": 2.956407517225883e-06, + "loss": 0.79510605, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 2.646756410598755 + }, + { + "auxiliary_loss_clip": 0.01088472, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.03466856, + "balance_loss_mlp": 1.02200198, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.1489756632533505, + "language_loss": 0.79147232, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81270993, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.5917463302612305 + }, + { + "auxiliary_loss_clip": 0.0111207, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.0371145, + "balance_loss_mlp": 1.01930046, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 2.1320135402010956, + "language_loss": 0.84205395, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86351788, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 2.5947105884552 + }, + { + "auxiliary_loss_clip": 0.01096618, + "auxiliary_loss_mlp": 0.01035243, + "balance_loss_clip": 1.03843725, + "balance_loss_mlp": 1.01991296, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.4669716829154713, + "language_loss": 0.72849566, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74981427, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 2.6883504390716553 + }, + { + "auxiliary_loss_clip": 0.01093763, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.03302765, + "balance_loss_mlp": 1.02130425, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 2.2453395417263353, + "language_loss": 0.82577479, + "learning_rate": 2.955039050023368e-06, + "loss": 0.84706128, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.5876519680023193 + }, + { + "auxiliary_loss_clip": 0.0108274, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.03774631, + "balance_loss_mlp": 1.0239265, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 2.319642986198575, + "language_loss": 0.76115829, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78236073, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 2.7062458992004395 + }, + { + "auxiliary_loss_clip": 0.01085281, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.03691185, + "balance_loss_mlp": 1.02112114, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.583377215857063, + "language_loss": 0.83210289, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85329926, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 2.6807236671447754 + }, + { + "auxiliary_loss_clip": 0.01117512, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.03930616, + "balance_loss_mlp": 1.02001178, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.9799530288095017, + "language_loss": 0.6214987, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64302254, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 2.5548508167266846 + }, + { + "auxiliary_loss_clip": 0.01085213, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.03476429, + "balance_loss_mlp": 1.0197674, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 2.7048489901995483, + "language_loss": 0.83572137, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.85690349, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.584390163421631 + }, + { + "auxiliary_loss_clip": 0.01108394, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.03515756, + "balance_loss_mlp": 1.02468276, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 2.008738784442599, + "language_loss": 0.91824949, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93972337, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.557741403579712 + }, + { + "auxiliary_loss_clip": 0.01107352, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.03446174, + "balance_loss_mlp": 1.02320766, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.76296551376746, + "language_loss": 0.73635101, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75779343, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.5557680130004883 + }, + { + "auxiliary_loss_clip": 0.01027557, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.02972412, + "balance_loss_mlp": 1.03096962, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 2.1724027864955104, + "language_loss": 0.64971745, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67046613, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 2.8291354179382324 + }, + { + "auxiliary_loss_clip": 0.01103291, + "auxiliary_loss_mlp": 0.01038414, + "balance_loss_clip": 1.0383178, + "balance_loss_mlp": 1.02242208, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.87346625217174, + "language_loss": 0.72048628, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74190331, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.776134490966797 + }, + { + "auxiliary_loss_clip": 0.01101286, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.03451586, + "balance_loss_mlp": 1.02086902, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.8266627194173293, + "language_loss": 0.73463285, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75599706, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.5458920001983643 + }, + { + "auxiliary_loss_clip": 0.01066087, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.03591442, + "balance_loss_mlp": 1.01890182, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.7541517743035269, + "language_loss": 0.68870854, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.70969659, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 2.653559446334839 + }, + { + "auxiliary_loss_clip": 0.01090965, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.03519678, + "balance_loss_mlp": 1.01775098, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 2.376036197362908, + "language_loss": 0.76073563, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78197694, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.6333768367767334 + }, + { + "auxiliary_loss_clip": 0.0110145, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.03840697, + "balance_loss_mlp": 1.02445841, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.89068099923193, + "language_loss": 0.7367, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.75810987, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 2.5928540229797363 + }, + { + "auxiliary_loss_clip": 0.01072349, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.03516412, + "balance_loss_mlp": 1.02048302, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 2.408798974184482, + "language_loss": 0.80709618, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.82815409, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 2.5687103271484375 + }, + { + "auxiliary_loss_clip": 0.01097193, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.03689456, + "balance_loss_mlp": 1.02197218, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 2.1617425308305958, + "language_loss": 0.81295466, + "learning_rate": 2.950244857154417e-06, + "loss": 0.8342768, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.6352760791778564 + }, + { + "auxiliary_loss_clip": 0.01090836, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.03725934, + "balance_loss_mlp": 1.01936412, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.8016770276361502, + "language_loss": 0.79901117, + "learning_rate": 2.9499021441341e-06, + "loss": 0.8202678, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 2.608532428741455 + }, + { + "auxiliary_loss_clip": 0.01069651, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.03062844, + "balance_loss_mlp": 1.02392316, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.9855596552198613, + "language_loss": 0.74980849, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.7708894, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 2.555638313293457 + }, + { + "auxiliary_loss_clip": 0.01095819, + "auxiliary_loss_mlp": 0.00749773, + "balance_loss_clip": 1.0347178, + "balance_loss_mlp": 1.00038266, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.6309316555620867, + "language_loss": 0.71988505, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.73834097, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.5326344966888428 + }, + { + "auxiliary_loss_clip": 0.01107125, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.03859401, + "balance_loss_mlp": 1.02579427, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.273306568780102, + "language_loss": 0.7919867, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81346947, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 2.569068193435669 + }, + { + "auxiliary_loss_clip": 0.01087508, + "auxiliary_loss_mlp": 0.01038364, + "balance_loss_clip": 1.035851, + "balance_loss_mlp": 1.02376091, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 6.948066196104474, + "language_loss": 0.68126255, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.70252132, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 4.0139384269714355 + }, + { + "auxiliary_loss_clip": 0.01068644, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.0355109, + "balance_loss_mlp": 1.0195992, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 4.077962072017202, + "language_loss": 0.85157979, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87259334, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.5953407287597656 + }, + { + "auxiliary_loss_clip": 0.01074507, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.03461266, + "balance_loss_mlp": 1.0190196, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 2.29410999350865, + "language_loss": 0.72999603, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.75106549, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 2.6315901279449463 + }, + { + "auxiliary_loss_clip": 0.01088043, + "auxiliary_loss_mlp": 0.01041485, + "balance_loss_clip": 1.03476691, + "balance_loss_mlp": 1.02485585, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.4693888882872375, + "language_loss": 0.73888493, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76018023, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.578690528869629 + }, + { + "auxiliary_loss_clip": 0.01063576, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.03137136, + "balance_loss_mlp": 1.02173769, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.7020690019601157, + "language_loss": 0.73241025, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75340176, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 2.743830442428589 + }, + { + "auxiliary_loss_clip": 0.01063161, + "auxiliary_loss_mlp": 0.01041799, + "balance_loss_clip": 1.03395295, + "balance_loss_mlp": 1.02818513, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 2.511320221399442, + "language_loss": 0.77584958, + "learning_rate": 2.946816107593884e-06, + "loss": 0.7968992, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 4.3164286613464355 + }, + { + "auxiliary_loss_clip": 0.0099418, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 1.01259589, + "balance_loss_mlp": 1.00018334, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.7772621599172173, + "language_loss": 0.64801621, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66797161, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.263493776321411 + }, + { + "auxiliary_loss_clip": 0.01089167, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.03275323, + "balance_loss_mlp": 1.01949573, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 4.125111967732871, + "language_loss": 0.89572406, + "learning_rate": 2.946129926425273e-06, + "loss": 0.91694975, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 2.7646982669830322 + }, + { + "auxiliary_loss_clip": 0.0108865, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.0359019, + "balance_loss_mlp": 1.0188942, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7133806691698799, + "language_loss": 0.73695433, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.75817543, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 2.670206308364868 + }, + { + "auxiliary_loss_clip": 0.01090732, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.0336926, + "balance_loss_mlp": 1.01732516, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 2.2796926972637137, + "language_loss": 0.75642675, + "learning_rate": 2.945443601747297e-06, + "loss": 0.77765441, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 4.132031440734863 + }, + { + "auxiliary_loss_clip": 0.01088552, + "auxiliary_loss_mlp": 0.01053151, + "balance_loss_clip": 1.0333333, + "balance_loss_mlp": 1.03674817, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6249112558919292, + "language_loss": 0.78035498, + "learning_rate": 2.945100385624828e-06, + "loss": 0.801772, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.6440682411193848 + }, + { + "auxiliary_loss_clip": 0.01023468, + "auxiliary_loss_mlp": 0.01002524, + "balance_loss_clip": 1.01205444, + "balance_loss_mlp": 1.000772, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.82579497748219, + "language_loss": 0.63404483, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.6543048, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 4.666405916213989 + }, + { + "auxiliary_loss_clip": 0.01081914, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.03456366, + "balance_loss_mlp": 1.02808785, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 2.0011561674406857, + "language_loss": 0.71122545, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73247862, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 2.5769121646881104 + }, + { + "auxiliary_loss_clip": 0.01105583, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.03913951, + "balance_loss_mlp": 1.02246571, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 1.776039189918945, + "language_loss": 0.81173146, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83315146, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 2.5988619327545166 + }, + { + "auxiliary_loss_clip": 0.01091474, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.03585434, + "balance_loss_mlp": 1.02043223, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 1.9316093422391398, + "language_loss": 0.84142375, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86269683, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 2.5896425247192383 + }, + { + "auxiliary_loss_clip": 0.01086837, + "auxiliary_loss_mlp": 0.01040628, + "balance_loss_clip": 1.03634143, + "balance_loss_mlp": 1.02684736, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.621745498710566, + "language_loss": 0.78161836, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80289298, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 2.599977970123291 + }, + { + "auxiliary_loss_clip": 0.01086362, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.04234576, + "balance_loss_mlp": 1.02207756, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 2.5814475310508316, + "language_loss": 0.65318304, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67441571, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 2.610158920288086 + }, + { + "auxiliary_loss_clip": 0.01083227, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.0363493, + "balance_loss_mlp": 1.0187149, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 2.203879378033701, + "language_loss": 0.81037939, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83153665, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.6370441913604736 + }, + { + "auxiliary_loss_clip": 0.01075294, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.03344965, + "balance_loss_mlp": 1.0238235, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 2.019928337244728, + "language_loss": 0.64639211, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66753662, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.8009746074676514 + }, + { + "auxiliary_loss_clip": 0.01076105, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.03408408, + "balance_loss_mlp": 1.02026844, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 2.237136131955455, + "language_loss": 0.77269888, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.79379851, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 2.663400650024414 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01039579, + "balance_loss_clip": 1.03767133, + "balance_loss_mlp": 1.02329516, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.6083778952646304, + "language_loss": 0.79615474, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81765944, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 2.6198084354400635 + }, + { + "auxiliary_loss_clip": 0.01024144, + "auxiliary_loss_mlp": 0.01006858, + "balance_loss_clip": 1.01041842, + "balance_loss_mlp": 1.00541544, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.765180461529902, + "language_loss": 0.52571881, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54602879, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 3.168041467666626 + }, + { + "auxiliary_loss_clip": 0.01078278, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.03716302, + "balance_loss_mlp": 1.02091026, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 2.0905039228903632, + "language_loss": 0.86480129, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88594067, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.689970016479492 + }, + { + "auxiliary_loss_clip": 0.01099236, + "auxiliary_loss_mlp": 0.00749837, + "balance_loss_clip": 1.03848374, + "balance_loss_mlp": 1.00036407, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.7920448599832606, + "language_loss": 0.783149, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80163974, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 2.5399844646453857 + }, + { + "auxiliary_loss_clip": 0.01100243, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.03464222, + "balance_loss_mlp": 1.02063918, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 1.8387011127905508, + "language_loss": 0.82704103, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84838533, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.6012465953826904 + }, + { + "auxiliary_loss_clip": 0.01075136, + "auxiliary_loss_mlp": 0.01032565, + "balance_loss_clip": 1.03361881, + "balance_loss_mlp": 1.02011919, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 2.1672351844994657, + "language_loss": 0.72038835, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74146533, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.706561326980591 + }, + { + "auxiliary_loss_clip": 0.00986298, + "auxiliary_loss_mlp": 0.01009624, + "balance_loss_clip": 1.00984204, + "balance_loss_mlp": 1.00787151, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7748873829708738, + "language_loss": 0.61208051, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63203973, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 3.266035318374634 + }, + { + "auxiliary_loss_clip": 0.01084892, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.03569114, + "balance_loss_mlp": 1.02177477, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 3.232652788496081, + "language_loss": 0.75954854, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78076422, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.5495665073394775 + }, + { + "auxiliary_loss_clip": 0.01111705, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.03658009, + "balance_loss_mlp": 1.02187347, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.978059851240334, + "language_loss": 0.74989933, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77138364, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 2.516106605529785 + }, + { + "auxiliary_loss_clip": 0.01095606, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.04084742, + "balance_loss_mlp": 1.02410936, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 1.8527141540550491, + "language_loss": 0.80222046, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.8235544, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.6026554107666016 + }, + { + "auxiliary_loss_clip": 0.01081007, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.03394544, + "balance_loss_mlp": 1.02004874, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 2.092436170180951, + "language_loss": 0.80110186, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82225454, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.61344838142395 + }, + { + "auxiliary_loss_clip": 0.01079249, + "auxiliary_loss_mlp": 0.00749755, + "balance_loss_clip": 1.03407896, + "balance_loss_mlp": 1.00035977, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 2.1480043799871575, + "language_loss": 0.84700161, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.86529166, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 2.5941619873046875 + }, + { + "auxiliary_loss_clip": 0.01069009, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.0339545, + "balance_loss_mlp": 1.02610421, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 2.595439331329046, + "language_loss": 0.87873387, + "learning_rate": 2.937540586903884e-06, + "loss": 0.89983332, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 2.7223312854766846 + }, + { + "auxiliary_loss_clip": 0.01104623, + "auxiliary_loss_mlp": 0.01037439, + "balance_loss_clip": 1.03975868, + "balance_loss_mlp": 1.0221802, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 1.9792234209098452, + "language_loss": 0.67437834, + "learning_rate": 2.937196549795971e-06, + "loss": 0.69579899, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.5963475704193115 + }, + { + "auxiliary_loss_clip": 0.0109748, + "auxiliary_loss_mlp": 0.01036699, + "balance_loss_clip": 1.04026127, + "balance_loss_mlp": 1.02178049, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.8412666046614, + "language_loss": 0.74847424, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.76981598, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 2.566190481185913 + }, + { + "auxiliary_loss_clip": 0.01087769, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.03775477, + "balance_loss_mlp": 1.01884079, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 2.6273969727268724, + "language_loss": 0.72378093, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74500471, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 2.7112925052642822 + }, + { + "auxiliary_loss_clip": 0.01093356, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.03415859, + "balance_loss_mlp": 1.02177656, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.852848206886864, + "language_loss": 0.67270637, + "learning_rate": 2.936164225292901e-06, + "loss": 0.69399858, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.5866239070892334 + }, + { + "auxiliary_loss_clip": 0.01091307, + "auxiliary_loss_mlp": 0.01045613, + "balance_loss_clip": 1.03588188, + "balance_loss_mlp": 1.03099251, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 2.1216154163770997, + "language_loss": 0.74206549, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76343465, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 2.761033058166504 + }, + { + "auxiliary_loss_clip": 0.01095694, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.03898668, + "balance_loss_mlp": 1.02415466, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 3.93837179693453, + "language_loss": 0.74870664, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77005935, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.710271120071411 + }, + { + "auxiliary_loss_clip": 0.01096296, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.03699803, + "balance_loss_mlp": 1.01570415, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.110387279904445, + "language_loss": 0.76874745, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.78999525, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 4.076513767242432 + }, + { + "auxiliary_loss_clip": 0.01112292, + "auxiliary_loss_mlp": 0.01036287, + "balance_loss_clip": 1.04011703, + "balance_loss_mlp": 1.02366889, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.192826051648146, + "language_loss": 0.70724094, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72872674, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 2.4999797344207764 + }, + { + "auxiliary_loss_clip": 0.01096069, + "auxiliary_loss_mlp": 0.01036882, + "balance_loss_clip": 1.03432035, + "balance_loss_mlp": 1.02264869, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.8021853099358056, + "language_loss": 0.73985445, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76118398, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.5856614112854004 + }, + { + "auxiliary_loss_clip": 0.01092805, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.0373466, + "balance_loss_mlp": 1.01802874, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.8239683039010546, + "language_loss": 0.65763074, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.67887849, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.642789125442505 + }, + { + "auxiliary_loss_clip": 0.01097933, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.03813314, + "balance_loss_mlp": 1.01840901, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.613971069274714, + "language_loss": 0.74009287, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76138932, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.577810287475586 + }, + { + "auxiliary_loss_clip": 0.01098275, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.03508592, + "balance_loss_mlp": 1.01904058, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 3.49386462344125, + "language_loss": 0.884642, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.9059633, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 4.038743734359741 + }, + { + "auxiliary_loss_clip": 0.01099766, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.03879571, + "balance_loss_mlp": 1.01943922, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 1.9209986654020603, + "language_loss": 0.7221176, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74344194, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.5624120235443115 + }, + { + "auxiliary_loss_clip": 0.01044218, + "auxiliary_loss_mlp": 0.0103333, + "balance_loss_clip": 1.03721344, + "balance_loss_mlp": 1.01841712, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 6.008691210706033, + "language_loss": 0.66251671, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68329227, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.7195324897766113 + }, + { + "auxiliary_loss_clip": 0.01079246, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.03573918, + "balance_loss_mlp": 1.0204072, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.6277037782812198, + "language_loss": 0.72902501, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75015831, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.7075066566467285 + }, + { + "auxiliary_loss_clip": 0.01076683, + "auxiliary_loss_mlp": 0.0103736, + "balance_loss_clip": 1.03591681, + "balance_loss_mlp": 1.02232218, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.2533183965414345, + "language_loss": 0.89509046, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91623086, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 2.7334766387939453 + }, + { + "auxiliary_loss_clip": 0.01101665, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.03844297, + "balance_loss_mlp": 1.02322245, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.3934977443284886, + "language_loss": 0.69516265, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71655697, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 4.054387092590332 + }, + { + "auxiliary_loss_clip": 0.01035428, + "auxiliary_loss_mlp": 0.01004022, + "balance_loss_clip": 1.01070487, + "balance_loss_mlp": 1.00259149, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7718527180707615, + "language_loss": 0.61775273, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63814723, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.163806676864624 + }, + { + "auxiliary_loss_clip": 0.01078342, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_clip": 1.03145194, + "balance_loss_mlp": 1.03074026, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 2.217821038416881, + "language_loss": 0.78102148, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80226392, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 4.217296600341797 + }, + { + "auxiliary_loss_clip": 0.01103269, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.03790522, + "balance_loss_mlp": 1.0224359, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.3523859321891165, + "language_loss": 0.6256758, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.6470753, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.9013009071350098 + }, + { + "auxiliary_loss_clip": 0.01084268, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.03643298, + "balance_loss_mlp": 1.02282453, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.8351770371509497, + "language_loss": 0.67274064, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69396663, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.654437780380249 + }, + { + "auxiliary_loss_clip": 0.01088027, + "auxiliary_loss_mlp": 0.00749806, + "balance_loss_clip": 1.03654432, + "balance_loss_mlp": 1.00038052, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.8501839003428542, + "language_loss": 0.74780917, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76618749, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 2.6573774814605713 + }, + { + "auxiliary_loss_clip": 0.01048912, + "auxiliary_loss_mlp": 0.00749538, + "balance_loss_clip": 1.03328609, + "balance_loss_mlp": 1.00020719, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 2.1434723764059758, + "language_loss": 0.82822204, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84620661, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 2.7428386211395264 + }, + { + "auxiliary_loss_clip": 0.01017701, + "auxiliary_loss_mlp": 0.01008071, + "balance_loss_clip": 1.01282156, + "balance_loss_mlp": 1.00678945, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8099267291139021, + "language_loss": 0.59361434, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61387205, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.2561824321746826 + }, + { + "auxiliary_loss_clip": 0.01064103, + "auxiliary_loss_mlp": 0.01039559, + "balance_loss_clip": 1.03352118, + "balance_loss_mlp": 1.02540267, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 1.7959331409268535, + "language_loss": 0.73099935, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75203598, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.624807119369507 + }, + { + "auxiliary_loss_clip": 0.0107964, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.03927231, + "balance_loss_mlp": 1.02507401, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 1.8625414535593774, + "language_loss": 0.77953917, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80071771, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 2.598231077194214 + }, + { + "auxiliary_loss_clip": 0.0108805, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.03321683, + "balance_loss_mlp": 1.01883626, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 2.0058213170343633, + "language_loss": 0.76944149, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79064047, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 2.7256438732147217 + }, + { + "auxiliary_loss_clip": 0.01058444, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.03662181, + "balance_loss_mlp": 1.02360797, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 2.092870509249773, + "language_loss": 0.70536494, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72632968, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 2.6762280464172363 + }, + { + "auxiliary_loss_clip": 0.01109735, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.03891206, + "balance_loss_mlp": 1.02039552, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.5873352696134517, + "language_loss": 0.794312, + "learning_rate": 2.92754912981472e-06, + "loss": 0.81576353, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 2.7181622982025146 + }, + { + "auxiliary_loss_clip": 0.01076986, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.03482604, + "balance_loss_mlp": 1.0224365, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 2.0131595805115996, + "language_loss": 0.71045429, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73157895, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 2.673520565032959 + }, + { + "auxiliary_loss_clip": 0.01085029, + "auxiliary_loss_mlp": 0.01042341, + "balance_loss_clip": 1.04113913, + "balance_loss_mlp": 1.02900732, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.9909372239369707, + "language_loss": 0.74546552, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76673925, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.5888357162475586 + }, + { + "auxiliary_loss_clip": 0.01050894, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.03848815, + "balance_loss_mlp": 1.0222826, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.8046585639930803, + "language_loss": 0.72789872, + "learning_rate": 2.926513837074284e-06, + "loss": 0.7487734, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.745089054107666 + }, + { + "auxiliary_loss_clip": 0.01101526, + "auxiliary_loss_mlp": 0.010433, + "balance_loss_clip": 1.03657937, + "balance_loss_mlp": 1.02873242, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 2.6144993270694603, + "language_loss": 0.78107738, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80252564, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 2.5795180797576904 + }, + { + "auxiliary_loss_clip": 0.01100919, + "auxiliary_loss_mlp": 0.01038551, + "balance_loss_clip": 1.03584254, + "balance_loss_mlp": 1.02482975, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 2.2910262901251865, + "language_loss": 0.74456376, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76595843, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 2.676492691040039 + }, + { + "auxiliary_loss_clip": 0.01117998, + "auxiliary_loss_mlp": 0.01048589, + "balance_loss_clip": 1.0414983, + "balance_loss_mlp": 1.03417015, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.6058642738957276, + "language_loss": 0.78986156, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81152737, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.5561697483062744 + }, + { + "auxiliary_loss_clip": 0.01090863, + "auxiliary_loss_mlp": 0.0074992, + "balance_loss_clip": 1.03755808, + "balance_loss_mlp": 1.00032783, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 3.663176824025432, + "language_loss": 0.73225093, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75065869, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 2.5665395259857178 + }, + { + "auxiliary_loss_clip": 0.01075792, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.03389263, + "balance_loss_mlp": 1.018767, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 1.9014494069549495, + "language_loss": 0.6730932, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69418085, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 2.674424886703491 + }, + { + "auxiliary_loss_clip": 0.01056332, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.03553438, + "balance_loss_mlp": 1.02107882, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.514516829715064, + "language_loss": 0.78026295, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.80117619, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 2.7373039722442627 + }, + { + "auxiliary_loss_clip": 0.01098286, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.0357275, + "balance_loss_mlp": 1.02024364, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.8985137652730222, + "language_loss": 0.73104, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75236607, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 2.6100990772247314 + }, + { + "auxiliary_loss_clip": 0.01089087, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.03725195, + "balance_loss_mlp": 1.02622628, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 5.973452868857225, + "language_loss": 0.84361225, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86489761, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 2.658376693725586 + }, + { + "auxiliary_loss_clip": 0.01078919, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.03436935, + "balance_loss_mlp": 1.01741493, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 2.5794348977688246, + "language_loss": 0.70932209, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.73042756, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 2.6665751934051514 + }, + { + "auxiliary_loss_clip": 0.01089055, + "auxiliary_loss_mlp": 0.01047158, + "balance_loss_clip": 1.03901768, + "balance_loss_mlp": 1.03135061, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.393112040916852, + "language_loss": 0.76135588, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.782718, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.5714471340179443 + }, + { + "auxiliary_loss_clip": 0.0110767, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.03925514, + "balance_loss_mlp": 1.01688588, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.5244177975970619, + "language_loss": 0.69786263, + "learning_rate": 2.922715061101625e-06, + "loss": 0.71926177, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 2.778998613357544 + }, + { + "auxiliary_loss_clip": 0.01072111, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.0379169, + "balance_loss_mlp": 1.02024376, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 2.1881797170781265, + "language_loss": 0.71817636, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73924148, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 2.6814382076263428 + }, + { + "auxiliary_loss_clip": 0.0110424, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.037835, + "balance_loss_mlp": 1.01645124, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 3.085582679273772, + "language_loss": 0.81074136, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83209473, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.6364634037017822 + }, + { + "auxiliary_loss_clip": 0.01119521, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.03952992, + "balance_loss_mlp": 1.01978064, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.8234153732166416, + "language_loss": 0.80652922, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.8280735, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.5626180171966553 + }, + { + "auxiliary_loss_clip": 0.01012854, + "auxiliary_loss_mlp": 0.00746995, + "balance_loss_clip": 1.01702523, + "balance_loss_mlp": 1.00008297, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6927943097404602, + "language_loss": 0.59215581, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.60975432, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 4.690047264099121 + }, + { + "auxiliary_loss_clip": 0.01090138, + "auxiliary_loss_mlp": 0.01032893, + "balance_loss_clip": 1.0381043, + "balance_loss_mlp": 1.01944625, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.9071063871362635, + "language_loss": 0.74446166, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76569194, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.573256731033325 + }, + { + "auxiliary_loss_clip": 0.01106382, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.03963637, + "balance_loss_mlp": 1.01938701, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.7960479511679712, + "language_loss": 0.73821312, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75960791, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.616346836090088 + }, + { + "auxiliary_loss_clip": 0.01042341, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.0334971, + "balance_loss_mlp": 1.02283955, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 2.3615075658077305, + "language_loss": 0.53078425, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55157745, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.6979634761810303 + }, + { + "auxiliary_loss_clip": 0.01099384, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.03717947, + "balance_loss_mlp": 1.01995754, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.5676561250005594, + "language_loss": 0.8063364, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82767427, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.609269857406616 + }, + { + "auxiliary_loss_clip": 0.01061151, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.03479588, + "balance_loss_mlp": 1.02314663, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7407563331703482, + "language_loss": 0.72144461, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74242651, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.7695324420928955 + }, + { + "auxiliary_loss_clip": 0.01104164, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.03872395, + "balance_loss_mlp": 1.0255878, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6127521158337574, + "language_loss": 0.85189009, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87331969, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 4.10091757774353 + }, + { + "auxiliary_loss_clip": 0.01101125, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.03542614, + "balance_loss_mlp": 1.02116799, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 2.548482421396999, + "language_loss": 0.7837292, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80510199, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.5616767406463623 + }, + { + "auxiliary_loss_clip": 0.01106198, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.03751159, + "balance_loss_mlp": 1.02498007, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 3.055057778631034, + "language_loss": 0.66957104, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69104123, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 2.536750316619873 + }, + { + "auxiliary_loss_clip": 0.01079543, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.03330743, + "balance_loss_mlp": 1.01847804, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.653432823164627, + "language_loss": 0.76398879, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78509867, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 4.07507848739624 + }, + { + "auxiliary_loss_clip": 0.01063325, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.03429592, + "balance_loss_mlp": 1.02288818, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.7798451975268998, + "language_loss": 0.63504887, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65604299, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.7112319469451904 + }, + { + "auxiliary_loss_clip": 0.01076779, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.03284454, + "balance_loss_mlp": 1.02031755, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 5.581891087487438, + "language_loss": 0.73271161, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75382149, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.639798164367676 + }, + { + "auxiliary_loss_clip": 0.01105644, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.03821313, + "balance_loss_mlp": 1.0219059, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.8040680185760567, + "language_loss": 0.7236129, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.7450341, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 4.127849102020264 + }, + { + "auxiliary_loss_clip": 0.0108533, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.03709841, + "balance_loss_mlp": 1.01729822, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 2.4047591535371367, + "language_loss": 0.80425709, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82542348, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.6264171600341797 + }, + { + "auxiliary_loss_clip": 0.01070793, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.03546977, + "balance_loss_mlp": 1.02389646, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 2.1549840093420514, + "language_loss": 0.64295375, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66403389, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 2.6571052074432373 + }, + { + "auxiliary_loss_clip": 0.01101394, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.03906679, + "balance_loss_mlp": 1.02299452, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 2.1257656531174898, + "language_loss": 0.72069961, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.74208295, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 2.673659324645996 + }, + { + "auxiliary_loss_clip": 0.0107874, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.03554225, + "balance_loss_mlp": 1.02439928, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.8897118554383014, + "language_loss": 0.69124794, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71242493, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.622305393218994 + }, + { + "auxiliary_loss_clip": 0.01097664, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.03487182, + "balance_loss_mlp": 1.02026606, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 4.519764240588079, + "language_loss": 0.73628479, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.7576139, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 2.5970237255096436 + }, + { + "auxiliary_loss_clip": 0.01088873, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.03634739, + "balance_loss_mlp": 1.02121067, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 2.295932637612306, + "language_loss": 0.73982924, + "learning_rate": 2.915104825441114e-06, + "loss": 0.7610777, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 2.622997760772705 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.03580666, + "balance_loss_mlp": 1.02955008, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 2.5440497384359673, + "language_loss": 0.78511828, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80657947, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 2.570164203643799 + }, + { + "auxiliary_loss_clip": 0.01103188, + "auxiliary_loss_mlp": 0.01042028, + "balance_loss_clip": 1.03712738, + "balance_loss_mlp": 1.0264833, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.766016307709099, + "language_loss": 0.6571703, + "learning_rate": 2.914412150914888e-06, + "loss": 0.67862248, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 2.5400869846343994 + }, + { + "auxiliary_loss_clip": 0.01096444, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.04152346, + "balance_loss_mlp": 1.0244801, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 5.799673109807649, + "language_loss": 0.70453066, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72588444, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 2.7425954341888428 + }, + { + "auxiliary_loss_clip": 0.01094773, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.03972316, + "balance_loss_mlp": 1.02488279, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 4.4335443181631735, + "language_loss": 0.75362813, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77496284, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 2.589421510696411 + }, + { + "auxiliary_loss_clip": 0.01091907, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.03404951, + "balance_loss_mlp": 1.02173972, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.970152107667076, + "language_loss": 0.84418666, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86546284, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.7648098468780518 + }, + { + "auxiliary_loss_clip": 0.01014105, + "auxiliary_loss_mlp": 0.01004356, + "balance_loss_clip": 1.01599336, + "balance_loss_mlp": 1.00311017, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8084030724617217, + "language_loss": 0.60319608, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62338066, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 3.223609685897827 + }, + { + "auxiliary_loss_clip": 0.01070897, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.03431642, + "balance_loss_mlp": 1.0144161, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.669389023617019, + "language_loss": 0.73310542, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.7540952, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.722592830657959 + }, + { + "auxiliary_loss_clip": 0.01096749, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.03445077, + "balance_loss_mlp": 1.01885056, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.889187171991219, + "language_loss": 0.73926687, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76056534, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 2.7038910388946533 + }, + { + "auxiliary_loss_clip": 0.01044019, + "auxiliary_loss_mlp": 0.01044724, + "balance_loss_clip": 1.0331316, + "balance_loss_mlp": 1.02931046, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.731017210118706, + "language_loss": 0.71483469, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73572212, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.7398622035980225 + }, + { + "auxiliary_loss_clip": 0.01078757, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03652418, + "balance_loss_mlp": 1.01834512, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.693854968116931, + "language_loss": 0.74863362, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.76974702, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 2.719590902328491 + }, + { + "auxiliary_loss_clip": 0.01002695, + "auxiliary_loss_mlp": 0.01001205, + "balance_loss_clip": 1.0077889, + "balance_loss_mlp": 0.99985772, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8192531350447523, + "language_loss": 0.58800685, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60804582, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 3.115305185317993 + }, + { + "auxiliary_loss_clip": 0.01077388, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.03378689, + "balance_loss_mlp": 1.02020621, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 2.1658342467735787, + "language_loss": 0.78651291, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.80762792, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 2.595825433731079 + }, + { + "auxiliary_loss_clip": 0.01099744, + "auxiliary_loss_mlp": 0.01041916, + "balance_loss_clip": 1.03542495, + "balance_loss_mlp": 1.02750325, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.283257500794093, + "language_loss": 0.74253201, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76394868, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.5885000228881836 + }, + { + "auxiliary_loss_clip": 0.01060827, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.03113425, + "balance_loss_mlp": 1.02256536, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.9064283575964933, + "language_loss": 0.65076077, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67173755, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 2.7446820735931396 + }, + { + "auxiliary_loss_clip": 0.01074068, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.03418446, + "balance_loss_mlp": 1.02369809, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0410412189638794, + "language_loss": 0.71649718, + "learning_rate": 2.909906390418006e-06, + "loss": 0.7376191, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 2.6039443016052246 + }, + { + "auxiliary_loss_clip": 0.01003519, + "auxiliary_loss_mlp": 0.0100788, + "balance_loss_clip": 1.01216674, + "balance_loss_mlp": 1.00681293, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7472677434671844, + "language_loss": 0.5931775, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61329144, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.216313362121582 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.03580034, + "balance_loss_mlp": 1.01996934, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 5.879930983097808, + "language_loss": 0.75137269, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77269524, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.561835527420044 + }, + { + "auxiliary_loss_clip": 0.01096876, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.03502274, + "balance_loss_mlp": 1.02054954, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 1.8386448160179223, + "language_loss": 0.76946402, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79076546, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 2.576230764389038 + }, + { + "auxiliary_loss_clip": 0.01095391, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.03457928, + "balance_loss_mlp": 1.01916528, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 2.173272798106808, + "language_loss": 0.82128173, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84255576, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.648960828781128 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01036093, + "balance_loss_clip": 1.03582346, + "balance_loss_mlp": 1.02249098, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 2.594163447276035, + "language_loss": 0.77315402, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79452884, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 2.5547146797180176 + }, + { + "auxiliary_loss_clip": 0.01092254, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.03587687, + "balance_loss_mlp": 1.01562035, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.8197336837474074, + "language_loss": 0.76664394, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.78786206, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.5396275520324707 + }, + { + "auxiliary_loss_clip": 0.01084513, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.03519273, + "balance_loss_mlp": 1.02208948, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 2.288017683677058, + "language_loss": 0.80399334, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82520509, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 4.224278688430786 + }, + { + "auxiliary_loss_clip": 0.01081895, + "auxiliary_loss_mlp": 0.00749812, + "balance_loss_clip": 1.03777039, + "balance_loss_mlp": 1.00030363, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.87370692856752, + "language_loss": 0.83335119, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85166824, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.654245376586914 + }, + { + "auxiliary_loss_clip": 0.0109648, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.03600872, + "balance_loss_mlp": 1.01718056, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 1.9658331869219643, + "language_loss": 0.73904479, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76031613, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.6959493160247803 + }, + { + "auxiliary_loss_clip": 0.01114115, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.03828728, + "balance_loss_mlp": 1.02295947, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 2.0453843388474118, + "language_loss": 0.70563662, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72715271, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.5995490550994873 + }, + { + "auxiliary_loss_clip": 0.01085135, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.03559184, + "balance_loss_mlp": 1.02036691, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 3.5276611129087154, + "language_loss": 0.81238204, + "learning_rate": 2.906089268194611e-06, + "loss": 0.833574, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.7385826110839844 + }, + { + "auxiliary_loss_clip": 0.01014587, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.0100565, + "balance_loss_mlp": 1.00241077, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.7842872554327383, + "language_loss": 0.63105607, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65124178, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.2813479900360107 + }, + { + "auxiliary_loss_clip": 0.01054381, + "auxiliary_loss_mlp": 0.01041272, + "balance_loss_clip": 1.03288841, + "balance_loss_mlp": 1.0271337, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 1.9374610254466194, + "language_loss": 0.69924194, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72019839, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 4.301764965057373 + }, + { + "auxiliary_loss_clip": 0.01102033, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.03839064, + "balance_loss_mlp": 1.02216601, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 2.020959926496886, + "language_loss": 0.72284687, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74422872, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 2.6432106494903564 + }, + { + "auxiliary_loss_clip": 0.0108242, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.03546798, + "balance_loss_mlp": 1.01973677, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.9934690243094246, + "language_loss": 0.67927086, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.700427, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.657874345779419 + }, + { + "auxiliary_loss_clip": 0.01099638, + "auxiliary_loss_mlp": 0.01028574, + "balance_loss_clip": 1.03554547, + "balance_loss_mlp": 1.01483512, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.796017104986255, + "language_loss": 0.67805898, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.69934106, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 4.087659597396851 + }, + { + "auxiliary_loss_clip": 0.01083085, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.03391564, + "balance_loss_mlp": 1.02085102, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.9302430234795334, + "language_loss": 0.81959981, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84076661, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.6633076667785645 + }, + { + "auxiliary_loss_clip": 0.01061879, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.03246152, + "balance_loss_mlp": 1.01947629, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.5553455638811946, + "language_loss": 0.76591468, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78687912, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 4.173614263534546 + }, + { + "auxiliary_loss_clip": 0.01111942, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.03579378, + "balance_loss_mlp": 1.01937437, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.075543478172236, + "language_loss": 0.68054724, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.70200229, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 2.5946450233459473 + }, + { + "auxiliary_loss_clip": 0.01085443, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03655064, + "balance_loss_mlp": 1.02136075, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 2.0756128990954976, + "language_loss": 0.71073699, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73193038, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.7576920986175537 + }, + { + "auxiliary_loss_clip": 0.01084238, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.03485227, + "balance_loss_mlp": 1.01870608, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.7104701116911272, + "language_loss": 0.78917909, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81032944, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 2.6683757305145264 + }, + { + "auxiliary_loss_clip": 0.01110647, + "auxiliary_loss_mlp": 0.01037961, + "balance_loss_clip": 1.03646016, + "balance_loss_mlp": 1.02382874, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.7108008843856484, + "language_loss": 0.79254687, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81403291, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 2.6020212173461914 + }, + { + "auxiliary_loss_clip": 0.01084357, + "auxiliary_loss_mlp": 0.00749797, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.00028896, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 2.507554435189913, + "language_loss": 0.79411817, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81245959, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.676213264465332 + }, + { + "auxiliary_loss_clip": 0.01099162, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.03695369, + "balance_loss_mlp": 1.02055454, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 2.053679660362689, + "language_loss": 0.67844367, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.69978213, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.5819308757781982 + }, + { + "auxiliary_loss_clip": 0.01080471, + "auxiliary_loss_mlp": 0.01036567, + "balance_loss_clip": 1.03568625, + "balance_loss_mlp": 1.02167773, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.1303780173356364, + "language_loss": 0.8307023, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.85187268, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 2.660625457763672 + }, + { + "auxiliary_loss_clip": 0.01090755, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.03662717, + "balance_loss_mlp": 1.02046061, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.9562423004119585, + "language_loss": 0.69203639, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71330297, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.613274335861206 + }, + { + "auxiliary_loss_clip": 0.01006941, + "auxiliary_loss_mlp": 0.01000584, + "balance_loss_clip": 1.01160645, + "balance_loss_mlp": 0.99917191, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.7909498179911849, + "language_loss": 0.56881869, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58889395, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 3.0250141620635986 + }, + { + "auxiliary_loss_clip": 0.0108461, + "auxiliary_loss_mlp": 0.01036896, + "balance_loss_clip": 1.03463233, + "balance_loss_mlp": 1.02395034, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 2.0116812949072966, + "language_loss": 0.75587469, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77708977, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 2.6012697219848633 + }, + { + "auxiliary_loss_clip": 0.01092326, + "auxiliary_loss_mlp": 0.00749637, + "balance_loss_clip": 1.03451073, + "balance_loss_mlp": 1.00026071, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 1.746812496139755, + "language_loss": 0.7348448, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75326449, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 2.5563747882843018 + }, + { + "auxiliary_loss_clip": 0.01108733, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.03771555, + "balance_loss_mlp": 1.01855326, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 2.578210668597971, + "language_loss": 0.79335451, + "learning_rate": 2.899486274782127e-06, + "loss": 0.8147521, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.5431180000305176 + }, + { + "auxiliary_loss_clip": 0.01098993, + "auxiliary_loss_mlp": 0.01040327, + "balance_loss_clip": 1.03799534, + "balance_loss_mlp": 1.02618861, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 3.603858759586493, + "language_loss": 0.7635833, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78497648, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 2.5653223991394043 + }, + { + "auxiliary_loss_clip": 0.01090088, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.04039705, + "balance_loss_mlp": 1.01744246, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 2.5602726549931267, + "language_loss": 0.80406725, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82528335, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 2.560349464416504 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.03702807, + "balance_loss_mlp": 1.02482867, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 1.8145736072375598, + "language_loss": 0.58897865, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61039436, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 2.6522161960601807 + }, + { + "auxiliary_loss_clip": 0.01087498, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.03449488, + "balance_loss_mlp": 1.02026248, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 2.3542367977042766, + "language_loss": 0.80516243, + "learning_rate": 2.898094598877435e-06, + "loss": 0.82637781, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.5801711082458496 + }, + { + "auxiliary_loss_clip": 0.01107696, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.0220449, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.9529145703012245, + "language_loss": 0.79636061, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81779099, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 2.5830390453338623 + }, + { + "auxiliary_loss_clip": 0.01100484, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.03852963, + "balance_loss_mlp": 1.026932, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 2.0810665737046503, + "language_loss": 0.88509494, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90649945, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.63578724861145 + }, + { + "auxiliary_loss_clip": 0.01099622, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.03725028, + "balance_loss_mlp": 1.02167809, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 2.452096484606084, + "language_loss": 0.73172009, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75305963, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.6343071460723877 + }, + { + "auxiliary_loss_clip": 0.01075683, + "auxiliary_loss_mlp": 0.01039262, + "balance_loss_clip": 1.03397858, + "balance_loss_mlp": 1.02586281, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.8796970989971673, + "language_loss": 0.75470746, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77585691, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 2.707932949066162 + }, + { + "auxiliary_loss_clip": 0.01044183, + "auxiliary_loss_mlp": 0.01040073, + "balance_loss_clip": 1.03379297, + "balance_loss_mlp": 1.0250411, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 3.9174583907606157, + "language_loss": 0.72042233, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74126488, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.7352283000946045 + }, + { + "auxiliary_loss_clip": 0.01111569, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.03720641, + "balance_loss_mlp": 1.02204931, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 1.741088208769324, + "language_loss": 0.70112807, + "learning_rate": 2.896006063609283e-06, + "loss": 0.72260797, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.5843043327331543 + }, + { + "auxiliary_loss_clip": 0.01087001, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.03531611, + "balance_loss_mlp": 1.01744723, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.6547963775699224, + "language_loss": 0.7815522, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80272847, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 2.64278507232666 + }, + { + "auxiliary_loss_clip": 0.01097606, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.03636718, + "balance_loss_mlp": 1.01910758, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 2.040675077978522, + "language_loss": 0.78628635, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80759549, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.570404529571533 + }, + { + "auxiliary_loss_clip": 0.01016913, + "auxiliary_loss_mlp": 0.01000918, + "balance_loss_clip": 1.01264977, + "balance_loss_mlp": 0.99947608, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 1.0440187130759973, + "language_loss": 0.57441014, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59458846, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 3.1744513511657715 + }, + { + "auxiliary_loss_clip": 0.01101619, + "auxiliary_loss_mlp": 0.00749966, + "balance_loss_clip": 1.0354116, + "balance_loss_mlp": 1.00036168, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 1.7320750907129254, + "language_loss": 0.76506245, + "learning_rate": 2.894613027055066e-06, + "loss": 0.78357834, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 2.5798895359039307 + }, + { + "auxiliary_loss_clip": 0.01067307, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.03366113, + "balance_loss_mlp": 1.02018619, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 2.0086701620056573, + "language_loss": 0.72679198, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74780369, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.733424663543701 + }, + { + "auxiliary_loss_clip": 0.01053255, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.03149533, + "balance_loss_mlp": 1.01731491, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 2.2796810760816784, + "language_loss": 0.76793849, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.7887848, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.7297680377960205 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01037079, + "balance_loss_clip": 1.0383575, + "balance_loss_mlp": 1.02245235, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8617482933351768, + "language_loss": 0.83176315, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85318065, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.652939558029175 + }, + { + "auxiliary_loss_clip": 0.01098224, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.03558159, + "balance_loss_mlp": 1.02026629, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 1.918611189572737, + "language_loss": 0.84685552, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86817658, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 4.250423192977905 + }, + { + "auxiliary_loss_clip": 0.01084805, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.03860235, + "balance_loss_mlp": 1.01952767, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.9850224631927234, + "language_loss": 0.65369916, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67488611, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.656724691390991 + }, + { + "auxiliary_loss_clip": 0.01084839, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.03522122, + "balance_loss_mlp": 1.02051187, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 1.858236394390998, + "language_loss": 0.84454834, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86574519, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.668736696243286 + }, + { + "auxiliary_loss_clip": 0.01082456, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.03526819, + "balance_loss_mlp": 1.01933932, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.5904100094877975, + "language_loss": 0.88509011, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90624726, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 2.6443302631378174 + }, + { + "auxiliary_loss_clip": 0.01062277, + "auxiliary_loss_mlp": 0.01031352, + "balance_loss_clip": 1.03303862, + "balance_loss_mlp": 1.01538372, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 2.1367468831969534, + "language_loss": 0.735111, + "learning_rate": 2.891825326449073e-06, + "loss": 0.75604731, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.7277517318725586 + }, + { + "auxiliary_loss_clip": 0.01108634, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.03586972, + "balance_loss_mlp": 1.01843321, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.539531543244107, + "language_loss": 0.80009967, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82150221, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 4.12614631652832 + }, + { + "auxiliary_loss_clip": 0.01076534, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.03359342, + "balance_loss_mlp": 1.02265382, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 2.225216125509548, + "language_loss": 0.84346348, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86459178, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.6461689472198486 + }, + { + "auxiliary_loss_clip": 0.01087515, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.03586769, + "balance_loss_mlp": 1.01899159, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.903863611055236, + "language_loss": 0.77223825, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79343617, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.8049731254577637 + }, + { + "auxiliary_loss_clip": 0.01087392, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.03594041, + "balance_loss_mlp": 1.01672769, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 2.1057677958412397, + "language_loss": 0.78928828, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81046438, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.591134786605835 + }, + { + "auxiliary_loss_clip": 0.01099131, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.03721714, + "balance_loss_mlp": 1.02292585, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 2.319210427544149, + "language_loss": 0.83576512, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85711282, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 4.212480545043945 + }, + { + "auxiliary_loss_clip": 0.01107133, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.03595138, + "balance_loss_mlp": 1.02190995, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 1.6457278660474648, + "language_loss": 0.64321232, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66464257, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.53171968460083 + }, + { + "auxiliary_loss_clip": 0.01097254, + "auxiliary_loss_mlp": 0.01045518, + "balance_loss_clip": 1.03737795, + "balance_loss_mlp": 1.03255355, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 7.530882081983345, + "language_loss": 0.73939025, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76081795, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 4.091706991195679 + }, + { + "auxiliary_loss_clip": 0.01083814, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.03632045, + "balance_loss_mlp": 1.02185321, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 2.0126104494896326, + "language_loss": 0.80608773, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82727385, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 3.0246310234069824 + }, + { + "auxiliary_loss_clip": 0.01073842, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.03551936, + "balance_loss_mlp": 1.02538586, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 4.0027559692697405, + "language_loss": 0.60503256, + "learning_rate": 2.88868657651991e-06, + "loss": 0.62616193, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.803124189376831 + }, + { + "auxiliary_loss_clip": 0.01101031, + "auxiliary_loss_mlp": 0.01036374, + "balance_loss_clip": 1.0381161, + "balance_loss_mlp": 1.02280831, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 2.2264958325873416, + "language_loss": 0.73102599, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75240004, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 2.5708389282226562 + }, + { + "auxiliary_loss_clip": 0.01084477, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.03417015, + "balance_loss_mlp": 1.0212332, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 4.1601927777102095, + "language_loss": 0.73749363, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.75868887, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.5752878189086914 + }, + { + "auxiliary_loss_clip": 0.01083875, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.03554583, + "balance_loss_mlp": 1.02009487, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.7684312431633566, + "language_loss": 0.8181982, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.8393479, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 2.606271505355835 + }, + { + "auxiliary_loss_clip": 0.01100817, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.03726053, + "balance_loss_mlp": 1.02645421, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 2.1016212919513855, + "language_loss": 0.75453019, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77593935, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.669309139251709 + }, + { + "auxiliary_loss_clip": 0.01091184, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.03376257, + "balance_loss_mlp": 1.02005839, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.9698495529573288, + "language_loss": 0.78058153, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80183893, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 2.636578321456909 + }, + { + "auxiliary_loss_clip": 0.01109315, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.03659725, + "balance_loss_mlp": 1.02055264, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.0271141437809734, + "language_loss": 0.93478012, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95621818, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 2.631263017654419 + }, + { + "auxiliary_loss_clip": 0.01077998, + "auxiliary_loss_mlp": 0.01029549, + "balance_loss_clip": 1.03561258, + "balance_loss_mlp": 1.0169847, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.233785451192819, + "language_loss": 0.82352579, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84460121, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 2.620943069458008 + }, + { + "auxiliary_loss_clip": 0.01095358, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.03456521, + "balance_loss_mlp": 1.01813912, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 2.0096072762285035, + "language_loss": 0.73419309, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75547975, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 2.615046262741089 + }, + { + "auxiliary_loss_clip": 0.01071301, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03616333, + "balance_loss_mlp": 1.01850474, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 2.4767763612525733, + "language_loss": 0.70002377, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.721071, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 2.700307846069336 + }, + { + "auxiliary_loss_clip": 0.01044978, + "auxiliary_loss_mlp": 0.01038551, + "balance_loss_clip": 1.03012824, + "balance_loss_mlp": 1.02169538, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.5911708576901502, + "language_loss": 0.78025627, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.80109155, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.7761197090148926 + }, + { + "auxiliary_loss_clip": 0.01101017, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.03714788, + "balance_loss_mlp": 1.02151453, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.7308283307698715, + "language_loss": 0.72971994, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75108486, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 2.7314774990081787 + }, + { + "auxiliary_loss_clip": 0.01107546, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_clip": 1.041592, + "balance_loss_mlp": 1.02946281, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 2.0384359831033856, + "language_loss": 0.81870311, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84021854, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 2.641162872314453 + }, + { + "auxiliary_loss_clip": 0.01067268, + "auxiliary_loss_mlp": 0.01039963, + "balance_loss_clip": 1.034374, + "balance_loss_mlp": 1.02472854, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.201720150678976, + "language_loss": 0.79003155, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81110382, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 2.656400442123413 + }, + { + "auxiliary_loss_clip": 0.01086744, + "auxiliary_loss_mlp": 0.01039296, + "balance_loss_clip": 1.03547072, + "balance_loss_mlp": 1.02605808, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.8039847322307332, + "language_loss": 0.84811211, + "learning_rate": 2.883798654630296e-06, + "loss": 0.86937249, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 2.75038743019104 + }, + { + "auxiliary_loss_clip": 0.01077317, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.03410029, + "balance_loss_mlp": 1.02438557, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 1.758832173914978, + "language_loss": 0.68041742, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70158517, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.667806386947632 + }, + { + "auxiliary_loss_clip": 0.0108719, + "auxiliary_loss_mlp": 0.01039352, + "balance_loss_clip": 1.03654373, + "balance_loss_mlp": 1.02459443, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.4806606375565217, + "language_loss": 0.65657413, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67783952, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 2.6220340728759766 + }, + { + "auxiliary_loss_clip": 0.01094675, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.03965867, + "balance_loss_mlp": 1.02138829, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 1.8814336296927443, + "language_loss": 0.80599201, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82730103, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.5986292362213135 + }, + { + "auxiliary_loss_clip": 0.0109484, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.03580058, + "balance_loss_mlp": 1.01743746, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.4293838385719497, + "language_loss": 0.78436446, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80561948, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.642864465713501 + }, + { + "auxiliary_loss_clip": 0.01087928, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.03761911, + "balance_loss_mlp": 1.0229764, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 1.7811060249354531, + "language_loss": 0.76801199, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.78926468, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 2.6824305057525635 + }, + { + "auxiliary_loss_clip": 0.01080734, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.03667378, + "balance_loss_mlp": 1.02370262, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.7492868570601945, + "language_loss": 0.83004701, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85123354, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 2.6524267196655273 + }, + { + "auxiliary_loss_clip": 0.01085409, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.03455412, + "balance_loss_mlp": 1.02503073, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.8486180686735247, + "language_loss": 0.76380908, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78505063, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.642700672149658 + }, + { + "auxiliary_loss_clip": 0.01078366, + "auxiliary_loss_mlp": 0.00749973, + "balance_loss_clip": 1.03566885, + "balance_loss_mlp": 1.00051713, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.6306818042611868, + "language_loss": 0.70789039, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72617376, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.753878355026245 + }, + { + "auxiliary_loss_clip": 0.01082918, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.0405091, + "balance_loss_mlp": 1.0199604, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 3.2153142287778844, + "language_loss": 0.69047511, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71163434, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.856234073638916 + }, + { + "auxiliary_loss_clip": 0.01068584, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.03771043, + "balance_loss_mlp": 1.01921177, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.6371228093607857, + "language_loss": 0.69814634, + "learning_rate": 2.880303258086228e-06, + "loss": 0.7191658, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.6125168800354004 + }, + { + "auxiliary_loss_clip": 0.01064673, + "auxiliary_loss_mlp": 0.01036456, + "balance_loss_clip": 1.03496027, + "balance_loss_mlp": 1.02137637, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 17.466586899645165, + "language_loss": 0.78993368, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81094503, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.6504337787628174 + }, + { + "auxiliary_loss_clip": 0.01084846, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.03658652, + "balance_loss_mlp": 1.02069819, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 1.7171841744252685, + "language_loss": 0.67750663, + "learning_rate": 2.879603777778917e-06, + "loss": 0.69870728, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.658278226852417 + }, + { + "auxiliary_loss_clip": 0.01070714, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.03317845, + "balance_loss_mlp": 1.01549923, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.5778024196073335, + "language_loss": 0.82928777, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85028583, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 4.071936130523682 + }, + { + "auxiliary_loss_clip": 0.01066519, + "auxiliary_loss_mlp": 0.01043776, + "balance_loss_clip": 1.03454244, + "balance_loss_mlp": 1.02783787, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.7921324731040562, + "language_loss": 0.74582005, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76692295, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 2.6586720943450928 + }, + { + "auxiliary_loss_clip": 0.010738, + "auxiliary_loss_mlp": 0.01035028, + "balance_loss_clip": 1.03526068, + "balance_loss_mlp": 1.02056217, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 2.313451003938425, + "language_loss": 0.83517325, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85626149, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.6209254264831543 + }, + { + "auxiliary_loss_clip": 0.01099039, + "auxiliary_loss_mlp": 0.01038688, + "balance_loss_clip": 1.03639364, + "balance_loss_mlp": 1.02485418, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 2.749117995730033, + "language_loss": 0.7307592, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75213647, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 2.6274919509887695 + }, + { + "auxiliary_loss_clip": 0.01098196, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.0395844, + "balance_loss_mlp": 1.02358651, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 1.9466611849252604, + "language_loss": 0.73767221, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75903338, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 2.54472279548645 + }, + { + "auxiliary_loss_clip": 0.01079615, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.03325629, + "balance_loss_mlp": 1.01817906, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.662124534411639, + "language_loss": 0.76906079, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79018492, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 4.135308742523193 + }, + { + "auxiliary_loss_clip": 0.01093543, + "auxiliary_loss_mlp": 0.01038001, + "balance_loss_clip": 1.03908908, + "balance_loss_mlp": 1.02366614, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 3.5176450470273846, + "language_loss": 0.68904102, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71035641, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.628042697906494 + }, + { + "auxiliary_loss_clip": 0.01097542, + "auxiliary_loss_mlp": 0.01040524, + "balance_loss_clip": 1.03584087, + "balance_loss_mlp": 1.02710116, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 2.1897237814357027, + "language_loss": 0.82533228, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84671289, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 2.6621692180633545 + }, + { + "auxiliary_loss_clip": 0.01116113, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.04112124, + "balance_loss_mlp": 1.01646352, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 2.2169106828923453, + "language_loss": 0.77623814, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79769808, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 4.082703113555908 + }, + { + "auxiliary_loss_clip": 0.01091022, + "auxiliary_loss_mlp": 0.01040611, + "balance_loss_clip": 1.03385162, + "balance_loss_mlp": 1.0240531, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.0494916560301064, + "language_loss": 0.73165798, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75297433, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.556737184524536 + }, + { + "auxiliary_loss_clip": 0.01088534, + "auxiliary_loss_mlp": 0.00750008, + "balance_loss_clip": 1.0344131, + "balance_loss_mlp": 1.0004735, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 3.442556018032295, + "language_loss": 0.93395376, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95233917, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 4.239748954772949 + }, + { + "auxiliary_loss_clip": 0.01114052, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.03940594, + "balance_loss_mlp": 1.01817918, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 2.123041054660162, + "language_loss": 0.71474314, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73621726, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.506530523300171 + }, + { + "auxiliary_loss_clip": 0.01045123, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.03728557, + "balance_loss_mlp": 1.01978517, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 2.2924571314266293, + "language_loss": 0.65211666, + "learning_rate": 2.875053908444895e-06, + "loss": 0.6729095, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 2.8788349628448486 + }, + { + "auxiliary_loss_clip": 0.01075791, + "auxiliary_loss_mlp": 0.00749915, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.00045037, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.1833347814129014, + "language_loss": 0.75769925, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77595639, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 2.7104709148406982 + }, + { + "auxiliary_loss_clip": 0.01081416, + "auxiliary_loss_mlp": 0.01039181, + "balance_loss_clip": 1.03658581, + "balance_loss_mlp": 1.02397001, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.6406596818586134, + "language_loss": 0.83314252, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85434848, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.6860437393188477 + }, + { + "auxiliary_loss_clip": 0.01089852, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.03827906, + "balance_loss_mlp": 1.02316058, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.6335467682178804, + "language_loss": 0.68141985, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70268595, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.662447690963745 + }, + { + "auxiliary_loss_clip": 0.01031296, + "auxiliary_loss_mlp": 0.00750232, + "balance_loss_clip": 1.03177798, + "balance_loss_mlp": 1.00040603, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 2.093887354441433, + "language_loss": 0.83672416, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85453951, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 2.754671096801758 + }, + { + "auxiliary_loss_clip": 0.01048691, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.02996409, + "balance_loss_mlp": 1.02019262, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 5.470395891959277, + "language_loss": 0.8344807, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85531622, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 2.6195216178894043 + }, + { + "auxiliary_loss_clip": 0.01069263, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.031165, + "balance_loss_mlp": 1.02366281, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 3.549950486231077, + "language_loss": 0.63814324, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.65922284, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 2.721754550933838 + }, + { + "auxiliary_loss_clip": 0.01084101, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.03581905, + "balance_loss_mlp": 1.02197814, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 3.415402827794469, + "language_loss": 0.75107187, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77228761, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 2.5861682891845703 + }, + { + "auxiliary_loss_clip": 0.01101909, + "auxiliary_loss_mlp": 0.01035696, + "balance_loss_clip": 1.03657413, + "balance_loss_mlp": 1.02131939, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 5.000190056662776, + "language_loss": 0.54511613, + "learning_rate": 2.872251199697598e-06, + "loss": 0.5664922, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 2.640108108520508 + }, + { + "auxiliary_loss_clip": 0.01092261, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.03460002, + "balance_loss_mlp": 1.02121949, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 1.7434415994976304, + "language_loss": 0.84329581, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86457574, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.601081132888794 + }, + { + "auxiliary_loss_clip": 0.01085541, + "auxiliary_loss_mlp": 0.01035139, + "balance_loss_clip": 1.03663325, + "balance_loss_mlp": 1.02117383, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.6955529247805279, + "language_loss": 0.67623162, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.69743848, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 2.7930691242218018 + }, + { + "auxiliary_loss_clip": 0.01091587, + "auxiliary_loss_mlp": 0.0103639, + "balance_loss_clip": 1.03698325, + "balance_loss_mlp": 1.02234769, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.0608920132660944, + "language_loss": 0.77727258, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79855233, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 2.6437137126922607 + }, + { + "auxiliary_loss_clip": 0.0109865, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.03763139, + "balance_loss_mlp": 1.02201295, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.0064298464683428, + "language_loss": 0.58223498, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60358083, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.7097504138946533 + }, + { + "auxiliary_loss_clip": 0.01091744, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.03616083, + "balance_loss_mlp": 1.02399242, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 3.624688269933134, + "language_loss": 0.89585972, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.9171682, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 2.5892815589904785 + }, + { + "auxiliary_loss_clip": 0.01075213, + "auxiliary_loss_mlp": 0.01034091, + "balance_loss_clip": 1.0375948, + "balance_loss_mlp": 1.02101982, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.9864660723470455, + "language_loss": 0.76719642, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78828943, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.5861401557922363 + }, + { + "auxiliary_loss_clip": 0.01072144, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.0348227, + "balance_loss_mlp": 1.02832222, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 1.983062126589612, + "language_loss": 0.62232614, + "learning_rate": 2.869797092829169e-06, + "loss": 0.64348906, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 2.571957588195801 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.0357945, + "balance_loss_mlp": 1.01809216, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.7980500907985735, + "language_loss": 0.74185187, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76320565, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.529038906097412 + }, + { + "auxiliary_loss_clip": 0.01102376, + "auxiliary_loss_mlp": 0.01042315, + "balance_loss_clip": 1.03802598, + "balance_loss_mlp": 1.02643669, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 2.0821072938753855, + "language_loss": 0.70000243, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72144938, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 2.555206298828125 + }, + { + "auxiliary_loss_clip": 0.01083834, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.03503406, + "balance_loss_mlp": 1.01418996, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.7293123537474628, + "language_loss": 0.84453183, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86564487, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 2.642000675201416 + }, + { + "auxiliary_loss_clip": 0.01071126, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.03743887, + "balance_loss_mlp": 1.02327418, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.577389126634979, + "language_loss": 0.80678034, + "learning_rate": 2.868394020133277e-06, + "loss": 0.82785738, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.6823513507843018 + }, + { + "auxiliary_loss_clip": 0.01071573, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.03719366, + "balance_loss_mlp": 1.02713943, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 3.0683262364602, + "language_loss": 0.71335155, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73449528, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 2.710415840148926 + }, + { + "auxiliary_loss_clip": 0.01082011, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.03359985, + "balance_loss_mlp": 1.02165389, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.731858329971928, + "language_loss": 0.78554457, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80672836, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.6091103553771973 + }, + { + "auxiliary_loss_clip": 0.01084498, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.0352025, + "balance_loss_mlp": 1.02909088, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 2.050016905269119, + "language_loss": 0.80608362, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82737458, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.753889560699463 + }, + { + "auxiliary_loss_clip": 0.0108802, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.03391731, + "balance_loss_mlp": 1.0183723, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 2.090268022087547, + "language_loss": 0.80417502, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82537842, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.671799659729004 + }, + { + "auxiliary_loss_clip": 0.01114379, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.03912926, + "balance_loss_mlp": 1.02188039, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 1.9133699583908685, + "language_loss": 0.79785419, + "learning_rate": 2.866639438447501e-06, + "loss": 0.81935388, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.536999464035034 + }, + { + "auxiliary_loss_clip": 0.01109305, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.03616142, + "balance_loss_mlp": 1.02759659, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.2390759282935266, + "language_loss": 0.73673582, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75824058, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.54718017578125 + }, + { + "auxiliary_loss_clip": 0.01098653, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.03756762, + "balance_loss_mlp": 1.02089822, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.7920918015597316, + "language_loss": 0.68781126, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70912766, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.6228818893432617 + }, + { + "auxiliary_loss_clip": 0.01104171, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.03628051, + "balance_loss_mlp": 1.02145243, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 2.1896768065709975, + "language_loss": 0.62783653, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.649239, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 4.053948402404785 + }, + { + "auxiliary_loss_clip": 0.01024595, + "auxiliary_loss_mlp": 0.01019577, + "balance_loss_clip": 1.01113105, + "balance_loss_mlp": 1.01787221, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7329888175304291, + "language_loss": 0.58878469, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60922647, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 3.241608142852783 + }, + { + "auxiliary_loss_clip": 0.01112344, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.03717709, + "balance_loss_mlp": 1.0223496, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.587066848660549, + "language_loss": 0.65152436, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67301655, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 2.6153688430786133 + }, + { + "auxiliary_loss_clip": 0.0107736, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.03713024, + "balance_loss_mlp": 1.01998031, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.4458312957279242, + "language_loss": 0.70334011, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.7244609, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.648580312728882 + }, + { + "auxiliary_loss_clip": 0.01034137, + "auxiliary_loss_mlp": 0.01004588, + "balance_loss_clip": 1.01029634, + "balance_loss_mlp": 1.00309837, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7134071298740905, + "language_loss": 0.56059593, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58098316, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.0744237899780273 + }, + { + "auxiliary_loss_clip": 0.01098613, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.03595877, + "balance_loss_mlp": 1.02005112, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 2.1570029796514985, + "language_loss": 0.79880071, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82013655, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 4.0352771282196045 + }, + { + "auxiliary_loss_clip": 0.01094941, + "auxiliary_loss_mlp": 0.01027238, + "balance_loss_clip": 1.03417468, + "balance_loss_mlp": 1.01487625, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 3.297767567575407, + "language_loss": 0.73782074, + "learning_rate": 2.863479122159103e-06, + "loss": 0.7590425, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.550571918487549 + }, + { + "auxiliary_loss_clip": 0.01095975, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.03649044, + "balance_loss_mlp": 1.02505946, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 2.1459236393842525, + "language_loss": 0.72013861, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.7414844, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 2.5245330333709717 + }, + { + "auxiliary_loss_clip": 0.01080167, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.03583753, + "balance_loss_mlp": 1.01966298, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 2.5583423133268197, + "language_loss": 0.83707106, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.8582027, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 2.5745432376861572 + }, + { + "auxiliary_loss_clip": 0.0105165, + "auxiliary_loss_mlp": 0.01026222, + "balance_loss_clip": 1.0327394, + "balance_loss_mlp": 1.01439667, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 3.208914434125794, + "language_loss": 0.75403297, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77481174, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.7174482345581055 + }, + { + "auxiliary_loss_clip": 0.01084842, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03381407, + "balance_loss_mlp": 1.01690459, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 3.763434746416746, + "language_loss": 0.85363877, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87479985, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 4.217817783355713 + }, + { + "auxiliary_loss_clip": 0.01101263, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.04037571, + "balance_loss_mlp": 1.0194025, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 1.866615100234697, + "language_loss": 0.78194463, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80327797, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 4.258950710296631 + }, + { + "auxiliary_loss_clip": 0.01080188, + "auxiliary_loss_mlp": 0.01038327, + "balance_loss_clip": 1.03668141, + "balance_loss_mlp": 1.02316916, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.9068080340787774, + "language_loss": 0.83301294, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85419804, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.681565999984741 + }, + { + "auxiliary_loss_clip": 0.01086673, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.03425121, + "balance_loss_mlp": 1.01601291, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 1.8930258720922684, + "language_loss": 0.74707592, + "learning_rate": 2.861019264262269e-06, + "loss": 0.76822883, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 2.678802728652954 + }, + { + "auxiliary_loss_clip": 0.01109505, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.03926146, + "balance_loss_mlp": 1.02186894, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.367977495350148, + "language_loss": 0.76127213, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78270787, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 2.6253609657287598 + }, + { + "auxiliary_loss_clip": 0.01076718, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.03356099, + "balance_loss_mlp": 1.01625276, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 2.0096575800717797, + "language_loss": 0.84189355, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86295295, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 2.612818717956543 + }, + { + "auxiliary_loss_clip": 0.01096796, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.03551543, + "balance_loss_mlp": 1.01468158, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.682995924998417, + "language_loss": 0.69629061, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.71753764, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 2.546816825866699 + }, + { + "auxiliary_loss_clip": 0.01049548, + "auxiliary_loss_mlp": 0.0104995, + "balance_loss_clip": 1.03848279, + "balance_loss_mlp": 1.03383267, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 2.332500213593082, + "language_loss": 0.75861573, + "learning_rate": 2.859612912586581e-06, + "loss": 0.77961075, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 2.7410073280334473 + }, + { + "auxiliary_loss_clip": 0.01115019, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.03895044, + "balance_loss_mlp": 1.01601386, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 3.3503129160043015, + "language_loss": 0.84949076, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.87094027, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.5019209384918213 + }, + { + "auxiliary_loss_clip": 0.01089511, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.03779817, + "balance_loss_mlp": 1.02180743, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.8773420872228708, + "language_loss": 0.84083956, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86209524, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 2.5642707347869873 + }, + { + "auxiliary_loss_clip": 0.0109325, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.0366621, + "balance_loss_mlp": 1.02292752, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.1784232568576223, + "language_loss": 0.81798244, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83927715, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 2.5148229598999023 + }, + { + "auxiliary_loss_clip": 0.01091087, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.03334081, + "balance_loss_mlp": 1.02122366, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.425184581970353, + "language_loss": 0.73053885, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75180054, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 2.606584310531616 + }, + { + "auxiliary_loss_clip": 0.01101413, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.03925943, + "balance_loss_mlp": 1.01786232, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.911423518118128, + "language_loss": 0.75408489, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77540946, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.5943686962127686 + }, + { + "auxiliary_loss_clip": 0.01098411, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.03707695, + "balance_loss_mlp": 1.02028871, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 2.392372994912622, + "language_loss": 0.73298866, + "learning_rate": 2.857502407441593e-06, + "loss": 0.7543028, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.5487685203552246 + }, + { + "auxiliary_loss_clip": 0.01071989, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.0338074, + "balance_loss_mlp": 1.02379656, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.3993839285516754, + "language_loss": 0.79426873, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81537771, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 2.640984296798706 + }, + { + "auxiliary_loss_clip": 0.0107665, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.03449667, + "balance_loss_mlp": 1.01915228, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.685747794570896, + "language_loss": 0.75953853, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78063989, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.590282678604126 + }, + { + "auxiliary_loss_clip": 0.01088317, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.03384781, + "balance_loss_mlp": 1.02946162, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 2.461901204150009, + "language_loss": 0.69475996, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71608019, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.5097670555114746 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.03555679, + "balance_loss_mlp": 1.02226937, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.8744842088776645, + "language_loss": 0.71211928, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.7335543, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 2.4804952144622803 + }, + { + "auxiliary_loss_clip": 0.01088367, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.03539586, + "balance_loss_mlp": 1.01878333, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.4330592135946114, + "language_loss": 0.82391989, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84513688, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.549837112426758 + }, + { + "auxiliary_loss_clip": 0.0108879, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.03356481, + "balance_loss_mlp": 1.02183437, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.7929615860922383, + "language_loss": 0.71425867, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73550165, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 2.673875331878662 + }, + { + "auxiliary_loss_clip": 0.01108145, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.03752112, + "balance_loss_mlp": 1.02580428, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.7410065737809215, + "language_loss": 0.76716506, + "learning_rate": 2.855038672137396e-06, + "loss": 0.78863752, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 2.5544686317443848 + }, + { + "auxiliary_loss_clip": 0.01082976, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.03462052, + "balance_loss_mlp": 1.02232027, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 2.022767781859603, + "language_loss": 0.78993148, + "learning_rate": 2.854686580151684e-06, + "loss": 0.8111136, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 2.5744736194610596 + }, + { + "auxiliary_loss_clip": 0.01047319, + "auxiliary_loss_mlp": 0.01048185, + "balance_loss_clip": 1.03095341, + "balance_loss_mlp": 1.03278947, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 1.6022063038905894, + "language_loss": 0.84461498, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86556995, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.735912322998047 + }, + { + "auxiliary_loss_clip": 0.01076539, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.03616989, + "balance_loss_mlp": 1.01725578, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.029023549797633, + "language_loss": 0.76131535, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78238976, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.604053258895874 + }, + { + "auxiliary_loss_clip": 0.01094192, + "auxiliary_loss_mlp": 0.01036277, + "balance_loss_clip": 1.03658926, + "balance_loss_mlp": 1.02107787, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 1.957883377466313, + "language_loss": 0.82865083, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84995544, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.5136027336120605 + }, + { + "auxiliary_loss_clip": 0.01094882, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.03489566, + "balance_loss_mlp": 1.02269173, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.7949631217385558, + "language_loss": 0.67797428, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69928056, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.5440077781677246 + }, + { + "auxiliary_loss_clip": 0.01061918, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.03389668, + "balance_loss_mlp": 1.02326131, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.9216999479568715, + "language_loss": 0.68275428, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70373195, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.700812339782715 + }, + { + "auxiliary_loss_clip": 0.01108512, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.0359118, + "balance_loss_mlp": 1.01766515, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.5949996259376957, + "language_loss": 0.7761389, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79753488, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.6505515575408936 + }, + { + "auxiliary_loss_clip": 0.01116153, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.03927898, + "balance_loss_mlp": 1.02055717, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 1.8601710198461987, + "language_loss": 0.80176407, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82328099, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.5858981609344482 + }, + { + "auxiliary_loss_clip": 0.01028442, + "auxiliary_loss_mlp": 0.01009004, + "balance_loss_clip": 1.01403069, + "balance_loss_mlp": 1.0071677, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9775090655498873, + "language_loss": 0.64608371, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66645813, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 4.502970218658447 + }, + { + "auxiliary_loss_clip": 0.01081802, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.03533077, + "balance_loss_mlp": 1.03223276, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.5806376525929267, + "language_loss": 0.7347399, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75604033, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 2.6534996032714844 + }, + { + "auxiliary_loss_clip": 0.01086768, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.03626001, + "balance_loss_mlp": 1.02284622, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.7057723130506444, + "language_loss": 0.78576636, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80700564, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.5939228534698486 + }, + { + "auxiliary_loss_clip": 0.01073284, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.03459835, + "balance_loss_mlp": 1.02164769, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.172513286619333, + "language_loss": 0.7278111, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.74890155, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.601567268371582 + }, + { + "auxiliary_loss_clip": 0.01046166, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.03096235, + "balance_loss_mlp": 1.02328467, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4967021622703047, + "language_loss": 0.78911781, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.809955, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.62247896194458 + }, + { + "auxiliary_loss_clip": 0.01097747, + "auxiliary_loss_mlp": 0.0074965, + "balance_loss_clip": 1.03394032, + "balance_loss_mlp": 1.00028872, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 2.099247747266839, + "language_loss": 0.76164138, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.78011537, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.53767728805542 + }, + { + "auxiliary_loss_clip": 0.01084935, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03536296, + "balance_loss_mlp": 1.0194478, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 2.0855632070500962, + "language_loss": 0.70924509, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73042023, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 2.543219804763794 + }, + { + "auxiliary_loss_clip": 0.01006043, + "auxiliary_loss_mlp": 0.01005429, + "balance_loss_clip": 1.01144576, + "balance_loss_mlp": 1.00370014, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7867161913275381, + "language_loss": 0.5614568, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58157158, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 4.613285303115845 + }, + { + "auxiliary_loss_clip": 0.01071739, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.03378916, + "balance_loss_mlp": 1.02519679, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 1.6883495602687244, + "language_loss": 0.7139886, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73509216, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.7452690601348877 + }, + { + "auxiliary_loss_clip": 0.01105525, + "auxiliary_loss_mlp": 0.01036627, + "balance_loss_clip": 1.03748965, + "balance_loss_mlp": 1.02143419, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 2.1436122339540136, + "language_loss": 0.73285913, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75428057, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 2.5195298194885254 + }, + { + "auxiliary_loss_clip": 0.01095926, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.03587377, + "balance_loss_mlp": 1.0222044, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 1.9520168415874082, + "language_loss": 0.70842719, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.72973967, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.6867597103118896 + }, + { + "auxiliary_loss_clip": 0.01078872, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.03733826, + "balance_loss_mlp": 1.01950622, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8668480176418443, + "language_loss": 0.65233046, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67343801, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 4.2310264110565186 + }, + { + "auxiliary_loss_clip": 0.01094444, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.03371, + "balance_loss_mlp": 1.01914859, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.275875300067928, + "language_loss": 0.85992575, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.88117963, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 4.020153284072876 + }, + { + "auxiliary_loss_clip": 0.01087835, + "auxiliary_loss_mlp": 0.01038416, + "balance_loss_clip": 1.03577232, + "balance_loss_mlp": 1.02411687, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 1.989473680915023, + "language_loss": 0.76188409, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.7831465, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.591636896133423 + }, + { + "auxiliary_loss_clip": 0.01111787, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.03820419, + "balance_loss_mlp": 1.0238483, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.7190486161431826, + "language_loss": 0.63731915, + "learning_rate": 2.846932380444744e-06, + "loss": 0.65880775, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 2.5383119583129883 + }, + { + "auxiliary_loss_clip": 0.01081235, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.04437673, + "balance_loss_mlp": 1.02663636, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.7536558204296266, + "language_loss": 0.71086228, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73207772, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 2.7932486534118652 + }, + { + "auxiliary_loss_clip": 0.01071321, + "auxiliary_loss_mlp": 0.0103696, + "balance_loss_clip": 1.03311205, + "balance_loss_mlp": 1.0233705, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 2.0033670926199365, + "language_loss": 0.74112058, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76220334, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 2.686615228652954 + }, + { + "auxiliary_loss_clip": 0.0109903, + "auxiliary_loss_mlp": 0.01038121, + "balance_loss_clip": 1.03575921, + "balance_loss_mlp": 1.0244354, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.0198928057751813, + "language_loss": 0.85078126, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87215275, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 2.5665364265441895 + }, + { + "auxiliary_loss_clip": 0.01086611, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.03520489, + "balance_loss_mlp": 1.01962817, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.1378716296244757, + "language_loss": 0.73281252, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75402045, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 2.612628698348999 + }, + { + "auxiliary_loss_clip": 0.01081255, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.0361495, + "balance_loss_mlp": 1.02069998, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.9664871570746847, + "language_loss": 0.84005755, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86121988, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 2.70976185798645 + }, + { + "auxiliary_loss_clip": 0.01088731, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.03826809, + "balance_loss_mlp": 1.01877177, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 2.0328295424647305, + "language_loss": 0.79948294, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.82068896, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 2.5893709659576416 + }, + { + "auxiliary_loss_clip": 0.01096488, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.03503621, + "balance_loss_mlp": 1.02241874, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.7678654352673016, + "language_loss": 0.7264266, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74774128, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 2.658862352371216 + }, + { + "auxiliary_loss_clip": 0.0111105, + "auxiliary_loss_mlp": 0.00749583, + "balance_loss_clip": 1.03856349, + "balance_loss_mlp": 1.00018644, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.6528635656035906, + "language_loss": 0.82950509, + "learning_rate": 2.844108810081459e-06, + "loss": 0.84811145, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.6014139652252197 + }, + { + "auxiliary_loss_clip": 0.01098049, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.03616285, + "balance_loss_mlp": 1.01723576, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.454270697094106, + "language_loss": 0.61135674, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63263941, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.560814619064331 + }, + { + "auxiliary_loss_clip": 0.01078893, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.03441846, + "balance_loss_mlp": 1.02055979, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 2.8374014231550615, + "language_loss": 0.55890048, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58003175, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 2.531395673751831 + }, + { + "auxiliary_loss_clip": 0.01069902, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.03940845, + "balance_loss_mlp": 1.02144516, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.4768007097159772, + "language_loss": 0.65512419, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.67616224, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.675394058227539 + }, + { + "auxiliary_loss_clip": 0.01099422, + "auxiliary_loss_mlp": 0.0104155, + "balance_loss_clip": 1.04100132, + "balance_loss_mlp": 1.02766824, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.7516789950686134, + "language_loss": 0.75186348, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77327323, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 2.5282812118530273 + }, + { + "auxiliary_loss_clip": 0.01051605, + "auxiliary_loss_mlp": 0.0074985, + "balance_loss_clip": 1.03580618, + "balance_loss_mlp": 1.00027633, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.1925504893359067, + "language_loss": 0.81767029, + "learning_rate": 2.842343037886987e-06, + "loss": 0.83568478, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 2.6952037811279297 + }, + { + "auxiliary_loss_clip": 0.01098248, + "auxiliary_loss_mlp": 0.01034026, + "balance_loss_clip": 1.03637445, + "balance_loss_mlp": 1.02095485, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.7524313273281966, + "language_loss": 0.86355734, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88488013, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 2.6186113357543945 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.03610659, + "balance_loss_mlp": 1.02150106, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.2462572449636538, + "language_loss": 0.79212737, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81348902, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 2.579580545425415 + }, + { + "auxiliary_loss_clip": 0.01101011, + "auxiliary_loss_mlp": 0.01033055, + "balance_loss_clip": 1.03630412, + "balance_loss_mlp": 1.01921487, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 2.7047510107626866, + "language_loss": 0.72973216, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75107288, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 2.6691861152648926 + }, + { + "auxiliary_loss_clip": 0.01097054, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.03676438, + "balance_loss_mlp": 1.02017307, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.9525757084569866, + "language_loss": 0.69709384, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71839499, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.5877089500427246 + }, + { + "auxiliary_loss_clip": 0.01087177, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.03591692, + "balance_loss_mlp": 1.02043462, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.8193962243225779, + "language_loss": 0.63457489, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65578836, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.69814133644104 + }, + { + "auxiliary_loss_clip": 0.0108898, + "auxiliary_loss_mlp": 0.01038065, + "balance_loss_clip": 1.03576255, + "balance_loss_mlp": 1.02386689, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.6866472120993006, + "language_loss": 0.69124097, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71251142, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.521754264831543 + }, + { + "auxiliary_loss_clip": 0.01083876, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.03651047, + "balance_loss_mlp": 1.02412069, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.0787652192823454, + "language_loss": 0.68402445, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70524049, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.5859358310699463 + }, + { + "auxiliary_loss_clip": 0.01077688, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.03569496, + "balance_loss_mlp": 1.02184463, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 1.9745955628073368, + "language_loss": 0.89590102, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91703814, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.5859293937683105 + }, + { + "auxiliary_loss_clip": 0.01105003, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.03907561, + "balance_loss_mlp": 1.02247953, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.690208950848202, + "language_loss": 0.74656785, + "learning_rate": 2.83916263673333e-06, + "loss": 0.76799107, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.5648200511932373 + }, + { + "auxiliary_loss_clip": 0.01091079, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.03760886, + "balance_loss_mlp": 1.02171564, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.7182879365362898, + "language_loss": 0.83848786, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85975403, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.5796406269073486 + }, + { + "auxiliary_loss_clip": 0.01049346, + "auxiliary_loss_mlp": 0.01042799, + "balance_loss_clip": 1.03574383, + "balance_loss_mlp": 1.02811241, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 2.438699039907213, + "language_loss": 0.77242839, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79334986, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.6771581172943115 + }, + { + "auxiliary_loss_clip": 0.01075303, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.03646171, + "balance_loss_mlp": 1.02628446, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.053644807112756, + "language_loss": 0.73237735, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75355107, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 2.619746446609497 + }, + { + "auxiliary_loss_clip": 0.01074755, + "auxiliary_loss_mlp": 0.0074959, + "balance_loss_clip": 1.0360837, + "balance_loss_mlp": 1.00022042, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.9387124892733025, + "language_loss": 0.6950177, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71326125, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 4.080588340759277 + }, + { + "auxiliary_loss_clip": 0.01101311, + "auxiliary_loss_mlp": 0.0103553, + "balance_loss_clip": 1.03935599, + "balance_loss_mlp": 1.02238679, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.8763851179769535, + "language_loss": 0.75626016, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.7776286, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.5679078102111816 + }, + { + "auxiliary_loss_clip": 0.01101013, + "auxiliary_loss_mlp": 0.01034574, + "balance_loss_clip": 1.03698492, + "balance_loss_mlp": 1.02216387, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.6195966049499448, + "language_loss": 0.74429834, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76565421, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.515833616256714 + }, + { + "auxiliary_loss_clip": 0.01088694, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.03797185, + "balance_loss_mlp": 1.02095413, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.3824169818997527, + "language_loss": 0.87083936, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89207309, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.7981655597686768 + }, + { + "auxiliary_loss_clip": 0.01098428, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.03776395, + "balance_loss_mlp": 1.02166152, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 1.768991740501926, + "language_loss": 0.7674197, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78875387, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.667757034301758 + }, + { + "auxiliary_loss_clip": 0.01078105, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.03581238, + "balance_loss_mlp": 1.01473773, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.273341908028449, + "language_loss": 0.76689243, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78796613, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 2.654707431793213 + }, + { + "auxiliary_loss_clip": 0.0109875, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.03684568, + "balance_loss_mlp": 1.01988649, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.7682764685893202, + "language_loss": 0.74061155, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76194477, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.6947085857391357 + }, + { + "auxiliary_loss_clip": 0.01072928, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03549409, + "balance_loss_mlp": 1.01895118, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.7334314087562128, + "language_loss": 0.642735, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66377866, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 4.123847007751465 + }, + { + "auxiliary_loss_clip": 0.0111208, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.03843307, + "balance_loss_mlp": 1.01853406, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 3.365407239378744, + "language_loss": 0.83193761, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85337704, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.5822246074676514 + }, + { + "auxiliary_loss_clip": 0.01113601, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.04131854, + "balance_loss_mlp": 1.01817727, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.825382625945985, + "language_loss": 0.8058539, + "learning_rate": 2.834564176091943e-06, + "loss": 0.827299, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.526639223098755 + }, + { + "auxiliary_loss_clip": 0.01066037, + "auxiliary_loss_mlp": 0.01036083, + "balance_loss_clip": 1.03463745, + "balance_loss_mlp": 1.02235579, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.8861714649971184, + "language_loss": 0.75273472, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77375591, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.691887855529785 + }, + { + "auxiliary_loss_clip": 0.01104033, + "auxiliary_loss_mlp": 0.00749756, + "balance_loss_clip": 1.03996038, + "balance_loss_mlp": 1.00031567, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 2.199236963080413, + "language_loss": 0.80986643, + "learning_rate": 2.833856245169348e-06, + "loss": 0.82840437, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 4.196283578872681 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.04400992, + "balance_loss_mlp": 1.02778971, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 1.8313386258066977, + "language_loss": 0.77482373, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.79623616, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 4.089383840560913 + }, + { + "auxiliary_loss_clip": 0.01090487, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.03581381, + "balance_loss_mlp": 1.02325773, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.067965596788882, + "language_loss": 0.78649312, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.8077696, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 2.654891014099121 + }, + { + "auxiliary_loss_clip": 0.01041507, + "auxiliary_loss_mlp": 0.01046088, + "balance_loss_clip": 1.03204477, + "balance_loss_mlp": 1.03047132, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 2.0504622020626684, + "language_loss": 0.69642961, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71730554, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 2.957209587097168 + }, + { + "auxiliary_loss_clip": 0.01079559, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.03524256, + "balance_loss_mlp": 1.01887238, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 4.078065842062511, + "language_loss": 0.78753436, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80865884, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 2.6429810523986816 + }, + { + "auxiliary_loss_clip": 0.01087513, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.03617084, + "balance_loss_mlp": 1.02227521, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 3.2425323768359626, + "language_loss": 0.65088481, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67211759, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 2.7954373359680176 + }, + { + "auxiliary_loss_clip": 0.01112645, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.0389235, + "balance_loss_mlp": 1.01771593, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 2.015546232298988, + "language_loss": 0.81662059, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.83807003, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 2.4888970851898193 + }, + { + "auxiliary_loss_clip": 0.01057644, + "auxiliary_loss_mlp": 0.01037911, + "balance_loss_clip": 1.03820288, + "balance_loss_mlp": 1.0235225, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.715399062817991, + "language_loss": 0.59140831, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.61236387, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 2.887225389480591 + }, + { + "auxiliary_loss_clip": 0.01095332, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.03837764, + "balance_loss_mlp": 1.01956296, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.1717814634465293, + "language_loss": 0.68854463, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.70984173, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.61568021774292 + }, + { + "auxiliary_loss_clip": 0.01099747, + "auxiliary_loss_mlp": 0.01032671, + "balance_loss_clip": 1.03914809, + "balance_loss_mlp": 1.01816332, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 6.38108332606933, + "language_loss": 0.73424393, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75556815, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 2.5815417766571045 + }, + { + "auxiliary_loss_clip": 0.01091141, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.03670275, + "balance_loss_mlp": 1.01970696, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.7894396656012286, + "language_loss": 0.68078995, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70203733, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.587836503982544 + }, + { + "auxiliary_loss_clip": 0.01100188, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.0381012, + "balance_loss_mlp": 1.01808405, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 2.291426334423592, + "language_loss": 0.63792378, + "learning_rate": 2.82996036715143e-06, + "loss": 0.65924692, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 2.5447044372558594 + }, + { + "auxiliary_loss_clip": 0.01115373, + "auxiliary_loss_mlp": 0.01036034, + "balance_loss_clip": 1.04102421, + "balance_loss_mlp": 1.0217762, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.3038838168044249, + "language_loss": 0.68207443, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70358849, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 2.553947925567627 + }, + { + "auxiliary_loss_clip": 0.01058911, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.03259695, + "balance_loss_mlp": 1.02359104, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.6632247567009852, + "language_loss": 0.78589278, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80685955, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.834886074066162 + }, + { + "auxiliary_loss_clip": 0.01094031, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.03642249, + "balance_loss_mlp": 1.02810121, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 4.627609877565258, + "language_loss": 0.64051497, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.6618917, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 2.628530263900757 + }, + { + "auxiliary_loss_clip": 0.01085272, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.02202892, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.7373201951157449, + "language_loss": 0.72679245, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.74801385, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 2.635009527206421 + }, + { + "auxiliary_loss_clip": 0.01104366, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.03860331, + "balance_loss_mlp": 1.01900196, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.8938264147162052, + "language_loss": 0.84988487, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87126142, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 2.5556840896606445 + }, + { + "auxiliary_loss_clip": 0.01059998, + "auxiliary_loss_mlp": 0.01044555, + "balance_loss_clip": 1.03464961, + "balance_loss_mlp": 1.02972531, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 2.198588612664093, + "language_loss": 0.74688089, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76792645, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.7478506565093994 + }, + { + "auxiliary_loss_clip": 0.01102714, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.0401063, + "balance_loss_mlp": 1.02282166, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 3.3250358144096306, + "language_loss": 0.75671291, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77811009, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 2.6211626529693604 + }, + { + "auxiliary_loss_clip": 0.01103052, + "auxiliary_loss_mlp": 0.01035703, + "balance_loss_clip": 1.03851652, + "balance_loss_mlp": 1.02179146, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 1.930449088676351, + "language_loss": 0.72431391, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74570143, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 2.5488014221191406 + }, + { + "auxiliary_loss_clip": 0.01099598, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.03709614, + "balance_loss_mlp": 1.02638221, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 2.3933773068451747, + "language_loss": 0.68090618, + "learning_rate": 2.826769997289796e-06, + "loss": 0.70230854, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.608919858932495 + }, + { + "auxiliary_loss_clip": 0.01085945, + "auxiliary_loss_mlp": 0.01039229, + "balance_loss_clip": 1.04025388, + "balance_loss_mlp": 1.02428603, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 1.7761107865855161, + "language_loss": 0.73366159, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75491333, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.5831196308135986 + }, + { + "auxiliary_loss_clip": 0.01062257, + "auxiliary_loss_mlp": 0.01038058, + "balance_loss_clip": 1.03702426, + "balance_loss_mlp": 1.02501607, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.5773280558461784, + "language_loss": 0.68780053, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.70880365, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.642298698425293 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01037274, + "balance_loss_clip": 1.04011393, + "balance_loss_mlp": 1.02296948, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.8165145404906797, + "language_loss": 0.83228886, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.8536979, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.51842999458313 + }, + { + "auxiliary_loss_clip": 0.01112309, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.04060698, + "balance_loss_mlp": 1.0213933, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.9288478671048417, + "language_loss": 0.8107506, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83222109, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.701988935470581 + }, + { + "auxiliary_loss_clip": 0.01040555, + "auxiliary_loss_mlp": 0.01023922, + "balance_loss_clip": 1.01552224, + "balance_loss_mlp": 1.02246141, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.8082069376280885, + "language_loss": 0.60413384, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62477863, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.0457749366760254 + }, + { + "auxiliary_loss_clip": 0.01114849, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.03812337, + "balance_loss_mlp": 1.01861, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.42020754259156, + "language_loss": 0.66491598, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68639553, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 2.6080024242401123 + }, + { + "auxiliary_loss_clip": 0.01076064, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.03594041, + "balance_loss_mlp": 1.01876235, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 1.8134821184925898, + "language_loss": 0.74741024, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76849627, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.655829668045044 + }, + { + "auxiliary_loss_clip": 0.01098788, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.03944898, + "balance_loss_mlp": 1.02063167, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.4588008537152082, + "language_loss": 0.75944304, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78077233, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.528336524963379 + }, + { + "auxiliary_loss_clip": 0.01027151, + "auxiliary_loss_mlp": 0.0100726, + "balance_loss_clip": 1.01240134, + "balance_loss_mlp": 1.00565112, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9359844673198657, + "language_loss": 0.67175949, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69210362, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 4.4991631507873535 + }, + { + "auxiliary_loss_clip": 0.01070504, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.03576493, + "balance_loss_mlp": 1.01536417, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.895758466379572, + "language_loss": 0.72338414, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74437249, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.6103646755218506 + }, + { + "auxiliary_loss_clip": 0.01111485, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.04021204, + "balance_loss_mlp": 1.02749479, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.7367962597274138, + "language_loss": 0.8119123, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83343291, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 2.5875446796417236 + }, + { + "auxiliary_loss_clip": 0.0107569, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.03452253, + "balance_loss_mlp": 1.01733327, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.758279849851838, + "language_loss": 0.76032007, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78137738, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 2.5897057056427 + }, + { + "auxiliary_loss_clip": 0.0109293, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.03933167, + "balance_loss_mlp": 1.02357388, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.7589514471516108, + "language_loss": 0.76291585, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.7842291, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 2.579422950744629 + }, + { + "auxiliary_loss_clip": 0.01060915, + "auxiliary_loss_mlp": 0.010415, + "balance_loss_clip": 1.03174901, + "balance_loss_mlp": 1.026003, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.8703474966523812, + "language_loss": 0.69916332, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72018743, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.689652681350708 + }, + { + "auxiliary_loss_clip": 0.0110223, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.03662634, + "balance_loss_mlp": 1.02059531, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 2.0325978921330026, + "language_loss": 0.83913052, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86050451, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 4.060842037200928 + }, + { + "auxiliary_loss_clip": 0.01095427, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.03486013, + "balance_loss_mlp": 1.02027011, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.872327580166395, + "language_loss": 0.60500467, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.62629592, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.4996185302734375 + }, + { + "auxiliary_loss_clip": 0.01084325, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.03668594, + "balance_loss_mlp": 1.02204335, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 1.9147285484847971, + "language_loss": 0.71221423, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73342705, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.63781476020813 + }, + { + "auxiliary_loss_clip": 0.0109789, + "auxiliary_loss_mlp": 0.0103137, + "balance_loss_clip": 1.03680265, + "balance_loss_mlp": 1.01659989, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.8142906816782833, + "language_loss": 0.81882012, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.84011275, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 2.5488882064819336 + }, + { + "auxiliary_loss_clip": 0.01102263, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.04039323, + "balance_loss_mlp": 1.02524686, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 2.394021005175868, + "language_loss": 0.71133697, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73274767, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.5047659873962402 + }, + { + "auxiliary_loss_clip": 0.01022707, + "auxiliary_loss_mlp": 0.01011576, + "balance_loss_clip": 1.00835466, + "balance_loss_mlp": 1.01012766, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8969571831368548, + "language_loss": 0.59747255, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61781538, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 4.661588668823242 + }, + { + "auxiliary_loss_clip": 0.01113361, + "auxiliary_loss_mlp": 0.01027837, + "balance_loss_clip": 1.03978848, + "balance_loss_mlp": 1.01426482, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 2.077618730309314, + "language_loss": 0.84786117, + "learning_rate": 2.819315942271794e-06, + "loss": 0.86927319, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 3.9558660984039307 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.03731871, + "balance_loss_mlp": 1.01775837, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.9068274771114113, + "language_loss": 0.79543591, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81684983, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 2.5038347244262695 + }, + { + "auxiliary_loss_clip": 0.01112548, + "auxiliary_loss_mlp": 0.00749818, + "balance_loss_clip": 1.03780842, + "balance_loss_mlp": 1.0002569, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8962438024496797, + "language_loss": 0.66861254, + "learning_rate": 2.818605315732038e-06, + "loss": 0.68723619, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.5381152629852295 + }, + { + "auxiliary_loss_clip": 0.01092927, + "auxiliary_loss_mlp": 0.01043616, + "balance_loss_clip": 1.03732347, + "balance_loss_mlp": 1.02946007, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.9858806792558041, + "language_loss": 0.73169583, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75306123, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 2.574307918548584 + }, + { + "auxiliary_loss_clip": 0.01077765, + "auxiliary_loss_mlp": 0.01040128, + "balance_loss_clip": 1.03597486, + "balance_loss_mlp": 1.02653253, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 2.0293200422546325, + "language_loss": 0.71759665, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.73877549, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 2.6377627849578857 + }, + { + "auxiliary_loss_clip": 0.01108528, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.03647316, + "balance_loss_mlp": 1.02529216, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.8150911381204953, + "language_loss": 0.83050358, + "learning_rate": 2.817539143144128e-06, + "loss": 0.85197115, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 2.4643263816833496 + }, + { + "auxiliary_loss_clip": 0.01051841, + "auxiliary_loss_mlp": 0.01036335, + "balance_loss_clip": 1.03151572, + "balance_loss_mlp": 1.02253032, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 2.2335941281753495, + "language_loss": 0.82960635, + "learning_rate": 2.817183690261189e-06, + "loss": 0.85048819, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.61023211479187 + }, + { + "auxiliary_loss_clip": 0.010847, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.03582609, + "balance_loss_mlp": 1.02268314, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.4622190673438404, + "language_loss": 0.69537407, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71657711, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 2.6694066524505615 + }, + { + "auxiliary_loss_clip": 0.01078242, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.03355873, + "balance_loss_mlp": 1.02613521, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 2.4702630301751993, + "language_loss": 0.7893185, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81048465, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.537928342819214 + }, + { + "auxiliary_loss_clip": 0.01102265, + "auxiliary_loss_mlp": 0.01038887, + "balance_loss_clip": 1.03870964, + "balance_loss_mlp": 1.0252018, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 4.4424234954439825, + "language_loss": 0.84066802, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.8620795, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 2.4944353103637695 + }, + { + "auxiliary_loss_clip": 0.0102234, + "auxiliary_loss_mlp": 0.01031576, + "balance_loss_clip": 1.00827599, + "balance_loss_mlp": 1.03026485, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 1.0101103245426135, + "language_loss": 0.64799619, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66853535, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.143275260925293 + }, + { + "auxiliary_loss_clip": 0.01084592, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.03712118, + "balance_loss_mlp": 1.02699447, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.6095528731660096, + "language_loss": 0.73228103, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75354338, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 2.5867342948913574 + }, + { + "auxiliary_loss_clip": 0.01067993, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.0363307, + "balance_loss_mlp": 1.03275013, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.2699078939948087, + "language_loss": 0.69996762, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72112858, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 2.6280338764190674 + }, + { + "auxiliary_loss_clip": 0.00994331, + "auxiliary_loss_mlp": 0.00746848, + "balance_loss_clip": 1.00925565, + "balance_loss_mlp": 0.99976367, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.7653152861327027, + "language_loss": 0.60342646, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62083828, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 3.2915091514587402 + }, + { + "auxiliary_loss_clip": 0.01069316, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.03637767, + "balance_loss_mlp": 1.01787996, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 2.402316892941002, + "language_loss": 0.77800024, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79900175, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.657348871231079 + }, + { + "auxiliary_loss_clip": 0.01073199, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.0349983, + "balance_loss_mlp": 1.01994705, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.827639561272377, + "language_loss": 0.77648771, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.79757482, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 2.754283905029297 + }, + { + "auxiliary_loss_clip": 0.01033703, + "auxiliary_loss_mlp": 0.01005334, + "balance_loss_clip": 1.00931001, + "balance_loss_mlp": 1.00383186, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8154477251302313, + "language_loss": 0.61338097, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63377136, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 2.934570550918579 + }, + { + "auxiliary_loss_clip": 0.01079334, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.04062581, + "balance_loss_mlp": 1.02223778, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.1219112877501716, + "language_loss": 0.77378213, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79493403, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 2.6264421939849854 + }, + { + "auxiliary_loss_clip": 0.0108376, + "auxiliary_loss_mlp": 0.010253, + "balance_loss_clip": 1.03628349, + "balance_loss_mlp": 1.01325428, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.5698450186369248, + "language_loss": 0.79995495, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.82104558, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.63232684135437 + }, + { + "auxiliary_loss_clip": 0.01096288, + "auxiliary_loss_mlp": 0.00749557, + "balance_loss_clip": 1.0364238, + "balance_loss_mlp": 1.00016868, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 2.609184999496282, + "language_loss": 0.7900126, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.80847108, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.5920634269714355 + }, + { + "auxiliary_loss_clip": 0.01081952, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.03472006, + "balance_loss_mlp": 1.0202955, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 2.102760967429752, + "language_loss": 0.79696429, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.81811142, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.6047425270080566 + }, + { + "auxiliary_loss_clip": 0.01082255, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.03501058, + "balance_loss_mlp": 1.01727653, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 2.0505215775710894, + "language_loss": 0.79503757, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81615955, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.642578363418579 + }, + { + "auxiliary_loss_clip": 0.01088445, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.03816509, + "balance_loss_mlp": 1.01851964, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.1364456470414317, + "language_loss": 0.67274082, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69395244, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 2.642928123474121 + }, + { + "auxiliary_loss_clip": 0.01066094, + "auxiliary_loss_mlp": 0.01040792, + "balance_loss_clip": 1.03527522, + "balance_loss_mlp": 1.02663636, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.0670343874720727, + "language_loss": 0.81423527, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83530414, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 2.5828170776367188 + }, + { + "auxiliary_loss_clip": 0.01085503, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.03504515, + "balance_loss_mlp": 1.01717675, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.05121106562919, + "language_loss": 0.71557957, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.73674047, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.640566349029541 + }, + { + "auxiliary_loss_clip": 0.01083403, + "auxiliary_loss_mlp": 0.01032769, + "balance_loss_clip": 1.03632879, + "balance_loss_mlp": 1.02055597, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.8300047591504138, + "language_loss": 0.66486597, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.68602765, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.565657138824463 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.04205406, + "balance_loss_mlp": 1.01932669, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.7471800689659376, + "language_loss": 0.68811619, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70950645, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.672670602798462 + }, + { + "auxiliary_loss_clip": 0.01075974, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.03692329, + "balance_loss_mlp": 1.01957786, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.3950104408182646, + "language_loss": 0.72066134, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74174672, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 4.12214732170105 + }, + { + "auxiliary_loss_clip": 0.01078409, + "auxiliary_loss_mlp": 0.00749857, + "balance_loss_clip": 1.03584027, + "balance_loss_mlp": 1.00020933, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.078830782464422, + "language_loss": 0.80193251, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82021517, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 2.6479547023773193 + }, + { + "auxiliary_loss_clip": 0.01100662, + "auxiliary_loss_mlp": 0.01031716, + "balance_loss_clip": 1.03789759, + "balance_loss_mlp": 1.01768506, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 1.8605528727415068, + "language_loss": 0.74601734, + "learning_rate": 2.80899974864781e-06, + "loss": 0.76734102, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 2.605376958847046 + }, + { + "auxiliary_loss_clip": 0.01048318, + "auxiliary_loss_mlp": 0.01050829, + "balance_loss_clip": 1.03131831, + "balance_loss_mlp": 1.03529561, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 2.2864599787688684, + "language_loss": 0.69579196, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.7167834, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.6121294498443604 + }, + { + "auxiliary_loss_clip": 0.0109302, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.03937435, + "balance_loss_mlp": 1.0262475, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 3.670338199267612, + "language_loss": 0.84727967, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.8686074, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 2.663133144378662 + }, + { + "auxiliary_loss_clip": 0.01090579, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.03875744, + "balance_loss_mlp": 1.02073801, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.0327188684539435, + "language_loss": 0.81279469, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83404267, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.6329565048217773 + }, + { + "auxiliary_loss_clip": 0.01007012, + "auxiliary_loss_mlp": 0.00998884, + "balance_loss_clip": 1.01259398, + "balance_loss_mlp": 0.99728698, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7219578404885675, + "language_loss": 0.58810508, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60816407, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.203505277633667 + }, + { + "auxiliary_loss_clip": 0.01060633, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.03645205, + "balance_loss_mlp": 1.02056539, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 2.7840363759386015, + "language_loss": 0.7904315, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81139016, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 4.183533430099487 + }, + { + "auxiliary_loss_clip": 0.01101162, + "auxiliary_loss_mlp": 0.01041038, + "balance_loss_clip": 1.03584576, + "balance_loss_mlp": 1.02675688, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 2.0291997021417374, + "language_loss": 0.8053335, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82675552, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.5377414226531982 + }, + { + "auxiliary_loss_clip": 0.01090741, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.03995824, + "balance_loss_mlp": 1.02182555, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.6235154378502272, + "language_loss": 0.70627123, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72754395, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.6251988410949707 + }, + { + "auxiliary_loss_clip": 0.01075286, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.03697848, + "balance_loss_mlp": 1.03312206, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 1.7703655130617655, + "language_loss": 0.7734102, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79465926, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 2.780796527862549 + }, + { + "auxiliary_loss_clip": 0.01098426, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.03671086, + "balance_loss_mlp": 1.02071643, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.7327407151368066, + "language_loss": 0.79537582, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81670189, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.5750088691711426 + }, + { + "auxiliary_loss_clip": 0.01084857, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.03736222, + "balance_loss_mlp": 1.01902759, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.7427710948569168, + "language_loss": 0.76396662, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.78513491, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 5.564223766326904 + }, + { + "auxiliary_loss_clip": 0.01090992, + "auxiliary_loss_mlp": 0.01038107, + "balance_loss_clip": 1.03846264, + "balance_loss_mlp": 1.02607334, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.0568548566790192, + "language_loss": 0.81506634, + "learning_rate": 2.805079942855074e-06, + "loss": 0.83635736, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 2.6262080669403076 + }, + { + "auxiliary_loss_clip": 0.01087289, + "auxiliary_loss_mlp": 0.00749671, + "balance_loss_clip": 1.03423059, + "balance_loss_mlp": 1.00023746, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.6360783764676519, + "language_loss": 0.75363445, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77200401, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 2.6361076831817627 + }, + { + "auxiliary_loss_clip": 0.01109427, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.03946078, + "balance_loss_mlp": 1.01756406, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 2.3589564808097534, + "language_loss": 0.73852271, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.75992513, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 2.5923173427581787 + }, + { + "auxiliary_loss_clip": 0.01100789, + "auxiliary_loss_mlp": 0.01038111, + "balance_loss_clip": 1.03615236, + "balance_loss_mlp": 1.02424049, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 2.148883914961415, + "language_loss": 0.82198161, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84337056, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 2.5550620555877686 + }, + { + "auxiliary_loss_clip": 0.01112815, + "auxiliary_loss_mlp": 0.01045437, + "balance_loss_clip": 1.03957343, + "balance_loss_mlp": 1.03309286, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.4109786701443876, + "language_loss": 0.80940485, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83098745, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.4844675064086914 + }, + { + "auxiliary_loss_clip": 0.01065555, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.03375542, + "balance_loss_mlp": 1.02102542, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.7322420428399474, + "language_loss": 0.83764899, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85865176, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.6381988525390625 + }, + { + "auxiliary_loss_clip": 0.01013365, + "auxiliary_loss_mlp": 0.01011046, + "balance_loss_clip": 1.00978351, + "balance_loss_mlp": 1.00956774, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.758541881536219, + "language_loss": 0.5023483, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52259243, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 3.1819188594818115 + }, + { + "auxiliary_loss_clip": 0.01062082, + "auxiliary_loss_mlp": 0.00749715, + "balance_loss_clip": 1.03152621, + "balance_loss_mlp": 1.00012767, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.5945198714507178, + "language_loss": 0.78724182, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80535984, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.794790267944336 + }, + { + "auxiliary_loss_clip": 0.0108915, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.03613055, + "balance_loss_mlp": 1.02185798, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.8597420936030553, + "language_loss": 0.81126666, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83251053, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 2.5597457885742188 + }, + { + "auxiliary_loss_clip": 0.01087888, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.03597903, + "balance_loss_mlp": 1.02590442, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.7002797707701736, + "language_loss": 0.77092689, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79219806, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.614915609359741 + }, + { + "auxiliary_loss_clip": 0.01087875, + "auxiliary_loss_mlp": 0.01034506, + "balance_loss_clip": 1.03670216, + "balance_loss_mlp": 1.02156615, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5639373662074467, + "language_loss": 0.76040399, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78162777, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 2.616870641708374 + }, + { + "auxiliary_loss_clip": 0.01081583, + "auxiliary_loss_mlp": 0.01034048, + "balance_loss_clip": 1.03570497, + "balance_loss_mlp": 1.02060103, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.7343994408527434, + "language_loss": 0.76084208, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.7819984, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 2.573772430419922 + }, + { + "auxiliary_loss_clip": 0.01075565, + "auxiliary_loss_mlp": 0.00749764, + "balance_loss_clip": 1.03244138, + "balance_loss_mlp": 1.00009227, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.6367814210335663, + "language_loss": 0.78521442, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80346769, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.611354112625122 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.03692245, + "balance_loss_mlp": 1.01893246, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.4744998845886506, + "language_loss": 0.77628255, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79775882, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.5733683109283447 + }, + { + "auxiliary_loss_clip": 0.01106787, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.03597319, + "balance_loss_mlp": 1.01582539, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 2.2007986029083644, + "language_loss": 0.76319826, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78455174, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 2.5288918018341064 + }, + { + "auxiliary_loss_clip": 0.01082917, + "auxiliary_loss_mlp": 0.01039229, + "balance_loss_clip": 1.03812754, + "balance_loss_mlp": 1.02556169, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.9949742629042067, + "language_loss": 0.79640025, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81762171, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.5881166458129883 + }, + { + "auxiliary_loss_clip": 0.01108722, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.04019523, + "balance_loss_mlp": 1.01907349, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 8.558292096565806, + "language_loss": 0.71638387, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73780632, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 2.6169636249542236 + }, + { + "auxiliary_loss_clip": 0.01115034, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.03860724, + "balance_loss_mlp": 1.02268982, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 1.7281940234094961, + "language_loss": 0.77723044, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.79875356, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.516436815261841 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.03733468, + "balance_loss_mlp": 1.01973581, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.4650070615682722, + "language_loss": 0.75691223, + "learning_rate": 2.798657755439662e-06, + "loss": 0.77833831, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 2.5273518562316895 + }, + { + "auxiliary_loss_clip": 0.01040106, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.03439116, + "balance_loss_mlp": 1.01771963, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.1939290812782093, + "language_loss": 0.60519612, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62591457, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 2.7083263397216797 + }, + { + "auxiliary_loss_clip": 0.01112961, + "auxiliary_loss_mlp": 0.01036791, + "balance_loss_clip": 1.03749681, + "balance_loss_mlp": 1.02176487, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 3.0946900427394217, + "language_loss": 0.79450679, + "learning_rate": 2.797943571912841e-06, + "loss": 0.8160044, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 2.5112099647521973 + }, + { + "auxiliary_loss_clip": 0.01060188, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.03550351, + "balance_loss_mlp": 1.0280757, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.9272719378604992, + "language_loss": 0.81602597, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83706355, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 2.69095516204834 + }, + { + "auxiliary_loss_clip": 0.01081706, + "auxiliary_loss_mlp": 0.01031937, + "balance_loss_clip": 1.03375888, + "balance_loss_mlp": 1.01868129, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 1.733941237064937, + "language_loss": 0.61417246, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63530886, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.5368709564208984 + }, + { + "auxiliary_loss_clip": 0.01098728, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.03718007, + "balance_loss_mlp": 1.01806211, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.934441934872907, + "language_loss": 0.85965109, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88094419, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.5504515171051025 + }, + { + "auxiliary_loss_clip": 0.01099824, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.03712988, + "balance_loss_mlp": 1.02091658, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 2.170709766171787, + "language_loss": 0.714064, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73540747, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.600684642791748 + }, + { + "auxiliary_loss_clip": 0.01060594, + "auxiliary_loss_mlp": 0.0103928, + "balance_loss_clip": 1.03124619, + "balance_loss_mlp": 1.02401507, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.3348825978433845, + "language_loss": 0.7585007, + "learning_rate": 2.796157583816052e-06, + "loss": 0.77949953, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.6357879638671875 + }, + { + "auxiliary_loss_clip": 0.01080828, + "auxiliary_loss_mlp": 0.01043669, + "balance_loss_clip": 1.03684187, + "balance_loss_mlp": 1.02860117, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 1.9564359978802004, + "language_loss": 0.70272708, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72397202, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 4.049339771270752 + }, + { + "auxiliary_loss_clip": 0.01087735, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.03697252, + "balance_loss_mlp": 1.01904535, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.8186144786129133, + "language_loss": 0.69508135, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.71628881, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 2.5854909420013428 + }, + { + "auxiliary_loss_clip": 0.01075954, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.03618479, + "balance_loss_mlp": 1.02331829, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.423268727076055, + "language_loss": 0.78420752, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80534559, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 2.643434524536133 + }, + { + "auxiliary_loss_clip": 0.01075225, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.0372237, + "balance_loss_mlp": 1.01971102, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.611661610756983, + "language_loss": 0.69748247, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71857131, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 2.698899745941162 + }, + { + "auxiliary_loss_clip": 0.010739, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.03428864, + "balance_loss_mlp": 1.02616978, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.463624318108145, + "language_loss": 0.8387, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85984516, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 2.579885721206665 + }, + { + "auxiliary_loss_clip": 0.01086251, + "auxiliary_loss_mlp": 0.01034635, + "balance_loss_clip": 1.03602827, + "balance_loss_mlp": 1.02250504, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 2.632006455934265, + "language_loss": 0.85006928, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.87127811, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.60357403755188 + }, + { + "auxiliary_loss_clip": 0.0106779, + "auxiliary_loss_mlp": 0.01041044, + "balance_loss_clip": 1.03346455, + "balance_loss_mlp": 1.02594018, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 2.074028584970971, + "language_loss": 0.74917054, + "learning_rate": 2.793655932864273e-06, + "loss": 0.7702589, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 2.6548001766204834 + }, + { + "auxiliary_loss_clip": 0.01073195, + "auxiliary_loss_mlp": 0.00749716, + "balance_loss_clip": 1.03531659, + "balance_loss_mlp": 1.00012589, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.9181395045544514, + "language_loss": 0.74495351, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.76318264, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 4.15249490737915 + }, + { + "auxiliary_loss_clip": 0.01052494, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.03103924, + "balance_loss_mlp": 1.025316, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 3.0300447009642495, + "language_loss": 0.67825544, + "learning_rate": 2.792940904386562e-06, + "loss": 0.69917333, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.6690995693206787 + }, + { + "auxiliary_loss_clip": 0.01085155, + "auxiliary_loss_mlp": 0.01043804, + "balance_loss_clip": 1.03882122, + "balance_loss_mlp": 1.03049445, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.9959293751550482, + "language_loss": 0.75845599, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.77974552, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 2.6521713733673096 + }, + { + "auxiliary_loss_clip": 0.01091212, + "auxiliary_loss_mlp": 0.01044398, + "balance_loss_clip": 1.03897822, + "balance_loss_mlp": 1.03061175, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 2.004485420770145, + "language_loss": 0.70552158, + "learning_rate": 2.792225755635257e-06, + "loss": 0.72687769, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 2.645008087158203 + }, + { + "auxiliary_loss_clip": 0.01111791, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.03726518, + "balance_loss_mlp": 1.02549005, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.5975139251022348, + "language_loss": 0.68736267, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70886642, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.5784943103790283 + }, + { + "auxiliary_loss_clip": 0.01087306, + "auxiliary_loss_mlp": 0.01052864, + "balance_loss_clip": 1.03638911, + "balance_loss_mlp": 1.03689051, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.9204723430882957, + "language_loss": 0.75714195, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.77854371, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 4.130558490753174 + }, + { + "auxiliary_loss_clip": 0.01015883, + "auxiliary_loss_mlp": 0.01007367, + "balance_loss_clip": 1.0113802, + "balance_loss_mlp": 1.00579345, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7891766822115418, + "language_loss": 0.58279276, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60302532, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 4.608468770980835 + }, + { + "auxiliary_loss_clip": 0.01069557, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.03561211, + "balance_loss_mlp": 1.0201546, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 3.2087119229725216, + "language_loss": 0.7806803, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80172396, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 2.641681671142578 + }, + { + "auxiliary_loss_clip": 0.01093546, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.0346539, + "balance_loss_mlp": 1.02039635, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 2.053463745212231, + "language_loss": 0.82568467, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.84695578, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 2.516453981399536 + }, + { + "auxiliary_loss_clip": 0.01111868, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.03850651, + "balance_loss_mlp": 1.02223754, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.7467907989965825, + "language_loss": 0.79805642, + "learning_rate": 2.790079588824617e-06, + "loss": 0.81954277, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 2.5268712043762207 + }, + { + "auxiliary_loss_clip": 0.01081792, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.03439796, + "balance_loss_mlp": 1.01530051, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.8709844828560234, + "language_loss": 0.82941902, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85052443, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.663965940475464 + }, + { + "auxiliary_loss_clip": 0.0107936, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.03670025, + "balance_loss_mlp": 1.02069044, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.632325936090704, + "language_loss": 0.753075, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77420366, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.6517767906188965 + }, + { + "auxiliary_loss_clip": 0.01077536, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.03431892, + "balance_loss_mlp": 1.01944721, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 1.9248283338564036, + "language_loss": 0.78773451, + "learning_rate": 2.78900610077756e-06, + "loss": 0.80883926, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 2.682709217071533 + }, + { + "auxiliary_loss_clip": 0.0109817, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.03564906, + "balance_loss_mlp": 1.01405644, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4622385446416435, + "language_loss": 0.8001368, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82140422, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 2.5862767696380615 + }, + { + "auxiliary_loss_clip": 0.0109279, + "auxiliary_loss_mlp": 0.01050223, + "balance_loss_clip": 1.03708434, + "balance_loss_mlp": 1.03424883, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.6031166140741393, + "language_loss": 0.77631545, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.79774559, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 2.5507662296295166 + }, + { + "auxiliary_loss_clip": 0.01065386, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.0353353, + "balance_loss_mlp": 1.02122879, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.388705831974316, + "language_loss": 0.85274494, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87375528, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 2.7073090076446533 + }, + { + "auxiliary_loss_clip": 0.01088286, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.03567815, + "balance_loss_mlp": 1.01987529, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 2.0882027776199426, + "language_loss": 0.85709715, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.8783139, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 2.6663146018981934 + }, + { + "auxiliary_loss_clip": 0.01080855, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.03462625, + "balance_loss_mlp": 1.01647806, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.6727258948329633, + "language_loss": 0.7293511, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75046796, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 2.6072022914886475 + }, + { + "auxiliary_loss_clip": 0.01072376, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.03568316, + "balance_loss_mlp": 1.02145505, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 1.8571757420099777, + "language_loss": 0.68832374, + "learning_rate": 2.786858317231779e-06, + "loss": 0.70941532, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.6688244342803955 + }, + { + "auxiliary_loss_clip": 0.01085442, + "auxiliary_loss_mlp": 0.01039263, + "balance_loss_clip": 1.03652573, + "balance_loss_mlp": 1.0260489, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.8948302031808888, + "language_loss": 0.80801678, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.82926387, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.623363494873047 + }, + { + "auxiliary_loss_clip": 0.01102417, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.03732872, + "balance_loss_mlp": 1.01918697, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 3.3697105255986632, + "language_loss": 0.89584625, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91719866, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 2.582714796066284 + }, + { + "auxiliary_loss_clip": 0.01068412, + "auxiliary_loss_mlp": 0.01040792, + "balance_loss_clip": 1.03357375, + "balance_loss_mlp": 1.0263437, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.8118683003707463, + "language_loss": 0.78585374, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.8069458, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 2.6213510036468506 + }, + { + "auxiliary_loss_clip": 0.0108248, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03243876, + "balance_loss_mlp": 1.01882029, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.7646030660592533, + "language_loss": 0.74423766, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76538515, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.6067755222320557 + }, + { + "auxiliary_loss_clip": 0.01072044, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.03805768, + "balance_loss_mlp": 1.02128983, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.96969230260994, + "language_loss": 0.75991458, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78099263, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.6229047775268555 + }, + { + "auxiliary_loss_clip": 0.01107873, + "auxiliary_loss_mlp": 0.01044192, + "balance_loss_clip": 1.03840053, + "balance_loss_mlp": 1.02902853, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 3.3655395148811738, + "language_loss": 0.74303204, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76455271, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.5721426010131836 + }, + { + "auxiliary_loss_clip": 0.01114775, + "auxiliary_loss_mlp": 0.0103938, + "balance_loss_clip": 1.03995228, + "balance_loss_mlp": 1.02501523, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.5798983188684574, + "language_loss": 0.67589593, + "learning_rate": 2.784351212350352e-06, + "loss": 0.69743752, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 2.6547720432281494 + }, + { + "auxiliary_loss_clip": 0.01005802, + "auxiliary_loss_mlp": 0.01003168, + "balance_loss_clip": 1.01153612, + "balance_loss_mlp": 1.00168407, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6577908330305304, + "language_loss": 0.53957057, + "learning_rate": 2.783992935430775e-06, + "loss": 0.55966032, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 3.2729461193084717 + }, + { + "auxiliary_loss_clip": 0.01072389, + "auxiliary_loss_mlp": 0.00749716, + "balance_loss_clip": 1.03604889, + "balance_loss_mlp": 1.00021386, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.1683106486349555, + "language_loss": 0.69446683, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71268785, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.644113540649414 + }, + { + "auxiliary_loss_clip": 0.01004924, + "auxiliary_loss_mlp": 0.01000862, + "balance_loss_clip": 1.00965309, + "balance_loss_mlp": 0.99927664, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.821613498115465, + "language_loss": 0.51773137, + "learning_rate": 2.783276292417936e-06, + "loss": 0.53778923, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.2251956462860107 + }, + { + "auxiliary_loss_clip": 0.01101617, + "auxiliary_loss_mlp": 0.01043425, + "balance_loss_clip": 1.03714824, + "balance_loss_mlp": 1.02833939, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.828521865914805, + "language_loss": 0.73767877, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.75912917, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.5959715843200684 + }, + { + "auxiliary_loss_clip": 0.0110088, + "auxiliary_loss_mlp": 0.01040948, + "balance_loss_clip": 1.0372045, + "balance_loss_mlp": 1.02803123, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 21.294843914458145, + "language_loss": 0.68625617, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.7076745, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.6150717735290527 + }, + { + "auxiliary_loss_clip": 0.01100139, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.03735518, + "balance_loss_mlp": 1.02256036, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.7917735412612878, + "language_loss": 0.78946584, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81082261, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 2.538825035095215 + }, + { + "auxiliary_loss_clip": 0.01087792, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.03871346, + "balance_loss_mlp": 1.022928, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.6260321382618175, + "language_loss": 0.80230284, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.82353079, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 2.651606798171997 + }, + { + "auxiliary_loss_clip": 0.01081489, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.03287172, + "balance_loss_mlp": 1.01980352, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 4.777053131494119, + "language_loss": 0.71219409, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73332322, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.5666697025299072 + }, + { + "auxiliary_loss_clip": 0.01107996, + "auxiliary_loss_mlp": 0.01040249, + "balance_loss_clip": 1.03549576, + "balance_loss_mlp": 1.02717137, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.7714271975720959, + "language_loss": 0.83249891, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85398132, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 4.076677560806274 + }, + { + "auxiliary_loss_clip": 0.01108661, + "auxiliary_loss_mlp": 0.01037753, + "balance_loss_clip": 1.03761828, + "balance_loss_mlp": 1.02322114, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 2.179679527084042, + "language_loss": 0.71217132, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73363543, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.5363945960998535 + }, + { + "auxiliary_loss_clip": 0.010846, + "auxiliary_loss_mlp": 0.0104084, + "balance_loss_clip": 1.03570747, + "balance_loss_mlp": 1.02861476, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.010015828336959, + "language_loss": 0.75105965, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77231407, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 2.5698158740997314 + }, + { + "auxiliary_loss_clip": 0.01033593, + "auxiliary_loss_mlp": 0.01018073, + "balance_loss_clip": 1.01012254, + "balance_loss_mlp": 1.01672637, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7601822243918132, + "language_loss": 0.56478703, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58530366, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 3.2193048000335693 + }, + { + "auxiliary_loss_clip": 0.01098243, + "auxiliary_loss_mlp": 0.01039875, + "balance_loss_clip": 1.03664517, + "balance_loss_mlp": 1.02751315, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.8481779217233336, + "language_loss": 0.75978488, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78116596, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.5526702404022217 + }, + { + "auxiliary_loss_clip": 0.01078103, + "auxiliary_loss_mlp": 0.01045149, + "balance_loss_clip": 1.03393412, + "balance_loss_mlp": 1.02953279, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 3.372784782133377, + "language_loss": 0.82391733, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84514987, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 2.635549545288086 + }, + { + "auxiliary_loss_clip": 0.01098527, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_clip": 1.03485107, + "balance_loss_mlp": 1.03000474, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 2.0592245542612138, + "language_loss": 0.76840001, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78981382, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 2.5415573120117188 + }, + { + "auxiliary_loss_clip": 0.010143, + "auxiliary_loss_mlp": 0.01007266, + "balance_loss_clip": 1.01080012, + "balance_loss_mlp": 1.00571656, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7285343525268086, + "language_loss": 0.57760692, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59782255, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 4.677913665771484 + }, + { + "auxiliary_loss_clip": 0.01113774, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.03904963, + "balance_loss_mlp": 1.01683855, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6651361664693236, + "language_loss": 0.6987921, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72024173, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.5709800720214844 + }, + { + "auxiliary_loss_clip": 0.01076398, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.04164577, + "balance_loss_mlp": 1.02651381, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.9394755947198146, + "language_loss": 0.7563557, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.77752733, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 2.7026233673095703 + }, + { + "auxiliary_loss_clip": 0.01073952, + "auxiliary_loss_mlp": 0.01035198, + "balance_loss_clip": 1.03785431, + "balance_loss_mlp": 1.02228737, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 2.443361557612108, + "language_loss": 0.77163064, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79272211, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 2.6106269359588623 + }, + { + "auxiliary_loss_clip": 0.01062283, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_clip": 1.03084207, + "balance_loss_mlp": 1.02967143, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.3956965253942473, + "language_loss": 0.79636526, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81742102, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 4.214152097702026 + }, + { + "auxiliary_loss_clip": 0.01074206, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.03888166, + "balance_loss_mlp": 1.02142823, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 1.9992082601034988, + "language_loss": 0.70564055, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72673249, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 2.6998984813690186 + }, + { + "auxiliary_loss_clip": 0.01063174, + "auxiliary_loss_mlp": 0.01040115, + "balance_loss_clip": 1.0310359, + "balance_loss_mlp": 1.02598274, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.6071934514968131, + "language_loss": 0.72125483, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74228776, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 2.792855978012085 + }, + { + "auxiliary_loss_clip": 0.01099924, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.03802752, + "balance_loss_mlp": 1.02195525, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.8646755311607055, + "language_loss": 0.61422718, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63558018, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 2.759819269180298 + }, + { + "auxiliary_loss_clip": 0.01118556, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.04035342, + "balance_loss_mlp": 1.02136087, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.626456880296214, + "language_loss": 0.6752919, + "learning_rate": 2.775744388563563e-06, + "loss": 0.69683671, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.5759265422821045 + }, + { + "auxiliary_loss_clip": 0.01109741, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.03739083, + "balance_loss_mlp": 1.01753092, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 2.026903639347353, + "language_loss": 0.7896862, + "learning_rate": 2.775385401898104e-06, + "loss": 0.8110894, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.5605084896087646 + }, + { + "auxiliary_loss_clip": 0.01103717, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.03703558, + "balance_loss_mlp": 1.02085376, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.3092769757048637, + "language_loss": 0.70749116, + "learning_rate": 2.775026385829952e-06, + "loss": 0.72890091, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.5898094177246094 + }, + { + "auxiliary_loss_clip": 0.01086744, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.03493214, + "balance_loss_mlp": 1.02118182, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 2.314109486341466, + "language_loss": 0.76635462, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78756922, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 2.571903705596924 + }, + { + "auxiliary_loss_clip": 0.0108806, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.03469896, + "balance_loss_mlp": 1.02104163, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 3.15128241446243, + "language_loss": 0.62217736, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.6434021, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 2.699570894241333 + }, + { + "auxiliary_loss_clip": 0.01111244, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.03624177, + "balance_loss_mlp": 1.02016664, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.8434540769022647, + "language_loss": 0.73876762, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76022863, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.579314947128296 + }, + { + "auxiliary_loss_clip": 0.01088153, + "auxiliary_loss_mlp": 0.01031015, + "balance_loss_clip": 1.03434324, + "balance_loss_mlp": 1.01825404, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.5553686579216963, + "language_loss": 0.81606501, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83725673, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 2.5665581226348877 + }, + { + "auxiliary_loss_clip": 0.01100252, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.03623748, + "balance_loss_mlp": 1.02544737, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.87312711068591, + "language_loss": 0.70068228, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72207236, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 2.5681984424591064 + }, + { + "auxiliary_loss_clip": 0.01070374, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.03502047, + "balance_loss_mlp": 1.0163306, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.7807781711685915, + "language_loss": 0.81875432, + "learning_rate": 2.772871672726965e-06, + "loss": 0.8397547, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.6184051036834717 + }, + { + "auxiliary_loss_clip": 0.01085559, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.0358634, + "balance_loss_mlp": 1.02015364, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.8234121432171142, + "language_loss": 0.68803322, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70921791, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 2.6994857788085938 + }, + { + "auxiliary_loss_clip": 0.01087176, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.03360629, + "balance_loss_mlp": 1.0213685, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.2669324905553507, + "language_loss": 0.80373979, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82496482, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.7050113677978516 + }, + { + "auxiliary_loss_clip": 0.01097952, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.0344131, + "balance_loss_mlp": 1.02557588, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.4174385013869466, + "language_loss": 0.75598603, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77734917, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.5994534492492676 + }, + { + "auxiliary_loss_clip": 0.01035559, + "auxiliary_loss_mlp": 0.01010426, + "balance_loss_clip": 1.01150131, + "balance_loss_mlp": 1.0088408, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8172064715238307, + "language_loss": 0.60348225, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62394214, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 2.9806909561157227 + }, + { + "auxiliary_loss_clip": 0.01015886, + "auxiliary_loss_mlp": 0.01001281, + "balance_loss_clip": 1.01184559, + "balance_loss_mlp": 0.99976081, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7913663820400764, + "language_loss": 0.55565882, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57583046, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.213466167449951 + }, + { + "auxiliary_loss_clip": 0.01090108, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.0376513, + "balance_loss_mlp": 1.02529144, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 2.0213335387257043, + "language_loss": 0.75830942, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.77959514, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 2.684763193130493 + }, + { + "auxiliary_loss_clip": 0.01096021, + "auxiliary_loss_mlp": 0.01050323, + "balance_loss_clip": 1.03458714, + "balance_loss_mlp": 1.03493273, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.3676707330571727, + "language_loss": 0.7798667, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80133015, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 2.5571634769439697 + }, + { + "auxiliary_loss_clip": 0.0107364, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.03815556, + "balance_loss_mlp": 1.02367425, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.430367873147597, + "language_loss": 0.68735081, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70844793, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.726860761642456 + }, + { + "auxiliary_loss_clip": 0.01081447, + "auxiliary_loss_mlp": 0.01042506, + "balance_loss_clip": 1.03564906, + "balance_loss_mlp": 1.03032851, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.9027578499141788, + "language_loss": 0.69177502, + "learning_rate": 2.769637625744738e-06, + "loss": 0.7130146, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.6692402362823486 + }, + { + "auxiliary_loss_clip": 0.0110079, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_clip": 1.03879976, + "balance_loss_mlp": 1.02972817, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 2.146863524001742, + "language_loss": 0.78599262, + "learning_rate": 2.769278141085763e-06, + "loss": 0.80743313, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.576882839202881 + }, + { + "auxiliary_loss_clip": 0.00984359, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.00914335, + "balance_loss_mlp": 1.0291729, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8187343744246718, + "language_loss": 0.61844802, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63859642, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 3.0001728534698486 + }, + { + "auxiliary_loss_clip": 0.01082743, + "auxiliary_loss_mlp": 0.01034278, + "balance_loss_clip": 1.03636789, + "balance_loss_mlp": 1.02017593, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 6.253572338895277, + "language_loss": 0.68382514, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70499539, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 2.7926836013793945 + }, + { + "auxiliary_loss_clip": 0.01084805, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.03480518, + "balance_loss_mlp": 1.02371335, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.9092250126200945, + "language_loss": 0.72445697, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74566764, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 2.6766014099121094 + }, + { + "auxiliary_loss_clip": 0.01034041, + "auxiliary_loss_mlp": 0.01016215, + "balance_loss_clip": 1.01046515, + "balance_loss_mlp": 1.0146538, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8307839819787199, + "language_loss": 0.60404098, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62454355, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 2.953611373901367 + }, + { + "auxiliary_loss_clip": 0.01099665, + "auxiliary_loss_mlp": 0.01033912, + "balance_loss_clip": 1.03667223, + "balance_loss_mlp": 1.02144837, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.6218322496746866, + "language_loss": 0.82103151, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84236729, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.6910901069641113 + }, + { + "auxiliary_loss_clip": 0.01082961, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.03197598, + "balance_loss_mlp": 1.02084589, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.8993255709117776, + "language_loss": 0.69332188, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71449584, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.673649787902832 + }, + { + "auxiliary_loss_clip": 0.01094467, + "auxiliary_loss_mlp": 0.01042657, + "balance_loss_clip": 1.03768241, + "balance_loss_mlp": 1.0283401, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 2.039785394715413, + "language_loss": 0.75293577, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77430701, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 4.278113603591919 + }, + { + "auxiliary_loss_clip": 0.01070435, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.03670883, + "balance_loss_mlp": 1.02156532, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.424890338214052, + "language_loss": 0.74495304, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76598638, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 2.6338045597076416 + }, + { + "auxiliary_loss_clip": 0.01087414, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.03766596, + "balance_loss_mlp": 1.02188528, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 1.7014821395369657, + "language_loss": 0.81125128, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.8324815, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 2.5646913051605225 + }, + { + "auxiliary_loss_clip": 0.01091042, + "auxiliary_loss_mlp": 0.00749726, + "balance_loss_clip": 1.03358877, + "balance_loss_mlp": 1.00021636, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 1.7969297437275666, + "language_loss": 0.84196293, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86037058, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 2.5264649391174316 + }, + { + "auxiliary_loss_clip": 0.01098157, + "auxiliary_loss_mlp": 0.00749409, + "balance_loss_clip": 1.03685784, + "balance_loss_mlp": 1.00011468, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.6899246109658066, + "language_loss": 0.72661734, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74509299, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 2.5702004432678223 + }, + { + "auxiliary_loss_clip": 0.01050727, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.03570783, + "balance_loss_mlp": 1.02402711, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.8216976786707757, + "language_loss": 0.77637613, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79727197, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 2.680272102355957 + }, + { + "auxiliary_loss_clip": 0.01076174, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.0347929, + "balance_loss_mlp": 1.01546252, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.7269671448945252, + "language_loss": 0.8096174, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83067024, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 2.6200976371765137 + }, + { + "auxiliary_loss_clip": 0.01100594, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.03629923, + "balance_loss_mlp": 1.02267516, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.9484868427162394, + "language_loss": 0.79993439, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82130462, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 4.091007471084595 + }, + { + "auxiliary_loss_clip": 0.01113044, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.03752685, + "balance_loss_mlp": 1.02194285, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 1.8027207268470948, + "language_loss": 0.70969927, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73118353, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.5343799591064453 + }, + { + "auxiliary_loss_clip": 0.01100212, + "auxiliary_loss_mlp": 0.00749526, + "balance_loss_clip": 1.03840446, + "balance_loss_mlp": 1.00014114, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.8880925047025956, + "language_loss": 0.64263928, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.66113663, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.687931776046753 + }, + { + "auxiliary_loss_clip": 0.0109098, + "auxiliary_loss_mlp": 0.01038308, + "balance_loss_clip": 1.03819454, + "balance_loss_mlp": 1.02563572, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 1.8889501775968536, + "language_loss": 0.79380214, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81509501, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.752546787261963 + }, + { + "auxiliary_loss_clip": 0.01080669, + "auxiliary_loss_mlp": 0.0103998, + "balance_loss_clip": 1.03511631, + "balance_loss_mlp": 1.0249362, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 2.2759006140719493, + "language_loss": 0.71574771, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73695427, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 4.164620399475098 + }, + { + "auxiliary_loss_clip": 0.01111539, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.03663182, + "balance_loss_mlp": 1.01696885, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.8844905362797542, + "language_loss": 0.83590424, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.85732245, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 4.1661529541015625 + }, + { + "auxiliary_loss_clip": 0.01098583, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.03797817, + "balance_loss_mlp": 1.0196898, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 2.6285162512391467, + "language_loss": 0.79843587, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.81975693, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 2.6344637870788574 + }, + { + "auxiliary_loss_clip": 0.0111108, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.03857827, + "balance_loss_mlp": 1.01811111, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 2.0505226072765983, + "language_loss": 0.71064544, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73205823, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.4725630283355713 + }, + { + "auxiliary_loss_clip": 0.01095241, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.03781891, + "balance_loss_mlp": 1.02512133, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 2.032489146747667, + "language_loss": 0.79926026, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.5631790161132812 + }, + { + "auxiliary_loss_clip": 0.01089986, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.0376513, + "balance_loss_mlp": 1.022452, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 1.9816807668176861, + "language_loss": 0.83030224, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85157371, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 2.543532371520996 + }, + { + "auxiliary_loss_clip": 0.01100649, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.03751111, + "balance_loss_mlp": 1.02374125, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.6010748013933873, + "language_loss": 0.79839253, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.8197695, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 2.556905746459961 + }, + { + "auxiliary_loss_clip": 0.01080281, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.03506684, + "balance_loss_mlp": 1.02148724, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.8949727758155965, + "language_loss": 0.81285304, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83400536, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 2.595583915710449 + }, + { + "auxiliary_loss_clip": 0.01056924, + "auxiliary_loss_mlp": 0.01045055, + "balance_loss_clip": 1.03146935, + "balance_loss_mlp": 1.02779317, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 2.081828712371943, + "language_loss": 0.70005101, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72107077, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 2.649761915206909 + }, + { + "auxiliary_loss_clip": 0.01098984, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.03672791, + "balance_loss_mlp": 1.02301848, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 1.9673438415267939, + "language_loss": 0.82774347, + "learning_rate": 2.759561073299676e-06, + "loss": 0.84909844, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 2.5641512870788574 + }, + { + "auxiliary_loss_clip": 0.01069481, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_clip": 1.0347271, + "balance_loss_mlp": 1.03103399, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 2.4634425573206093, + "language_loss": 0.83295059, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85410601, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 2.6016087532043457 + }, + { + "auxiliary_loss_clip": 0.01117462, + "auxiliary_loss_mlp": 0.01036719, + "balance_loss_clip": 1.03881383, + "balance_loss_mlp": 1.02305126, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 1.9614970983511129, + "language_loss": 0.76687503, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.78841674, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.61110258102417 + }, + { + "auxiliary_loss_clip": 0.01096892, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.03600216, + "balance_loss_mlp": 1.02441669, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 1.907245066960692, + "language_loss": 0.80399328, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82533634, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 2.5925276279449463 + }, + { + "auxiliary_loss_clip": 0.01085266, + "auxiliary_loss_mlp": 0.01038423, + "balance_loss_clip": 1.04204428, + "balance_loss_mlp": 1.02535748, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 2.039653141917777, + "language_loss": 0.84686375, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86810064, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.82354474067688 + }, + { + "auxiliary_loss_clip": 0.01054476, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.04014575, + "balance_loss_mlp": 1.02078247, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 3.2943665385070267, + "language_loss": 0.75026822, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.7711544, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.75667142868042 + }, + { + "auxiliary_loss_clip": 0.01072107, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.03554845, + "balance_loss_mlp": 1.0230937, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 2.3391549378013345, + "language_loss": 0.80123156, + "learning_rate": 2.757398863979922e-06, + "loss": 0.82232523, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.671820878982544 + }, + { + "auxiliary_loss_clip": 0.0107901, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.03522062, + "balance_loss_mlp": 1.02806473, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.6811618492324338, + "language_loss": 0.77497149, + "learning_rate": 2.757038395157997e-06, + "loss": 0.79618341, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 2.5713014602661133 + }, + { + "auxiliary_loss_clip": 0.01076641, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.03637815, + "balance_loss_mlp": 1.02260768, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.8741231939726246, + "language_loss": 0.75010228, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77123511, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 2.7782633304595947 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.03617835, + "balance_loss_mlp": 1.02377141, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.8086507996235694, + "language_loss": 0.68014789, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70148969, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 2.7441773414611816 + }, + { + "auxiliary_loss_clip": 0.0104654, + "auxiliary_loss_mlp": 0.01047309, + "balance_loss_clip": 1.03240359, + "balance_loss_mlp": 1.03137052, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.9983783239888364, + "language_loss": 0.71501595, + "learning_rate": 2.755956816505072e-06, + "loss": 0.7359544, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.6672959327697754 + }, + { + "auxiliary_loss_clip": 0.01083741, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.03472638, + "balance_loss_mlp": 1.02543914, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 1.9693864526549154, + "language_loss": 0.73720467, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.7584368, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.562422752380371 + }, + { + "auxiliary_loss_clip": 0.01110458, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.03684783, + "balance_loss_mlp": 1.02262044, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.4795714153444295, + "language_loss": 0.83739555, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.85885215, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.570619821548462 + }, + { + "auxiliary_loss_clip": 0.01089472, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.03817344, + "balance_loss_mlp": 1.02154374, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.02153418110403, + "language_loss": 0.89904052, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92028224, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 2.6182849407196045 + }, + { + "auxiliary_loss_clip": 0.01090829, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.03747606, + "balance_loss_mlp": 1.01861763, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.8468995518572395, + "language_loss": 0.77709246, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.79833376, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 2.6111793518066406 + }, + { + "auxiliary_loss_clip": 0.01058013, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.03413439, + "balance_loss_mlp": 1.01586699, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.7283884887344194, + "language_loss": 0.67774594, + "learning_rate": 2.754153612280037e-06, + "loss": 0.69863194, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.617619037628174 + }, + { + "auxiliary_loss_clip": 0.01098197, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.03750992, + "balance_loss_mlp": 1.01671362, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.7687346383453546, + "language_loss": 0.5868963, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60817182, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.5834598541259766 + }, + { + "auxiliary_loss_clip": 0.01086255, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.03749168, + "balance_loss_mlp": 1.02730846, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 1.7456612445445068, + "language_loss": 0.69722021, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.71851528, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.602391481399536 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.00749612, + "balance_loss_clip": 1.03841889, + "balance_loss_mlp": 1.00025558, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.7939244076994756, + "language_loss": 0.76001596, + "learning_rate": 2.753071346464642e-06, + "loss": 0.77864587, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 2.5034897327423096 + }, + { + "auxiliary_loss_clip": 0.01065662, + "auxiliary_loss_mlp": 0.00749668, + "balance_loss_clip": 1.03668272, + "balance_loss_mlp": 1.00025654, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.6919535390849647, + "language_loss": 0.65976387, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.67791724, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 2.65242600440979 + }, + { + "auxiliary_loss_clip": 0.01079663, + "auxiliary_loss_mlp": 0.01042263, + "balance_loss_clip": 1.03967285, + "balance_loss_mlp": 1.02799392, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.198223005031174, + "language_loss": 0.72478318, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74600244, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 4.3204567432403564 + }, + { + "auxiliary_loss_clip": 0.01079859, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.0365212, + "balance_loss_mlp": 1.01960838, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 2.102161147577509, + "language_loss": 0.73263806, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75376582, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 2.726085662841797 + }, + { + "auxiliary_loss_clip": 0.01079152, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03433847, + "balance_loss_mlp": 1.01987541, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.154598369989381, + "language_loss": 0.71566236, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73678809, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 2.6273863315582275 + }, + { + "auxiliary_loss_clip": 0.00988542, + "auxiliary_loss_mlp": 0.0101521, + "balance_loss_clip": 1.00992429, + "balance_loss_mlp": 1.01354122, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9021942989015468, + "language_loss": 0.61139202, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63142955, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 3.0722615718841553 + }, + { + "auxiliary_loss_clip": 0.0109042, + "auxiliary_loss_mlp": 0.00749741, + "balance_loss_clip": 1.03618705, + "balance_loss_mlp": 1.00021112, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 2.3080637015827157, + "language_loss": 0.81384927, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83225083, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.592010021209717 + }, + { + "auxiliary_loss_clip": 0.0108936, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.03843629, + "balance_loss_mlp": 1.02241015, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 90.72889425805106, + "language_loss": 0.70552945, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72679079, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 2.6077444553375244 + }, + { + "auxiliary_loss_clip": 0.01095262, + "auxiliary_loss_mlp": 0.01046933, + "balance_loss_clip": 1.03685308, + "balance_loss_mlp": 1.03271127, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 2.2242757550522083, + "language_loss": 0.75805968, + "learning_rate": 2.750184048805956e-06, + "loss": 0.77948165, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.6071319580078125 + }, + { + "auxiliary_loss_clip": 0.01021426, + "auxiliary_loss_mlp": 0.01040062, + "balance_loss_clip": 1.03267741, + "balance_loss_mlp": 1.02624559, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 2.4142035881041544, + "language_loss": 0.78479975, + "learning_rate": 2.749823008443152e-06, + "loss": 0.80541462, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 4.272135019302368 + }, + { + "auxiliary_loss_clip": 0.01040604, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.03467679, + "balance_loss_mlp": 1.01910603, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 2.002209950510454, + "language_loss": 0.69393009, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.7146585, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 2.874725103378296 + }, + { + "auxiliary_loss_clip": 0.01038603, + "auxiliary_loss_mlp": 0.01037991, + "balance_loss_clip": 1.03172672, + "balance_loss_mlp": 1.02343559, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.9620909590021902, + "language_loss": 0.77783734, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.7986033, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.7290735244750977 + }, + { + "auxiliary_loss_clip": 0.01006048, + "auxiliary_loss_mlp": 0.01002183, + "balance_loss_clip": 1.01259089, + "balance_loss_mlp": 1.00047803, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9478383794833519, + "language_loss": 0.63066655, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65074885, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.2107455730438232 + }, + { + "auxiliary_loss_clip": 0.01077384, + "auxiliary_loss_mlp": 0.01049267, + "balance_loss_clip": 1.03747392, + "balance_loss_mlp": 1.03319764, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.6917619033556663, + "language_loss": 0.63441789, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65568447, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 2.716221570968628 + }, + { + "auxiliary_loss_clip": 0.01103777, + "auxiliary_loss_mlp": 0.01039453, + "balance_loss_clip": 1.04018164, + "balance_loss_mlp": 1.02589321, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 1.8821590860483637, + "language_loss": 0.78785956, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80929184, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 5.64096212387085 + }, + { + "auxiliary_loss_clip": 0.01081539, + "auxiliary_loss_mlp": 0.00749892, + "balance_loss_clip": 1.03790581, + "balance_loss_mlp": 1.00023246, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 2.5729161501761615, + "language_loss": 0.67867708, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69699138, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.762467861175537 + }, + { + "auxiliary_loss_clip": 0.0111415, + "auxiliary_loss_mlp": 0.01042931, + "balance_loss_clip": 1.0382787, + "balance_loss_mlp": 1.0300082, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.2065637063662047, + "language_loss": 0.78405815, + "learning_rate": 2.747294930536157e-06, + "loss": 0.80562901, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 2.570549726486206 + }, + { + "auxiliary_loss_clip": 0.01070364, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_clip": 1.03557968, + "balance_loss_mlp": 1.02436423, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 2.777854932788021, + "language_loss": 0.72645247, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74756074, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 2.656157970428467 + }, + { + "auxiliary_loss_clip": 0.01063473, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.0306617, + "balance_loss_mlp": 1.02079225, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 2.187172364264242, + "language_loss": 0.86039805, + "learning_rate": 2.746572367319791e-06, + "loss": 0.88138425, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 2.651034116744995 + }, + { + "auxiliary_loss_clip": 0.01080624, + "auxiliary_loss_mlp": 0.01042308, + "balance_loss_clip": 1.03747392, + "balance_loss_mlp": 1.02602351, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.381168758278939, + "language_loss": 0.70493251, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.7261619, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 2.5887880325317383 + }, + { + "auxiliary_loss_clip": 0.01114763, + "auxiliary_loss_mlp": 0.01043742, + "balance_loss_clip": 1.03883767, + "balance_loss_mlp": 1.03010392, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.4459912709043445, + "language_loss": 0.83200419, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85358924, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 2.5083045959472656 + }, + { + "auxiliary_loss_clip": 0.01093192, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.02124906, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.5747676289297265, + "language_loss": 0.73196393, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75323957, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 2.613826036453247 + }, + { + "auxiliary_loss_clip": 0.01086766, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.03663266, + "balance_loss_mlp": 1.02262354, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.6346509624979382, + "language_loss": 0.82303351, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84426695, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.623161792755127 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.03765702, + "balance_loss_mlp": 1.01920319, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.6798545963882547, + "language_loss": 0.73948634, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.76090586, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 2.5372233390808105 + }, + { + "auxiliary_loss_clip": 0.01070801, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.03667736, + "balance_loss_mlp": 1.02755284, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.754696816990406, + "language_loss": 0.73958689, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76072371, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.624035120010376 + }, + { + "auxiliary_loss_clip": 0.01107592, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.04012883, + "balance_loss_mlp": 1.02426255, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.8512854452750784, + "language_loss": 0.67525613, + "learning_rate": 2.744042505013797e-06, + "loss": 0.69671285, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 2.7328498363494873 + }, + { + "auxiliary_loss_clip": 0.0107284, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_clip": 1.03321791, + "balance_loss_mlp": 1.03280902, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 1.7845590100150968, + "language_loss": 0.74189448, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76311928, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 2.6091551780700684 + }, + { + "auxiliary_loss_clip": 0.01090195, + "auxiliary_loss_mlp": 0.01036732, + "balance_loss_clip": 1.03830898, + "balance_loss_mlp": 1.02273679, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 1.5456571227212934, + "language_loss": 0.71482801, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73609728, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 2.5864686965942383 + }, + { + "auxiliary_loss_clip": 0.0108912, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.03327048, + "balance_loss_mlp": 1.01661861, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.70450898207891, + "language_loss": 0.78260934, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80380476, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 2.512436628341675 + }, + { + "auxiliary_loss_clip": 0.01102653, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.0394994, + "balance_loss_mlp": 1.02123809, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 3.0444833223525136, + "language_loss": 0.78708255, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.80846071, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.609220504760742 + }, + { + "auxiliary_loss_clip": 0.01004029, + "auxiliary_loss_mlp": 0.01007282, + "balance_loss_clip": 1.01082373, + "balance_loss_mlp": 1.0055182, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8643747777641386, + "language_loss": 0.65013015, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67024326, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.0855941772460938 + }, + { + "auxiliary_loss_clip": 0.01076875, + "auxiliary_loss_mlp": 0.01045239, + "balance_loss_clip": 1.03365493, + "balance_loss_mlp": 1.02870452, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.3989358539696037, + "language_loss": 0.71432191, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73554307, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.634852886199951 + }, + { + "auxiliary_loss_clip": 0.0110047, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.03911817, + "balance_loss_mlp": 1.01799142, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 3.453381333090959, + "language_loss": 0.8189739, + "learning_rate": 2.741511260213862e-06, + "loss": 0.84029067, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.5472140312194824 + }, + { + "auxiliary_loss_clip": 0.0107631, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.03811836, + "balance_loss_mlp": 1.01883221, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.175798961650471, + "language_loss": 0.67169386, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69277537, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 2.7670743465423584 + }, + { + "auxiliary_loss_clip": 0.01115314, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.03908157, + "balance_loss_mlp": 1.02879405, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 1.9680392165487792, + "language_loss": 0.83067816, + "learning_rate": 2.740787794144541e-06, + "loss": 0.85226226, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.512441635131836 + }, + { + "auxiliary_loss_clip": 0.01110037, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.03982592, + "balance_loss_mlp": 1.02395809, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 2.1785892141555983, + "language_loss": 0.72913468, + "learning_rate": 2.7404260189669e-06, + "loss": 0.75060081, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.512406349182129 + }, + { + "auxiliary_loss_clip": 0.01093047, + "auxiliary_loss_mlp": 0.01041298, + "balance_loss_clip": 1.0392828, + "balance_loss_mlp": 1.02596748, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.8115874052263659, + "language_loss": 0.65471476, + "learning_rate": 2.740064215712231e-06, + "loss": 0.67605817, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.6322643756866455 + }, + { + "auxiliary_loss_clip": 0.01032561, + "auxiliary_loss_mlp": 0.01006977, + "balance_loss_clip": 1.00843716, + "balance_loss_mlp": 1.0056479, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7733263940502274, + "language_loss": 0.58249295, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60288835, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 3.0207457542419434 + }, + { + "auxiliary_loss_clip": 0.01095061, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.04075873, + "balance_loss_mlp": 1.02227044, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.7595859293641376, + "language_loss": 0.78934371, + "learning_rate": 2.739340525026686e-06, + "loss": 0.8106401, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 2.570495843887329 + }, + { + "auxiliary_loss_clip": 0.01091068, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.03779268, + "balance_loss_mlp": 1.01749659, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 2.4087954434927905, + "language_loss": 0.7768141, + "learning_rate": 2.738978637623252e-06, + "loss": 0.79802823, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.6262707710266113 + }, + { + "auxiliary_loss_clip": 0.01082069, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.03358412, + "balance_loss_mlp": 1.01858127, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.880172205832662, + "language_loss": 0.74762255, + "learning_rate": 2.738616722197674e-06, + "loss": 0.76876915, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 2.6317594051361084 + }, + { + "auxiliary_loss_clip": 0.01069224, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.03551126, + "balance_loss_mlp": 1.02555418, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 2.0911285385274723, + "language_loss": 0.80181575, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.82290196, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 2.5890846252441406 + }, + { + "auxiliary_loss_clip": 0.01118463, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.04041886, + "balance_loss_mlp": 1.02708662, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 7.0872062145616415, + "language_loss": 0.83450603, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85611355, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 4.144981384277344 + }, + { + "auxiliary_loss_clip": 0.01096684, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.03688669, + "balance_loss_mlp": 1.02362418, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.3390971831363783, + "language_loss": 0.86692989, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88827288, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 2.52812123298645 + }, + { + "auxiliary_loss_clip": 0.01042736, + "auxiliary_loss_mlp": 0.00750051, + "balance_loss_clip": 1.03044558, + "balance_loss_mlp": 1.00028551, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.4281061003563593, + "language_loss": 0.83984023, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85776812, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 2.6636574268341064 + }, + { + "auxiliary_loss_clip": 0.01066236, + "auxiliary_loss_mlp": 0.00749527, + "balance_loss_clip": 1.03243113, + "balance_loss_mlp": 1.00022101, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.6434637392162221, + "language_loss": 0.82744086, + "learning_rate": 2.736806725217998e-06, + "loss": 0.84559846, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 2.6416656970977783 + }, + { + "auxiliary_loss_clip": 0.01070163, + "auxiliary_loss_mlp": 0.01052645, + "balance_loss_clip": 1.03462636, + "balance_loss_mlp": 1.03729081, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.9757507243276229, + "language_loss": 0.71228236, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73351049, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.6768558025360107 + }, + { + "auxiliary_loss_clip": 0.01075673, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.03664112, + "balance_loss_mlp": 1.02038884, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 1.838243025327484, + "language_loss": 0.80803323, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82912862, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.6543946266174316 + }, + { + "auxiliary_loss_clip": 0.01056954, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.03767681, + "balance_loss_mlp": 1.01768589, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.8655719825589199, + "language_loss": 0.75068194, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77156389, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.65120530128479 + }, + { + "auxiliary_loss_clip": 0.01072093, + "auxiliary_loss_mlp": 0.01041776, + "balance_loss_clip": 1.03373623, + "balance_loss_mlp": 1.02726841, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 1.844782638659717, + "language_loss": 0.71566606, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73680478, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 4.0936150550842285 + }, + { + "auxiliary_loss_clip": 0.01046649, + "auxiliary_loss_mlp": 0.00749451, + "balance_loss_clip": 1.03482175, + "balance_loss_mlp": 1.00019598, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 2.1722428825060747, + "language_loss": 0.74741089, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76537186, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.646418333053589 + }, + { + "auxiliary_loss_clip": 0.01085009, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.03570569, + "balance_loss_mlp": 1.01470923, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 3.0871789812363977, + "language_loss": 0.81074214, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83187175, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.6450741291046143 + }, + { + "auxiliary_loss_clip": 0.01081893, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.036291, + "balance_loss_mlp": 1.015347, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 2.737660049569458, + "language_loss": 0.74450874, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.76562238, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 2.6022307872772217 + }, + { + "auxiliary_loss_clip": 0.0107918, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.03679943, + "balance_loss_mlp": 1.02519739, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 1.8365842408774573, + "language_loss": 0.66289747, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68409598, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 2.641908884048462 + }, + { + "auxiliary_loss_clip": 0.0109828, + "auxiliary_loss_mlp": 0.0103479, + "balance_loss_clip": 1.0365715, + "balance_loss_mlp": 1.02197456, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.800009301397553, + "language_loss": 0.814987, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83631772, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 5.535091400146484 + }, + { + "auxiliary_loss_clip": 0.01001501, + "auxiliary_loss_mlp": 0.01020499, + "balance_loss_clip": 1.00788891, + "balance_loss_mlp": 1.01899135, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7231898105020699, + "language_loss": 0.53170824, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55192822, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 3.222303628921509 + }, + { + "auxiliary_loss_clip": 0.01091363, + "auxiliary_loss_mlp": 0.00749836, + "balance_loss_clip": 1.03747022, + "balance_loss_mlp": 1.00039697, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 2.9107822773724012, + "language_loss": 0.75524592, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77365792, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.5814425945281982 + }, + { + "auxiliary_loss_clip": 0.01031417, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.02981436, + "balance_loss_mlp": 1.02564657, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.7062878247540947, + "language_loss": 0.76535517, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78606331, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 2.7235889434814453 + }, + { + "auxiliary_loss_clip": 0.01084052, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.03835964, + "balance_loss_mlp": 1.02002525, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.0413371297614744, + "language_loss": 0.82156909, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84274942, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 2.6633400917053223 + }, + { + "auxiliary_loss_clip": 0.01114473, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.03989315, + "balance_loss_mlp": 1.01829422, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.1284728955513588, + "language_loss": 0.76881969, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.79028749, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.5154869556427 + }, + { + "auxiliary_loss_clip": 0.01085751, + "auxiliary_loss_mlp": 0.01034283, + "balance_loss_clip": 1.03643429, + "balance_loss_mlp": 1.02003181, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.1650610066417393, + "language_loss": 0.7210511, + "learning_rate": 2.731372550178393e-06, + "loss": 0.7422514, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 2.6860806941986084 + }, + { + "auxiliary_loss_clip": 0.01100822, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.03667855, + "balance_loss_mlp": 1.01565909, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.588516002746073, + "language_loss": 0.66129178, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68259633, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.570359706878662 + }, + { + "auxiliary_loss_clip": 0.01109187, + "auxiliary_loss_mlp": 0.0103689, + "balance_loss_clip": 1.03488839, + "balance_loss_mlp": 1.02323437, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 2.5932101530825893, + "language_loss": 0.78441638, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80587721, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 2.519047975540161 + }, + { + "auxiliary_loss_clip": 0.01100644, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.03648329, + "balance_loss_mlp": 1.02045119, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 2.0130803844952334, + "language_loss": 0.69892091, + "learning_rate": 2.73028496487595e-06, + "loss": 0.7202701, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 2.5867385864257812 + }, + { + "auxiliary_loss_clip": 0.01055002, + "auxiliary_loss_mlp": 0.01035047, + "balance_loss_clip": 1.02932227, + "balance_loss_mlp": 1.02084947, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 2.0872166330369186, + "language_loss": 0.71605456, + "learning_rate": 2.729922381038513e-06, + "loss": 0.73695505, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.629641532897949 + }, + { + "auxiliary_loss_clip": 0.01074482, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.03724718, + "balance_loss_mlp": 1.02086782, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.6461993195854585, + "language_loss": 0.74204624, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.7631259, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.722883701324463 + }, + { + "auxiliary_loss_clip": 0.01109879, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.03546751, + "balance_loss_mlp": 1.01698267, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 1.6117328378314861, + "language_loss": 0.65961117, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68102115, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 2.5350184440612793 + }, + { + "auxiliary_loss_clip": 0.0108522, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.03960228, + "balance_loss_mlp": 1.02335501, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 2.0160753464328893, + "language_loss": 0.75318223, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77440369, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 2.7386534214019775 + }, + { + "auxiliary_loss_clip": 0.01110025, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.03618181, + "balance_loss_mlp": 1.02435219, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 2.755624289655829, + "language_loss": 0.71862346, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74010229, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.5538394451141357 + }, + { + "auxiliary_loss_clip": 0.01111301, + "auxiliary_loss_mlp": 0.0103766, + "balance_loss_clip": 1.03647566, + "balance_loss_mlp": 1.02386737, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.9624784701497322, + "language_loss": 0.73386908, + "learning_rate": 2.728109046945403e-06, + "loss": 0.7553587, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.515634059906006 + }, + { + "auxiliary_loss_clip": 0.01003887, + "auxiliary_loss_mlp": 0.01000615, + "balance_loss_clip": 1.00954604, + "balance_loss_mlp": 0.99894565, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8339919718249518, + "language_loss": 0.60637581, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62642074, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.072047472000122 + }, + { + "auxiliary_loss_clip": 0.01074318, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.03895748, + "balance_loss_mlp": 1.02006006, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.185108567744801, + "language_loss": 0.67068779, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69175738, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 2.5991387367248535 + }, + { + "auxiliary_loss_clip": 0.01097828, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.03566694, + "balance_loss_mlp": 1.02342772, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 3.2075704485334624, + "language_loss": 0.90181071, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92314196, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 2.5746679306030273 + }, + { + "auxiliary_loss_clip": 0.01078788, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03500938, + "balance_loss_mlp": 1.02328897, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.9303836717520364, + "language_loss": 0.73622942, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75737631, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 2.684396266937256 + }, + { + "auxiliary_loss_clip": 0.01111489, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.03742182, + "balance_loss_mlp": 1.02798963, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.744492006713837, + "language_loss": 0.73061788, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75215292, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.639650583267212 + }, + { + "auxiliary_loss_clip": 0.01113498, + "auxiliary_loss_mlp": 0.01044112, + "balance_loss_clip": 1.03847551, + "balance_loss_mlp": 1.02961659, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4519391666577275, + "language_loss": 0.79698431, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81856048, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 2.654301166534424 + }, + { + "auxiliary_loss_clip": 0.0109653, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.03498793, + "balance_loss_mlp": 1.02940416, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 2.2721879790015027, + "language_loss": 0.77246475, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79385644, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.647475481033325 + }, + { + "auxiliary_loss_clip": 0.01102958, + "auxiliary_loss_mlp": 0.01030699, + "balance_loss_clip": 1.03537679, + "balance_loss_mlp": 1.01947594, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.649243662735268, + "language_loss": 0.72525334, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74658996, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 2.4934604167938232 + }, + { + "auxiliary_loss_clip": 0.01080472, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.0338428, + "balance_loss_mlp": 1.02738166, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.7878143598426037, + "language_loss": 0.71007025, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73127007, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.6518707275390625 + }, + { + "auxiliary_loss_clip": 0.01113905, + "auxiliary_loss_mlp": 0.01039025, + "balance_loss_clip": 1.03957582, + "balance_loss_mlp": 1.02560818, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.75027104097879, + "language_loss": 0.75283706, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77436638, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 2.51413631439209 + }, + { + "auxiliary_loss_clip": 0.01093811, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.03474426, + "balance_loss_mlp": 1.01938558, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 1.9979764555867625, + "language_loss": 0.65808976, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.67935878, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.549588441848755 + }, + { + "auxiliary_loss_clip": 0.01091089, + "auxiliary_loss_mlp": 0.01041978, + "balance_loss_clip": 1.03356802, + "balance_loss_mlp": 1.02761292, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.6657826278578716, + "language_loss": 0.85564846, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87697911, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 2.5494093894958496 + }, + { + "auxiliary_loss_clip": 0.01100794, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.0382154, + "balance_loss_mlp": 1.01999831, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 2.182556465439149, + "language_loss": 0.85211444, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87345445, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 4.101025104522705 + }, + { + "auxiliary_loss_clip": 0.01103013, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.04099023, + "balance_loss_mlp": 1.02181458, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.6962807291929465, + "language_loss": 0.78235513, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80374205, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.5820529460906982 + }, + { + "auxiliary_loss_clip": 0.01101766, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.03820515, + "balance_loss_mlp": 1.02011538, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 2.183287902574274, + "language_loss": 0.73458308, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75593233, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.6344189643859863 + }, + { + "auxiliary_loss_clip": 0.01093645, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_clip": 1.03682613, + "balance_loss_mlp": 1.03645897, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.531799790808256, + "language_loss": 0.75970471, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.7811529, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.628924608230591 + }, + { + "auxiliary_loss_clip": 0.01077067, + "auxiliary_loss_mlp": 0.01039345, + "balance_loss_clip": 1.04043937, + "balance_loss_mlp": 1.0259397, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.7051933822971876, + "language_loss": 0.82201385, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84317803, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.774858236312866 + }, + { + "auxiliary_loss_clip": 0.01006866, + "auxiliary_loss_mlp": 0.00999775, + "balance_loss_clip": 1.00971651, + "balance_loss_mlp": 0.99817771, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.7796500657261142, + "language_loss": 0.53364873, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55371511, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.3454127311706543 + }, + { + "auxiliary_loss_clip": 0.01062131, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.03431797, + "balance_loss_mlp": 1.02564907, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.78277097710434, + "language_loss": 0.88632196, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.90733534, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 4.289905548095703 + }, + { + "auxiliary_loss_clip": 0.01102305, + "auxiliary_loss_mlp": 0.01034301, + "balance_loss_clip": 1.03760576, + "balance_loss_mlp": 1.02027047, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.7553018051911806, + "language_loss": 0.7887587, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81012475, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.598792314529419 + }, + { + "auxiliary_loss_clip": 0.01072381, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.03509986, + "balance_loss_mlp": 1.02261794, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 2.9499550883453525, + "language_loss": 0.62659967, + "learning_rate": 2.72048552626888e-06, + "loss": 0.64769614, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.6895499229431152 + }, + { + "auxiliary_loss_clip": 0.01087135, + "auxiliary_loss_mlp": 0.00749585, + "balance_loss_clip": 1.03638172, + "balance_loss_mlp": 1.00043249, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.629642219719737, + "language_loss": 0.80094552, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.81931275, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 2.670388698577881 + }, + { + "auxiliary_loss_clip": 0.01071907, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.0406394, + "balance_loss_mlp": 1.01856863, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.4312491130052933, + "language_loss": 0.82520181, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84623981, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 2.679004430770874 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.03605378, + "balance_loss_mlp": 1.02103353, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 2.1166740188410347, + "language_loss": 0.93318313, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95453322, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 2.61218523979187 + }, + { + "auxiliary_loss_clip": 0.01106793, + "auxiliary_loss_mlp": 0.01043271, + "balance_loss_clip": 1.03882647, + "balance_loss_mlp": 1.02844167, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 2.2491005054114797, + "language_loss": 0.7954855, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81698608, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 4.191368818283081 + }, + { + "auxiliary_loss_clip": 0.0109024, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.04073513, + "balance_loss_mlp": 1.02177668, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 1.8551367145333577, + "language_loss": 0.83549529, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85674876, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 4.127662420272827 + }, + { + "auxiliary_loss_clip": 0.01110304, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.03840363, + "balance_loss_mlp": 1.02429819, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.116660726958811, + "language_loss": 0.63606513, + "learning_rate": 2.718305158935434e-06, + "loss": 0.65754247, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 2.555600643157959 + }, + { + "auxiliary_loss_clip": 0.0108088, + "auxiliary_loss_mlp": 0.01027807, + "balance_loss_clip": 1.03517747, + "balance_loss_mlp": 1.01517701, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.450298161982634, + "language_loss": 0.78645712, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80754399, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 2.6595897674560547 + }, + { + "auxiliary_loss_clip": 0.01080877, + "auxiliary_loss_mlp": 0.00750008, + "balance_loss_clip": 1.03758359, + "balance_loss_mlp": 1.00062168, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.7908140808751658, + "language_loss": 0.75717866, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77548754, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 2.7478559017181396 + }, + { + "auxiliary_loss_clip": 0.01071736, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.03820848, + "balance_loss_mlp": 1.01817501, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 2.5088940045672548, + "language_loss": 0.63585764, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.65688878, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.73771071434021 + }, + { + "auxiliary_loss_clip": 0.01062092, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.03184426, + "balance_loss_mlp": 1.02029586, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.8313891601636672, + "language_loss": 0.72911197, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75007033, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 2.789813756942749 + }, + { + "auxiliary_loss_clip": 0.01098227, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.03666878, + "balance_loss_mlp": 1.02618587, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 2.3299832937097915, + "language_loss": 0.73100269, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75237787, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.6454379558563232 + }, + { + "auxiliary_loss_clip": 0.0101696, + "auxiliary_loss_mlp": 0.01006266, + "balance_loss_clip": 1.00741291, + "balance_loss_mlp": 1.00497842, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8072562231506469, + "language_loss": 0.60400331, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62423551, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 3.265713691711426 + }, + { + "auxiliary_loss_clip": 0.01101128, + "auxiliary_loss_mlp": 0.01029205, + "balance_loss_clip": 1.03675485, + "balance_loss_mlp": 1.01602054, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 1.7333920427940444, + "language_loss": 0.69774997, + "learning_rate": 2.715760157917357e-06, + "loss": 0.71905333, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 2.6699788570404053 + }, + { + "auxiliary_loss_clip": 0.01089938, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.03701866, + "balance_loss_mlp": 1.02220929, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4938854328450488, + "language_loss": 0.7455101, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76676309, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 2.7711081504821777 + }, + { + "auxiliary_loss_clip": 0.01091533, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.03852522, + "balance_loss_mlp": 1.02351582, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 2.050284454846463, + "language_loss": 0.71013248, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73141193, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.637071132659912 + }, + { + "auxiliary_loss_clip": 0.01083798, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.03416514, + "balance_loss_mlp": 1.02412939, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.641734555937927, + "language_loss": 0.64032996, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.6615532, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.718541145324707 + }, + { + "auxiliary_loss_clip": 0.01100773, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.03561532, + "balance_loss_mlp": 1.01833081, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.138844805089097, + "language_loss": 0.73416716, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.7554906, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.5764660835266113 + }, + { + "auxiliary_loss_clip": 0.01077647, + "auxiliary_loss_mlp": 0.01031768, + "balance_loss_clip": 1.0351721, + "balance_loss_mlp": 1.01873851, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.6357787493235367, + "language_loss": 0.74587595, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.7669701, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 2.7357709407806396 + }, + { + "auxiliary_loss_clip": 0.01095032, + "auxiliary_loss_mlp": 0.01039095, + "balance_loss_clip": 1.04101229, + "balance_loss_mlp": 1.02582741, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.728665500239335, + "language_loss": 0.72216558, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74350691, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 2.6550087928771973 + }, + { + "auxiliary_loss_clip": 0.01059427, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.03234231, + "balance_loss_mlp": 1.01851773, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 2.4004641801345854, + "language_loss": 0.83776498, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.85867923, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 2.776524543762207 + }, + { + "auxiliary_loss_clip": 0.01070202, + "auxiliary_loss_mlp": 0.01040802, + "balance_loss_clip": 1.03597927, + "balance_loss_mlp": 1.0261215, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.9002101481571732, + "language_loss": 0.70947492, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73058498, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.768080949783325 + }, + { + "auxiliary_loss_clip": 0.01084076, + "auxiliary_loss_mlp": 0.01039054, + "balance_loss_clip": 1.03541255, + "balance_loss_mlp": 1.0253334, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.779624779548971, + "language_loss": 0.67493141, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.6961627, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.7513623237609863 + }, + { + "auxiliary_loss_clip": 0.0107858, + "auxiliary_loss_mlp": 0.01035198, + "balance_loss_clip": 1.03190565, + "balance_loss_mlp": 1.02087486, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 6.826661013294048, + "language_loss": 0.79218841, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81332624, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 3.0095973014831543 + }, + { + "auxiliary_loss_clip": 0.01082902, + "auxiliary_loss_mlp": 0.01043051, + "balance_loss_clip": 1.03538871, + "balance_loss_mlp": 1.02719665, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.8604129905786633, + "language_loss": 0.70918602, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73044556, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 2.6744155883789062 + }, + { + "auxiliary_loss_clip": 0.01096456, + "auxiliary_loss_mlp": 0.01040323, + "balance_loss_clip": 1.03462756, + "balance_loss_mlp": 1.02746022, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.065755380127677, + "language_loss": 0.61469585, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63606369, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.6567299365997314 + }, + { + "auxiliary_loss_clip": 0.01100792, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.037323, + "balance_loss_mlp": 1.02024472, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.8787947278068013, + "language_loss": 0.76571536, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78705692, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 2.6400790214538574 + }, + { + "auxiliary_loss_clip": 0.01076159, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.03701818, + "balance_loss_mlp": 1.01456976, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.5928422244583047, + "language_loss": 0.80171764, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82275224, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 2.7050085067749023 + }, + { + "auxiliary_loss_clip": 0.01088447, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.03686702, + "balance_loss_mlp": 1.02069569, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 1.7616637275908236, + "language_loss": 0.74410754, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.7653473, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 2.7670834064483643 + }, + { + "auxiliary_loss_clip": 0.01079277, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.03489137, + "balance_loss_mlp": 1.02456939, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.8156853028721087, + "language_loss": 0.65967011, + "learning_rate": 2.709938026276208e-06, + "loss": 0.68083876, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 2.6900203227996826 + }, + { + "auxiliary_loss_clip": 0.01080018, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.03412473, + "balance_loss_mlp": 1.02246439, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.5525099020866704, + "language_loss": 0.65568399, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.67685926, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 2.637312173843384 + }, + { + "auxiliary_loss_clip": 0.01034935, + "auxiliary_loss_mlp": 0.01038457, + "balance_loss_clip": 1.0321126, + "balance_loss_mlp": 1.02313924, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 1.8540497300640872, + "language_loss": 0.81477594, + "learning_rate": 2.709209774085071e-06, + "loss": 0.8355099, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 4.304243087768555 + }, + { + "auxiliary_loss_clip": 0.01089135, + "auxiliary_loss_mlp": 0.01034068, + "balance_loss_clip": 1.03697574, + "balance_loss_mlp": 1.02016234, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 2.320450348709507, + "language_loss": 0.73409045, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75532246, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 2.680548667907715 + }, + { + "auxiliary_loss_clip": 0.01097803, + "auxiliary_loss_mlp": 0.0103606, + "balance_loss_clip": 1.03687906, + "balance_loss_mlp": 1.02289939, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.762572524402882, + "language_loss": 0.66333061, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68466926, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.533567190170288 + }, + { + "auxiliary_loss_clip": 0.01103538, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.03905082, + "balance_loss_mlp": 1.02680171, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.3940727074367185, + "language_loss": 0.71569335, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73712897, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.631622791290283 + }, + { + "auxiliary_loss_clip": 0.01084655, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.03599846, + "balance_loss_mlp": 1.01915884, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.534332129726684, + "language_loss": 0.79743612, + "learning_rate": 2.707752947093611e-06, + "loss": 0.8186143, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.6272428035736084 + }, + { + "auxiliary_loss_clip": 0.01059724, + "auxiliary_loss_mlp": 0.01041007, + "balance_loss_clip": 1.03275359, + "balance_loss_mlp": 1.02592754, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 1.9889637312399362, + "language_loss": 0.82664156, + "learning_rate": 2.70738867321606e-06, + "loss": 0.8476488, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.6566312313079834 + }, + { + "auxiliary_loss_clip": 0.01101504, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.03916049, + "balance_loss_mlp": 1.02191985, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.5712955678818397, + "language_loss": 0.71499735, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73636925, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 4.119339942932129 + }, + { + "auxiliary_loss_clip": 0.01077396, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.03493786, + "balance_loss_mlp": 1.02076364, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 2.213629318717901, + "language_loss": 0.84983224, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87095666, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.5667314529418945 + }, + { + "auxiliary_loss_clip": 0.01102835, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.03743052, + "balance_loss_mlp": 1.01841378, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 2.6890451545675833, + "language_loss": 0.76047939, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78183246, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.5345518589019775 + }, + { + "auxiliary_loss_clip": 0.01080398, + "auxiliary_loss_mlp": 0.01040137, + "balance_loss_clip": 1.03826022, + "balance_loss_mlp": 1.0258379, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 1.9581159514019815, + "language_loss": 0.78624076, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.80744612, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 2.7186741828918457 + }, + { + "auxiliary_loss_clip": 0.01074279, + "auxiliary_loss_mlp": 0.01040572, + "balance_loss_clip": 1.03317976, + "balance_loss_mlp": 1.02463436, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 2.960000397704864, + "language_loss": 0.88290465, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90405321, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.642472267150879 + }, + { + "auxiliary_loss_clip": 0.01102335, + "auxiliary_loss_mlp": 0.01038932, + "balance_loss_clip": 1.03832626, + "balance_loss_mlp": 1.02490127, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 2.006265427219611, + "language_loss": 0.69479859, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71621132, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 2.6217894554138184 + }, + { + "auxiliary_loss_clip": 0.01065784, + "auxiliary_loss_mlp": 0.01035118, + "balance_loss_clip": 1.03271389, + "balance_loss_mlp": 1.02075374, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 2.264040591852907, + "language_loss": 0.77212179, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79313076, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 4.215281963348389 + }, + { + "auxiliary_loss_clip": 0.01065724, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.0362978, + "balance_loss_mlp": 1.02320766, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 2.010451472187934, + "language_loss": 0.760481, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78150463, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 4.192890644073486 + }, + { + "auxiliary_loss_clip": 0.01003921, + "auxiliary_loss_mlp": 0.00996615, + "balance_loss_clip": 1.0078814, + "balance_loss_mlp": 0.99541658, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.9340448830150097, + "language_loss": 0.60726988, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62727523, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 3.0126821994781494 + }, + { + "auxiliary_loss_clip": 0.01117626, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_clip": 1.03891253, + "balance_loss_mlp": 1.02212191, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.0399789942199202, + "language_loss": 0.74772429, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76927739, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 2.4715006351470947 + }, + { + "auxiliary_loss_clip": 0.01103817, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_clip": 1.03765416, + "balance_loss_mlp": 1.02921152, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.7075777885586008, + "language_loss": 0.81542134, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83689559, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 2.5817196369171143 + }, + { + "auxiliary_loss_clip": 0.01087224, + "auxiliary_loss_mlp": 0.01032959, + "balance_loss_clip": 1.0339731, + "balance_loss_mlp": 1.01928592, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.9330364915424176, + "language_loss": 0.7666474, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.78784919, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 2.583587646484375 + }, + { + "auxiliary_loss_clip": 0.01074303, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.03609872, + "balance_loss_mlp": 1.01641345, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 1.765191333171406, + "language_loss": 0.72591406, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74694061, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.643291473388672 + }, + { + "auxiliary_loss_clip": 0.01099173, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.01944232, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.9088298646234196, + "language_loss": 0.65658796, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67790508, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 2.618161916732788 + }, + { + "auxiliary_loss_clip": 0.01100001, + "auxiliary_loss_mlp": 0.01046278, + "balance_loss_clip": 1.03943014, + "balance_loss_mlp": 1.0315969, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.5469034093425362, + "language_loss": 0.7370832, + "learning_rate": 2.701921353880734e-06, + "loss": 0.75854599, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 2.590055465698242 + }, + { + "auxiliary_loss_clip": 0.01078248, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.03504169, + "balance_loss_mlp": 1.02298653, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.8643701657959852, + "language_loss": 0.74865162, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76979345, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 2.6519412994384766 + }, + { + "auxiliary_loss_clip": 0.01096375, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.03611743, + "balance_loss_mlp": 1.02146041, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 2.0729262482063473, + "language_loss": 0.76551193, + "learning_rate": 2.701191924463126e-06, + "loss": 0.78683412, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 2.8270986080169678 + }, + { + "auxiliary_loss_clip": 0.01081128, + "auxiliary_loss_mlp": 0.00750143, + "balance_loss_clip": 1.03211427, + "balance_loss_mlp": 1.00070095, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 3.46024074763271, + "language_loss": 0.82003933, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83835196, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 2.6296725273132324 + }, + { + "auxiliary_loss_clip": 0.01112782, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.03851688, + "balance_loss_mlp": 1.02131033, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 3.0932341927557387, + "language_loss": 0.85567236, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87715018, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.5621159076690674 + }, + { + "auxiliary_loss_clip": 0.01075149, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.03582335, + "balance_loss_mlp": 1.02456522, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.7784199261689664, + "language_loss": 0.81825876, + "learning_rate": 2.700097580951786e-06, + "loss": 0.83939213, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 2.665299415588379 + }, + { + "auxiliary_loss_clip": 0.01086177, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.03423417, + "balance_loss_mlp": 1.02627063, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 5.155585044355558, + "language_loss": 0.73352593, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75478774, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 2.645394802093506 + }, + { + "auxiliary_loss_clip": 0.01095465, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.03477526, + "balance_loss_mlp": 1.02540207, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 2.0686745664229536, + "language_loss": 0.67726815, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69861454, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.690491199493408 + }, + { + "auxiliary_loss_clip": 0.01110099, + "auxiliary_loss_mlp": 0.01037242, + "balance_loss_clip": 1.03648829, + "balance_loss_mlp": 1.02495706, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.6520043854278152, + "language_loss": 0.73841238, + "learning_rate": 2.699002998510517e-06, + "loss": 0.75988579, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.550947427749634 + }, + { + "auxiliary_loss_clip": 0.01086093, + "auxiliary_loss_mlp": 0.00749444, + "balance_loss_clip": 1.03783691, + "balance_loss_mlp": 1.00056791, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.8182569433632236, + "language_loss": 0.76733899, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.78569436, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 2.751546621322632 + }, + { + "auxiliary_loss_clip": 0.01080827, + "auxiliary_loss_mlp": 0.01044132, + "balance_loss_clip": 1.03255057, + "balance_loss_mlp": 1.02877212, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 3.217951429736772, + "language_loss": 0.7657091, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78695869, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 2.6314961910247803 + }, + { + "auxiliary_loss_clip": 0.01087524, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.03617144, + "balance_loss_mlp": 1.02293277, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.347871059801615, + "language_loss": 0.646231, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.66746682, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 2.6704115867614746 + }, + { + "auxiliary_loss_clip": 0.01064661, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.02963626, + "balance_loss_mlp": 1.0259428, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.703461300921775, + "language_loss": 0.83233476, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85337704, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.676090955734253 + }, + { + "auxiliary_loss_clip": 0.01082432, + "auxiliary_loss_mlp": 0.0074984, + "balance_loss_clip": 1.03646755, + "balance_loss_mlp": 1.00066447, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.7732865010443604, + "language_loss": 0.75100678, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.76932949, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 2.707521438598633 + }, + { + "auxiliary_loss_clip": 0.01097835, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.03534746, + "balance_loss_mlp": 1.02702022, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.339248577039585, + "language_loss": 0.71516728, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73654342, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 2.533507823944092 + }, + { + "auxiliary_loss_clip": 0.0107543, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.03341198, + "balance_loss_mlp": 1.02031922, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 2.2406393411039778, + "language_loss": 0.75134933, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77242482, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 2.6380653381347656 + }, + { + "auxiliary_loss_clip": 0.01070662, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.03357935, + "balance_loss_mlp": 1.02052426, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.7810435924519625, + "language_loss": 0.74058998, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76163191, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 2.7011237144470215 + }, + { + "auxiliary_loss_clip": 0.01089862, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.03390181, + "balance_loss_mlp": 1.01695943, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.571026316827869, + "language_loss": 0.77067876, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79187602, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 2.5932610034942627 + }, + { + "auxiliary_loss_clip": 0.01113009, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.03742099, + "balance_loss_mlp": 1.02124262, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.8429276803720536, + "language_loss": 0.70619327, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.7276758, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.608022451400757 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.03804541, + "balance_loss_mlp": 1.01661909, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.2905314021046648, + "language_loss": 0.72082186, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74225271, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.5905771255493164 + }, + { + "auxiliary_loss_clip": 0.01083733, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.03320026, + "balance_loss_mlp": 1.02253234, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 2.507318592664467, + "language_loss": 0.70263016, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72383463, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 4.207824230194092 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.01035701, + "balance_loss_clip": 1.03731823, + "balance_loss_mlp": 1.02342296, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 2.225572619494801, + "language_loss": 0.80192757, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82327825, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.7109994888305664 + }, + { + "auxiliary_loss_clip": 0.01087604, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.03730488, + "balance_loss_mlp": 1.01940751, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 2.71963338897537, + "language_loss": 0.66335636, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68456173, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.577458381652832 + }, + { + "auxiliary_loss_clip": 0.01070585, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.03447747, + "balance_loss_mlp": 1.01976275, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.5928020601537418, + "language_loss": 0.57041442, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59144545, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.8635413646698 + }, + { + "auxiliary_loss_clip": 0.01079663, + "auxiliary_loss_mlp": 0.01041806, + "balance_loss_clip": 1.038311, + "balance_loss_mlp": 1.02948594, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.6516694693270981, + "language_loss": 0.84416783, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86538249, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.7193589210510254 + }, + { + "auxiliary_loss_clip": 0.0108406, + "auxiliary_loss_mlp": 0.01038686, + "balance_loss_clip": 1.03726339, + "balance_loss_mlp": 1.02584672, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.33631725047707, + "language_loss": 0.81523317, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83646065, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 2.5574800968170166 + }, + { + "auxiliary_loss_clip": 0.01096268, + "auxiliary_loss_mlp": 0.00749655, + "balance_loss_clip": 1.03704262, + "balance_loss_mlp": 1.00044656, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.7727742410149376, + "language_loss": 0.75328493, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77174413, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 4.039013624191284 + }, + { + "auxiliary_loss_clip": 0.01093866, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.03670442, + "balance_loss_mlp": 1.01963329, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.021498529898941, + "language_loss": 0.74004072, + "learning_rate": 2.692065118669195e-06, + "loss": 0.76131737, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 2.6185495853424072 + }, + { + "auxiliary_loss_clip": 0.01063219, + "auxiliary_loss_mlp": 0.01040245, + "balance_loss_clip": 1.0368588, + "balance_loss_mlp": 1.02504039, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 1.6292970304683017, + "language_loss": 0.66781604, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.6888507, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 2.6837847232818604 + }, + { + "auxiliary_loss_clip": 0.01067677, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.03725052, + "balance_loss_mlp": 1.02325416, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 2.039751300515475, + "language_loss": 0.7093662, + "learning_rate": 2.691334262772948e-06, + "loss": 0.73042226, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 2.9459152221679688 + }, + { + "auxiliary_loss_clip": 0.01084323, + "auxiliary_loss_mlp": 0.01038109, + "balance_loss_clip": 1.03340614, + "balance_loss_mlp": 1.02374482, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.1943208627636777, + "language_loss": 0.71681893, + "learning_rate": 2.690968795494699e-06, + "loss": 0.73804331, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 2.6551833152770996 + }, + { + "auxiliary_loss_clip": 0.01075911, + "auxiliary_loss_mlp": 0.01037424, + "balance_loss_clip": 1.03434134, + "balance_loss_mlp": 1.02429926, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7040993768089874, + "language_loss": 0.82514715, + "learning_rate": 2.690603302014844e-06, + "loss": 0.84628046, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 2.7161149978637695 + }, + { + "auxiliary_loss_clip": 0.01065769, + "auxiliary_loss_mlp": 0.01037456, + "balance_loss_clip": 1.03674424, + "balance_loss_mlp": 1.0237174, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 2.570968586592904, + "language_loss": 0.70614237, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72717464, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 4.242826700210571 + }, + { + "auxiliary_loss_clip": 0.01046623, + "auxiliary_loss_mlp": 0.00750021, + "balance_loss_clip": 1.0318079, + "balance_loss_mlp": 1.00054765, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.8007755753551844, + "language_loss": 0.78565896, + "learning_rate": 2.689872236505755e-06, + "loss": 0.80362546, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 4.457036256790161 + }, + { + "auxiliary_loss_clip": 0.01090994, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.03729868, + "balance_loss_mlp": 1.01703155, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 5.5272509953972, + "language_loss": 0.7863903, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80760133, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 2.6277077198028564 + }, + { + "auxiliary_loss_clip": 0.01073898, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.03760707, + "balance_loss_mlp": 1.01818931, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.330827151247476, + "language_loss": 0.88671917, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90777552, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 2.673830509185791 + }, + { + "auxiliary_loss_clip": 0.01076703, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.03745043, + "balance_loss_mlp": 1.02134979, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 1.845243376746891, + "language_loss": 0.64153445, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66264302, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 2.6593985557556152 + }, + { + "auxiliary_loss_clip": 0.01098283, + "auxiliary_loss_mlp": 0.01033727, + "balance_loss_clip": 1.03516436, + "balance_loss_mlp": 1.01956487, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.7450766644222973, + "language_loss": 0.75024855, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77156866, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.702848196029663 + }, + { + "auxiliary_loss_clip": 0.01078923, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.03734159, + "balance_loss_mlp": 1.0214541, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 3.5784901287526236, + "language_loss": 0.70154494, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72267222, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 2.657151699066162 + }, + { + "auxiliary_loss_clip": 0.01097473, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.03918958, + "balance_loss_mlp": 1.02240515, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 1.5498137181863798, + "language_loss": 0.73110735, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75242782, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 2.7240750789642334 + }, + { + "auxiliary_loss_clip": 0.01065311, + "auxiliary_loss_mlp": 0.01036356, + "balance_loss_clip": 1.0319804, + "balance_loss_mlp": 1.02187228, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 2.793145584855255, + "language_loss": 0.69127208, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71228874, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.6061205863952637 + }, + { + "auxiliary_loss_clip": 0.01079559, + "auxiliary_loss_mlp": 0.0103663, + "balance_loss_clip": 1.03494227, + "balance_loss_mlp": 1.02132368, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.642307398351742, + "language_loss": 0.90735042, + "learning_rate": 2.686946929177557e-06, + "loss": 0.92851233, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 2.7127630710601807 + }, + { + "auxiliary_loss_clip": 0.0109803, + "auxiliary_loss_mlp": 0.01041363, + "balance_loss_clip": 1.03599989, + "balance_loss_mlp": 1.02718925, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.321027770222512, + "language_loss": 0.78773403, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80912793, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 2.570995330810547 + }, + { + "auxiliary_loss_clip": 0.01113466, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.03658295, + "balance_loss_mlp": 1.02335238, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.9807969254538609, + "language_loss": 0.76502603, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78652972, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.5615570545196533 + }, + { + "auxiliary_loss_clip": 0.0110414, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.04013634, + "balance_loss_mlp": 1.02020514, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 2.0284483167825225, + "language_loss": 0.77315545, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79452348, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 2.6711018085479736 + }, + { + "auxiliary_loss_clip": 0.01113107, + "auxiliary_loss_mlp": 0.01032252, + "balance_loss_clip": 1.03880632, + "balance_loss_mlp": 1.01977706, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.7887557695296206, + "language_loss": 0.87070817, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89216179, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.5542798042297363 + }, + { + "auxiliary_loss_clip": 0.01080964, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_clip": 1.03666544, + "balance_loss_mlp": 1.02961659, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 2.1068647249649217, + "language_loss": 0.80719233, + "learning_rate": 2.685117765051156e-06, + "loss": 0.82842928, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 2.5938096046447754 + }, + { + "auxiliary_loss_clip": 0.0111621, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.03942645, + "balance_loss_mlp": 1.01468825, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6224083503471347, + "language_loss": 0.80510175, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82654709, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 2.6404519081115723 + }, + { + "auxiliary_loss_clip": 0.01075485, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.03522253, + "balance_loss_mlp": 1.02310956, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.3953609796174005, + "language_loss": 0.76124376, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78236026, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 2.785804510116577 + }, + { + "auxiliary_loss_clip": 0.01090019, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.03580856, + "balance_loss_mlp": 1.02413964, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.6612030842314782, + "language_loss": 0.8130151, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83429146, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.596906900405884 + }, + { + "auxiliary_loss_clip": 0.01015886, + "auxiliary_loss_mlp": 0.01015363, + "balance_loss_clip": 1.01240385, + "balance_loss_mlp": 1.01359892, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8369600579962202, + "language_loss": 0.64400244, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66431493, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 3.0675370693206787 + }, + { + "auxiliary_loss_clip": 0.01070292, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.03701603, + "balance_loss_mlp": 1.01928055, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 1.9628053009819464, + "language_loss": 0.72320688, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74423194, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 2.685408353805542 + }, + { + "auxiliary_loss_clip": 0.01080781, + "auxiliary_loss_mlp": 0.00749737, + "balance_loss_clip": 1.03783834, + "balance_loss_mlp": 1.00051212, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.828555727255372, + "language_loss": 0.77838856, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.7966938, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 2.6450698375701904 + }, + { + "auxiliary_loss_clip": 0.01105859, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.03926396, + "balance_loss_mlp": 1.02486181, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.1636894088432177, + "language_loss": 0.79334152, + "learning_rate": 2.682555844513981e-06, + "loss": 0.8147881, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 2.564471483230591 + }, + { + "auxiliary_loss_clip": 0.01032671, + "auxiliary_loss_mlp": 0.01000292, + "balance_loss_clip": 1.00935698, + "balance_loss_mlp": 0.99882585, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6789728069598991, + "language_loss": 0.53152788, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55185747, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.1170361042022705 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.00749688, + "balance_loss_clip": 1.03981137, + "balance_loss_mlp": 1.00071955, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.139443878621471, + "language_loss": 0.82641339, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84505248, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.5815749168395996 + }, + { + "auxiliary_loss_clip": 0.01101856, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.03724754, + "balance_loss_mlp": 1.02133226, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.8796511055397382, + "language_loss": 0.7621429, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78351456, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.689072370529175 + }, + { + "auxiliary_loss_clip": 0.01097364, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.0385077, + "balance_loss_mlp": 1.0179497, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 1.915205888227084, + "language_loss": 0.66110754, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68238091, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.5736794471740723 + }, + { + "auxiliary_loss_clip": 0.01084211, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.03317952, + "balance_loss_mlp": 1.02134395, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.6196558702472945, + "language_loss": 0.71431267, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73550808, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.7017951011657715 + }, + { + "auxiliary_loss_clip": 0.01104794, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.03904653, + "balance_loss_mlp": 1.01967108, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 2.3418227713487223, + "language_loss": 0.82058489, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84195513, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 2.5428900718688965 + }, + { + "auxiliary_loss_clip": 0.0109668, + "auxiliary_loss_mlp": 0.0103601, + "balance_loss_clip": 1.03785968, + "balance_loss_mlp": 1.02288508, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.6776739006913621, + "language_loss": 0.80838692, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82971382, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 4.125620365142822 + }, + { + "auxiliary_loss_clip": 0.01089326, + "auxiliary_loss_mlp": 0.01039963, + "balance_loss_clip": 1.0411725, + "balance_loss_mlp": 1.02489471, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.777806229155463, + "language_loss": 0.65829206, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67958492, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.6515791416168213 + }, + { + "auxiliary_loss_clip": 0.0109651, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.03795123, + "balance_loss_mlp": 1.0197345, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 2.1554736126464817, + "language_loss": 0.79881477, + "learning_rate": 2.679260083800989e-06, + "loss": 0.82010484, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.614738702774048 + }, + { + "auxiliary_loss_clip": 0.01113512, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.03944349, + "balance_loss_mlp": 1.02063, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.622144306602099, + "language_loss": 0.8173992, + "learning_rate": 2.678893759192982e-06, + "loss": 0.83886302, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 2.6010286808013916 + }, + { + "auxiliary_loss_clip": 0.01098792, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.03798485, + "balance_loss_mlp": 1.01939964, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.9861851543757523, + "language_loss": 0.67271757, + "learning_rate": 2.678527408841255e-06, + "loss": 0.69402742, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 2.5429329872131348 + }, + { + "auxiliary_loss_clip": 0.01081026, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.03363931, + "balance_loss_mlp": 1.02603984, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 1.8916943082347542, + "language_loss": 0.65999603, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68121189, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.816678047180176 + }, + { + "auxiliary_loss_clip": 0.01059818, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.03170276, + "balance_loss_mlp": 1.0198009, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 2.9579607061089774, + "language_loss": 0.60342681, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62435532, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 4.143125295639038 + }, + { + "auxiliary_loss_clip": 0.01092716, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.03804803, + "balance_loss_mlp": 1.02162337, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 3.2209893963573077, + "language_loss": 0.69790554, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71919036, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.4817111492156982 + }, + { + "auxiliary_loss_clip": 0.01024019, + "auxiliary_loss_mlp": 0.0100502, + "balance_loss_clip": 1.01054001, + "balance_loss_mlp": 1.00337458, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7461208956249058, + "language_loss": 0.59658986, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61688024, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 3.1661651134490967 + }, + { + "auxiliary_loss_clip": 0.01117799, + "auxiliary_loss_mlp": 0.01041489, + "balance_loss_clip": 1.04122353, + "balance_loss_mlp": 1.02794683, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 5.299928530351193, + "language_loss": 0.80296898, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82456183, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 2.5635855197906494 + }, + { + "auxiliary_loss_clip": 0.0110606, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.0403744, + "balance_loss_mlp": 1.02047729, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.673451784640278, + "language_loss": 0.84916878, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87057418, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 2.6401171684265137 + }, + { + "auxiliary_loss_clip": 0.01082623, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.03875244, + "balance_loss_mlp": 1.02014899, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.624727867464355, + "language_loss": 0.79780293, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.81896138, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.7509045600891113 + }, + { + "auxiliary_loss_clip": 0.01103443, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.0387547, + "balance_loss_mlp": 1.02147925, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 3.278308785264557, + "language_loss": 0.70276594, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7241599, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 4.214451789855957 + }, + { + "auxiliary_loss_clip": 0.01093297, + "auxiliary_loss_mlp": 0.00750079, + "balance_loss_clip": 1.03531289, + "balance_loss_mlp": 1.00051117, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.765549596101173, + "language_loss": 0.77867955, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.7971133, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 2.5883498191833496 + }, + { + "auxiliary_loss_clip": 0.01100646, + "auxiliary_loss_mlp": 0.01044001, + "balance_loss_clip": 1.03532195, + "balance_loss_mlp": 1.03166878, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 1.8538682450285047, + "language_loss": 0.85395652, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87540305, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 4.230974435806274 + }, + { + "auxiliary_loss_clip": 0.01108493, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.03697634, + "balance_loss_mlp": 1.02192569, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.7319002852304293, + "language_loss": 0.84371066, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86513317, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 2.648008346557617 + }, + { + "auxiliary_loss_clip": 0.01067078, + "auxiliary_loss_mlp": 0.01049604, + "balance_loss_clip": 1.03487098, + "balance_loss_mlp": 1.03419018, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.4590471419333864, + "language_loss": 0.83211207, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85327888, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 2.681631088256836 + }, + { + "auxiliary_loss_clip": 0.01100916, + "auxiliary_loss_mlp": 0.01038254, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.02524817, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.882168301979147, + "language_loss": 0.74368811, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76507986, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 2.6527938842773438 + }, + { + "auxiliary_loss_clip": 0.01104859, + "auxiliary_loss_mlp": 0.0103539, + "balance_loss_clip": 1.03773665, + "balance_loss_mlp": 1.02157974, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 2.4654722455904867, + "language_loss": 0.80151677, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82291925, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 2.5658366680145264 + }, + { + "auxiliary_loss_clip": 0.01100891, + "auxiliary_loss_mlp": 0.01037511, + "balance_loss_clip": 1.03977203, + "balance_loss_mlp": 1.02282405, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 4.854473669925819, + "language_loss": 0.76292348, + "learning_rate": 2.673029073767934e-06, + "loss": 0.78430754, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.53977108001709 + }, + { + "auxiliary_loss_clip": 0.010496, + "auxiliary_loss_mlp": 0.00749751, + "balance_loss_clip": 1.03470755, + "balance_loss_mlp": 1.00037432, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 2.4221049387143108, + "language_loss": 0.78436267, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80235612, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 2.7033092975616455 + }, + { + "auxiliary_loss_clip": 0.01116613, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.03822768, + "balance_loss_mlp": 1.0264982, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.87666272020397, + "language_loss": 0.75169039, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77324593, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.5680084228515625 + }, + { + "auxiliary_loss_clip": 0.01066162, + "auxiliary_loss_mlp": 0.01044165, + "balance_loss_clip": 1.03597307, + "balance_loss_mlp": 1.03111768, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.7552060130711378, + "language_loss": 0.79404134, + "learning_rate": 2.671928716175804e-06, + "loss": 0.8151446, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 2.705620765686035 + }, + { + "auxiliary_loss_clip": 0.01106871, + "auxiliary_loss_mlp": 0.01032766, + "balance_loss_clip": 1.04008126, + "balance_loss_mlp": 1.01908648, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 3.5196887085858193, + "language_loss": 0.7207588, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74215519, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.5708377361297607 + }, + { + "auxiliary_loss_clip": 0.01011238, + "auxiliary_loss_mlp": 0.00999177, + "balance_loss_clip": 1.02171838, + "balance_loss_mlp": 0.99781245, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8475439337410687, + "language_loss": 0.58819008, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60829425, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 3.255715847015381 + }, + { + "auxiliary_loss_clip": 0.01095422, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_clip": 1.03944993, + "balance_loss_mlp": 1.02778697, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.80804063222078, + "language_loss": 0.54645944, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56781727, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 2.6222667694091797 + }, + { + "auxiliary_loss_clip": 0.01088237, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.03668857, + "balance_loss_mlp": 1.01702023, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.923342072531122, + "language_loss": 0.8327269, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85390586, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 2.6471292972564697 + }, + { + "auxiliary_loss_clip": 0.01085943, + "auxiliary_loss_mlp": 0.01043263, + "balance_loss_clip": 1.03778672, + "balance_loss_mlp": 1.02783704, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.075223834662927, + "language_loss": 0.77318823, + "learning_rate": 2.670094277448999e-06, + "loss": 0.7944802, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.6787145137786865 + }, + { + "auxiliary_loss_clip": 0.01115061, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.0393641, + "balance_loss_mlp": 1.02042115, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.774780051512858, + "language_loss": 0.69845158, + "learning_rate": 2.669727313417857e-06, + "loss": 0.71994734, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.465182304382324 + }, + { + "auxiliary_loss_clip": 0.01112147, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.03742516, + "balance_loss_mlp": 1.02395558, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 8.345118638757356, + "language_loss": 0.66243023, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68393266, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 3.0568931102752686 + }, + { + "auxiliary_loss_clip": 0.01093115, + "auxiliary_loss_mlp": 0.00749818, + "balance_loss_clip": 1.03660643, + "balance_loss_mlp": 1.00047171, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.9589057219180985, + "language_loss": 0.74033141, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.75876069, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.6877682209014893 + }, + { + "auxiliary_loss_clip": 0.01066811, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.03577805, + "balance_loss_mlp": 1.01862049, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.480043403024934, + "language_loss": 0.66420424, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68519813, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 2.762693166732788 + }, + { + "auxiliary_loss_clip": 0.01100659, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.03940248, + "balance_loss_mlp": 1.02467203, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.732765227892976, + "language_loss": 0.76794815, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78933144, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 2.6561496257781982 + }, + { + "auxiliary_loss_clip": 0.01098251, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.04124331, + "balance_loss_mlp": 1.02277732, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.262679733461533, + "language_loss": 0.82008028, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.84142464, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.6397297382354736 + }, + { + "auxiliary_loss_clip": 0.01090546, + "auxiliary_loss_mlp": 0.01035957, + "balance_loss_clip": 1.03558731, + "balance_loss_mlp": 1.02119303, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.7986964823117289, + "language_loss": 0.79863793, + "learning_rate": 2.667524996399444e-06, + "loss": 0.81990296, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.6619999408721924 + }, + { + "auxiliary_loss_clip": 0.01090784, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.0427053, + "balance_loss_mlp": 1.02031958, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.6550544165793337, + "language_loss": 0.66078275, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68202394, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.795602798461914 + }, + { + "auxiliary_loss_clip": 0.01091453, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.03796577, + "balance_loss_mlp": 1.02884197, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.6556740705984503, + "language_loss": 0.8538022, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87516642, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.6963093280792236 + }, + { + "auxiliary_loss_clip": 0.01101803, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.0383966, + "balance_loss_mlp": 1.01638031, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.6526799714917175, + "language_loss": 0.7093758, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73068869, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.6558098793029785 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.03904641, + "balance_loss_mlp": 1.02039361, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.7450890572195148, + "language_loss": 0.74219894, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.7635355, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.7366716861724854 + }, + { + "auxiliary_loss_clip": 0.01093479, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.03813875, + "balance_loss_mlp": 1.02109087, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 2.5675604999900017, + "language_loss": 0.76166427, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.78294539, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 2.669931650161743 + }, + { + "auxiliary_loss_clip": 0.01070243, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.0388782, + "balance_loss_mlp": 1.02199078, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 4.341296107625725, + "language_loss": 0.73256189, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75363648, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 4.39052939414978 + }, + { + "auxiliary_loss_clip": 0.01086671, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.03763366, + "balance_loss_mlp": 1.0193038, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 1.9893452863599224, + "language_loss": 0.71921426, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.74041653, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.8763513565063477 + }, + { + "auxiliary_loss_clip": 0.01071967, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.03453112, + "balance_loss_mlp": 1.02493048, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.8691964618462225, + "language_loss": 0.84663606, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86773491, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.6901283264160156 + }, + { + "auxiliary_loss_clip": 0.0108973, + "auxiliary_loss_mlp": 0.00749802, + "balance_loss_clip": 1.03812373, + "balance_loss_mlp": 1.00054455, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 3.7892274548701343, + "language_loss": 0.66477114, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68316644, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.707488775253296 + }, + { + "auxiliary_loss_clip": 0.01088711, + "auxiliary_loss_mlp": 0.01028187, + "balance_loss_clip": 1.03545904, + "balance_loss_mlp": 1.01548564, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.6996127557664467, + "language_loss": 0.71909183, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74026084, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 2.7064387798309326 + }, + { + "auxiliary_loss_clip": 0.01084998, + "auxiliary_loss_mlp": 0.0103954, + "balance_loss_clip": 1.03762949, + "balance_loss_mlp": 1.02431703, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 3.1971725369096204, + "language_loss": 0.82980937, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85105479, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.67453932762146 + }, + { + "auxiliary_loss_clip": 0.01099837, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.03697896, + "balance_loss_mlp": 1.02000427, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.4862993485113918, + "language_loss": 0.90032643, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92165542, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.6389496326446533 + }, + { + "auxiliary_loss_clip": 0.0107169, + "auxiliary_loss_mlp": 0.01027819, + "balance_loss_clip": 1.0361414, + "balance_loss_mlp": 1.01419377, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 2.5389552556289043, + "language_loss": 0.65276527, + "learning_rate": 2.662750187431268e-06, + "loss": 0.67376029, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 4.426306962966919 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.03858089, + "balance_loss_mlp": 1.01898861, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.927619892426994, + "language_loss": 0.69131792, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71275747, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 2.733983039855957 + }, + { + "auxiliary_loss_clip": 0.01062299, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.03416169, + "balance_loss_mlp": 1.0228498, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.537057595099799, + "language_loss": 0.73877752, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75974774, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 2.7624130249023438 + }, + { + "auxiliary_loss_clip": 0.01056105, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.03147697, + "balance_loss_mlp": 1.0211525, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.807854491906855, + "language_loss": 0.72667885, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74760437, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.676400661468506 + }, + { + "auxiliary_loss_clip": 0.0110512, + "auxiliary_loss_mlp": 0.01044593, + "balance_loss_clip": 1.0377661, + "balance_loss_mlp": 1.03094339, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 2.3341934406727662, + "language_loss": 0.71579188, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73728901, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 2.6401875019073486 + }, + { + "auxiliary_loss_clip": 0.01103083, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.03777695, + "balance_loss_mlp": 1.02084684, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 2.9902792076883795, + "language_loss": 0.87089634, + "learning_rate": 2.660912589851978e-06, + "loss": 0.8922801, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 2.5616097450256348 + }, + { + "auxiliary_loss_clip": 0.01100789, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.03858757, + "balance_loss_mlp": 1.02390981, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 2.2045161659898955, + "language_loss": 0.69367456, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71505761, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 4.111158132553101 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01037944, + "balance_loss_clip": 1.03994584, + "balance_loss_mlp": 1.02359128, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.793047485459434, + "language_loss": 0.75356722, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77511114, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 4.046549081802368 + }, + { + "auxiliary_loss_clip": 0.01077083, + "auxiliary_loss_mlp": 0.01035186, + "balance_loss_clip": 1.03859377, + "balance_loss_mlp": 1.02131581, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.930937539497649, + "language_loss": 0.82387567, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84499836, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.6170742511749268 + }, + { + "auxiliary_loss_clip": 0.01110819, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.0363158, + "balance_loss_mlp": 1.0210197, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.971094035710263, + "language_loss": 0.79950809, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82095397, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 2.543510913848877 + }, + { + "auxiliary_loss_clip": 0.01097484, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.03616595, + "balance_loss_mlp": 1.0207262, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.8326003094116627, + "language_loss": 0.67553604, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.69684875, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 2.5832483768463135 + }, + { + "auxiliary_loss_clip": 0.01022464, + "auxiliary_loss_mlp": 0.01007259, + "balance_loss_clip": 1.0092206, + "balance_loss_mlp": 1.00587034, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7649428499577391, + "language_loss": 0.59676135, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61705863, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.196387767791748 + }, + { + "auxiliary_loss_clip": 0.01096245, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.03807604, + "balance_loss_mlp": 1.02125216, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.0908872412705923, + "language_loss": 0.69773132, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.71902984, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 2.5987930297851562 + }, + { + "auxiliary_loss_clip": 0.01001595, + "auxiliary_loss_mlp": 0.00999923, + "balance_loss_clip": 1.01164055, + "balance_loss_mlp": 0.9985283, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7234985336321414, + "language_loss": 0.53637934, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55639458, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 3.236281394958496 + }, + { + "auxiliary_loss_clip": 0.01099004, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.03762865, + "balance_loss_mlp": 1.02198768, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 4.029539560376741, + "language_loss": 0.66259885, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68393683, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 2.567423105239868 + }, + { + "auxiliary_loss_clip": 0.0111162, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.04016638, + "balance_loss_mlp": 1.01974738, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.8679362670938584, + "language_loss": 0.70301843, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72445995, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 2.664273738861084 + }, + { + "auxiliary_loss_clip": 0.01085983, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.03580785, + "balance_loss_mlp": 1.02285933, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.477828260795086, + "language_loss": 0.64555907, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.666785, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 2.676532506942749 + }, + { + "auxiliary_loss_clip": 0.01085875, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.03601003, + "balance_loss_mlp": 1.02337372, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.4469869675181377, + "language_loss": 0.70331967, + "learning_rate": 2.656499802669069e-06, + "loss": 0.7245481, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 2.6798670291900635 + }, + { + "auxiliary_loss_clip": 0.01010478, + "auxiliary_loss_mlp": 0.0074667, + "balance_loss_clip": 1.00777185, + "balance_loss_mlp": 0.99970067, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.887764477875155, + "language_loss": 0.56234533, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.57991678, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.2269372940063477 + }, + { + "auxiliary_loss_clip": 0.01091509, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.0392313, + "balance_loss_mlp": 1.02166796, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.7406305483901776, + "language_loss": 0.76171541, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.7829839, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 2.702148914337158 + }, + { + "auxiliary_loss_clip": 0.01062669, + "auxiliary_loss_mlp": 0.01035198, + "balance_loss_clip": 1.03538549, + "balance_loss_mlp": 1.02249062, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.5924257523310092, + "language_loss": 0.6791423, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70012093, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 2.817270278930664 + }, + { + "auxiliary_loss_clip": 0.01077475, + "auxiliary_loss_mlp": 0.01038638, + "balance_loss_clip": 1.03831792, + "balance_loss_mlp": 1.02327204, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.282094132105764, + "language_loss": 0.79162681, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81278795, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 2.5980730056762695 + }, + { + "auxiliary_loss_clip": 0.01118433, + "auxiliary_loss_mlp": 0.01036147, + "balance_loss_clip": 1.04118538, + "balance_loss_mlp": 1.02138913, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 1.9745962262250891, + "language_loss": 0.77549601, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79704177, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 2.63364315032959 + }, + { + "auxiliary_loss_clip": 0.01106256, + "auxiliary_loss_mlp": 0.01039005, + "balance_loss_clip": 1.03945184, + "balance_loss_mlp": 1.02388382, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.584908683609731, + "language_loss": 0.65837312, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.67982578, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.7214202880859375 + }, + { + "auxiliary_loss_clip": 0.01082553, + "auxiliary_loss_mlp": 0.01039918, + "balance_loss_clip": 1.03632832, + "balance_loss_mlp": 1.02558863, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.9035730441070062, + "language_loss": 0.83389962, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85512435, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.6198785305023193 + }, + { + "auxiliary_loss_clip": 0.01099012, + "auxiliary_loss_mlp": 0.01041133, + "balance_loss_clip": 1.039343, + "balance_loss_mlp": 1.02803814, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.945450422288914, + "language_loss": 0.79250801, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81390941, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.5946130752563477 + }, + { + "auxiliary_loss_clip": 0.01079448, + "auxiliary_loss_mlp": 0.01037869, + "balance_loss_clip": 1.038234, + "balance_loss_mlp": 1.02380204, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 2.25968463579849, + "language_loss": 0.79375416, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81492734, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.5940661430358887 + }, + { + "auxiliary_loss_clip": 0.01102699, + "auxiliary_loss_mlp": 0.00749825, + "balance_loss_clip": 1.03641582, + "balance_loss_mlp": 1.00038791, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 6.94396540910109, + "language_loss": 0.7017349, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72026008, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.5484561920166016 + }, + { + "auxiliary_loss_clip": 0.01100702, + "auxiliary_loss_mlp": 0.01040865, + "balance_loss_clip": 1.03765154, + "balance_loss_mlp": 1.0269649, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.516790047555098, + "language_loss": 0.59309673, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61451238, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.7849411964416504 + }, + { + "auxiliary_loss_clip": 0.01114568, + "auxiliary_loss_mlp": 0.01040055, + "balance_loss_clip": 1.03776896, + "balance_loss_mlp": 1.02628636, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 3.872856046408044, + "language_loss": 0.73047209, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75201833, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.5865657329559326 + }, + { + "auxiliary_loss_clip": 0.01038185, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.03824985, + "balance_loss_mlp": 1.02120256, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.6341277922790294, + "language_loss": 0.74167001, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76240009, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.7214558124542236 + }, + { + "auxiliary_loss_clip": 0.01089253, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.03644478, + "balance_loss_mlp": 1.02100849, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 1.9801770362966131, + "language_loss": 0.79125869, + "learning_rate": 2.651347021844765e-06, + "loss": 0.8124913, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 2.6120383739471436 + }, + { + "auxiliary_loss_clip": 0.01085281, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.0376699, + "balance_loss_mlp": 1.02402139, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.707131163984949, + "language_loss": 0.75997508, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78120595, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.6606285572052 + }, + { + "auxiliary_loss_clip": 0.01021123, + "auxiliary_loss_mlp": 0.01005817, + "balance_loss_clip": 1.00823224, + "balance_loss_mlp": 1.0041008, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7029961292776159, + "language_loss": 0.52726263, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54753202, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 4.610498428344727 + }, + { + "auxiliary_loss_clip": 0.01117419, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.03888845, + "balance_loss_mlp": 1.0221014, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 4.393189165489184, + "language_loss": 0.72736651, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74890304, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.5811026096343994 + }, + { + "auxiliary_loss_clip": 0.01031472, + "auxiliary_loss_mlp": 0.01004263, + "balance_loss_clip": 1.00818384, + "balance_loss_mlp": 1.00268936, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9448387854663717, + "language_loss": 0.66615641, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68651366, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 3.015599250793457 + }, + { + "auxiliary_loss_clip": 0.01110822, + "auxiliary_loss_mlp": 0.01038165, + "balance_loss_clip": 1.03595185, + "balance_loss_mlp": 1.02480769, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 2.3592973259139205, + "language_loss": 0.81745648, + "learning_rate": 2.649505567780375e-06, + "loss": 0.83894634, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.521604299545288 + }, + { + "auxiliary_loss_clip": 0.01094281, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.03870368, + "balance_loss_mlp": 1.02047706, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.125046800471856, + "language_loss": 0.77571911, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.79700625, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.620511054992676 + }, + { + "auxiliary_loss_clip": 0.01019637, + "auxiliary_loss_mlp": 0.01005507, + "balance_loss_clip": 1.00693727, + "balance_loss_mlp": 1.00399303, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8408940920316522, + "language_loss": 0.57832682, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59857833, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 2.845503568649292 + }, + { + "auxiliary_loss_clip": 0.01099361, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.03763723, + "balance_loss_mlp": 1.02015054, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.7678216176523633, + "language_loss": 0.74882746, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77016014, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 2.5975584983825684 + }, + { + "auxiliary_loss_clip": 0.01083042, + "auxiliary_loss_mlp": 0.01040108, + "balance_loss_clip": 1.03803682, + "balance_loss_mlp": 1.02556467, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.6528027551049314, + "language_loss": 0.83342493, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85465646, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 4.305932283401489 + }, + { + "auxiliary_loss_clip": 0.01082125, + "auxiliary_loss_mlp": 0.01039428, + "balance_loss_clip": 1.03811073, + "balance_loss_mlp": 1.02514637, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 1.9322407825726269, + "language_loss": 0.68256617, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.70378166, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.6784801483154297 + }, + { + "auxiliary_loss_clip": 0.01097868, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.0390923, + "balance_loss_mlp": 1.02376091, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.8767171337273227, + "language_loss": 0.75492203, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.77627361, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 2.586655855178833 + }, + { + "auxiliary_loss_clip": 0.01086782, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.03688908, + "balance_loss_mlp": 1.02295423, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.8675995438496922, + "language_loss": 0.83092201, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85216373, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 2.614365339279175 + }, + { + "auxiliary_loss_clip": 0.01078875, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.03461742, + "balance_loss_mlp": 1.02072072, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 1.9601785806215903, + "language_loss": 0.71516073, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73630309, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 2.604952812194824 + }, + { + "auxiliary_loss_clip": 0.01080163, + "auxiliary_loss_mlp": 0.01043202, + "balance_loss_clip": 1.0361439, + "balance_loss_mlp": 1.02996337, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.8415008100210613, + "language_loss": 0.82767117, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84890485, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 2.6540677547454834 + }, + { + "auxiliary_loss_clip": 0.01102306, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.03837395, + "balance_loss_mlp": 1.02303386, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.5963377938034666, + "language_loss": 0.65286112, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.67426753, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 4.123167276382446 + }, + { + "auxiliary_loss_clip": 0.0110161, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.03854227, + "balance_loss_mlp": 1.01647627, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 1.9144905770169205, + "language_loss": 0.76293033, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78425187, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 4.201188087463379 + }, + { + "auxiliary_loss_clip": 0.01103905, + "auxiliary_loss_mlp": 0.00749781, + "balance_loss_clip": 1.03992462, + "balance_loss_mlp": 1.0003624, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.8714251358003473, + "language_loss": 0.80041289, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.81894982, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.601008415222168 + }, + { + "auxiliary_loss_clip": 0.01113976, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.03937602, + "balance_loss_mlp": 1.02041721, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.9518662713714834, + "language_loss": 0.84675694, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.8682422, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 2.570141077041626 + }, + { + "auxiliary_loss_clip": 0.01095798, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.03893924, + "balance_loss_mlp": 1.01650214, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.7326455784145571, + "language_loss": 0.70756745, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72883415, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.6095056533813477 + }, + { + "auxiliary_loss_clip": 0.011144, + "auxiliary_loss_mlp": 0.01040018, + "balance_loss_clip": 1.04113901, + "balance_loss_mlp": 1.02739382, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 2.6170253246182167, + "language_loss": 0.81351888, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.8350631, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.505204439163208 + }, + { + "auxiliary_loss_clip": 0.01090784, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.03725064, + "balance_loss_mlp": 1.0303266, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 2.0981886705236317, + "language_loss": 0.69174272, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71311903, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 2.576266050338745 + }, + { + "auxiliary_loss_clip": 0.01100653, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.03686345, + "balance_loss_mlp": 1.02660656, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.9603956117498014, + "language_loss": 0.75631553, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77771986, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.565410852432251 + }, + { + "auxiliary_loss_clip": 0.01075846, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.03541338, + "balance_loss_mlp": 1.02728176, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.397196432224652, + "language_loss": 0.7595287, + "learning_rate": 2.642871247413523e-06, + "loss": 0.78069776, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 2.704066753387451 + }, + { + "auxiliary_loss_clip": 0.01118628, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.04120445, + "balance_loss_mlp": 1.02608812, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.7472699952512094, + "language_loss": 0.69772196, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71930897, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 2.602227210998535 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.0074992, + "balance_loss_clip": 1.04068232, + "balance_loss_mlp": 1.0003984, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.6325839652306229, + "language_loss": 0.75269967, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77137685, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.5552544593811035 + }, + { + "auxiliary_loss_clip": 0.0110347, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.03766441, + "balance_loss_mlp": 1.0172534, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 1.9185051090660399, + "language_loss": 0.69948721, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72083634, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 2.745706796646118 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01039598, + "balance_loss_clip": 1.03700781, + "balance_loss_mlp": 1.02551937, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 7.439300237990421, + "language_loss": 0.76272714, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78424776, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 2.557013750076294 + }, + { + "auxiliary_loss_clip": 0.01084587, + "auxiliary_loss_mlp": 0.00749751, + "balance_loss_clip": 1.04348087, + "balance_loss_mlp": 1.00041914, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.7187813829331264, + "language_loss": 0.80283767, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82118106, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.7544498443603516 + }, + { + "auxiliary_loss_clip": 0.01113307, + "auxiliary_loss_mlp": 0.01041642, + "balance_loss_clip": 1.03964221, + "balance_loss_mlp": 1.02700949, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.7715890204567954, + "language_loss": 0.74122858, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.7627781, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 2.6072285175323486 + }, + { + "auxiliary_loss_clip": 0.01077044, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.04060006, + "balance_loss_mlp": 1.02359307, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.939169520805761, + "language_loss": 0.8411572, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86232352, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.780806303024292 + }, + { + "auxiliary_loss_clip": 0.01062228, + "auxiliary_loss_mlp": 0.00750177, + "balance_loss_clip": 1.0332067, + "balance_loss_mlp": 1.00043643, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.71474684899288, + "language_loss": 0.70353937, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72166342, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.77489972114563 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01033357, + "balance_loss_clip": 1.0395267, + "balance_loss_mlp": 1.01952267, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.4672131317104022, + "language_loss": 0.72665203, + "learning_rate": 2.639551120239279e-06, + "loss": 0.74813235, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 2.5928637981414795 + }, + { + "auxiliary_loss_clip": 0.01106303, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.03886104, + "balance_loss_mlp": 1.02069318, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 2.8847732188582142, + "language_loss": 0.62489712, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64630961, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.5580179691314697 + }, + { + "auxiliary_loss_clip": 0.01068049, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.03644168, + "balance_loss_mlp": 1.02754259, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.48760437354997, + "language_loss": 0.70379263, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72490054, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.6808128356933594 + }, + { + "auxiliary_loss_clip": 0.01115503, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.03823781, + "balance_loss_mlp": 1.02514744, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.8248169381935733, + "language_loss": 0.7304756, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75203687, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.5668394565582275 + }, + { + "auxiliary_loss_clip": 0.01104336, + "auxiliary_loss_mlp": 0.01043372, + "balance_loss_clip": 1.04230273, + "balance_loss_mlp": 1.02939475, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 1.9197620126321133, + "language_loss": 0.84363532, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86511242, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.6359140872955322 + }, + { + "auxiliary_loss_clip": 0.01069725, + "auxiliary_loss_mlp": 0.01034293, + "balance_loss_clip": 1.03599811, + "balance_loss_mlp": 1.01957679, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 2.576126092116869, + "language_loss": 0.74749577, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76853597, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.6492552757263184 + }, + { + "auxiliary_loss_clip": 0.01081034, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.03618371, + "balance_loss_mlp": 1.02047348, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.7743865866830213, + "language_loss": 0.7571339, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77830392, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.7704248428344727 + }, + { + "auxiliary_loss_clip": 0.01101318, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.03874135, + "balance_loss_mlp": 1.02344799, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.554641709046388, + "language_loss": 0.79401398, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.81541371, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 2.569448709487915 + }, + { + "auxiliary_loss_clip": 0.01072904, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.03547859, + "balance_loss_mlp": 1.02513433, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 7.917714116799373, + "language_loss": 0.70053148, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.72166842, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.596653699874878 + }, + { + "auxiliary_loss_clip": 0.01089717, + "auxiliary_loss_mlp": 0.00749755, + "balance_loss_clip": 1.04185176, + "balance_loss_mlp": 1.00044584, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 2.0947200629842446, + "language_loss": 0.83421004, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85260481, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 4.26473593711853 + }, + { + "auxiliary_loss_clip": 0.01119093, + "auxiliary_loss_mlp": 0.01038284, + "balance_loss_clip": 1.03933513, + "balance_loss_mlp": 1.02188051, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 2.0719253680393876, + "language_loss": 0.67777407, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.69934785, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.652583360671997 + }, + { + "auxiliary_loss_clip": 0.0111792, + "auxiliary_loss_mlp": 0.0074998, + "balance_loss_clip": 1.03976536, + "balance_loss_mlp": 1.00035405, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 2.2581227107106585, + "language_loss": 0.77961934, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79829836, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 2.657038927078247 + }, + { + "auxiliary_loss_clip": 0.01117069, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.03946733, + "balance_loss_mlp": 1.01793385, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.5724885410113931, + "language_loss": 0.6870048, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70850366, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.6296989917755127 + }, + { + "auxiliary_loss_clip": 0.01090612, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.03677368, + "balance_loss_mlp": 1.02238691, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 3.0030527617243057, + "language_loss": 0.67613846, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69740373, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.629112482070923 + }, + { + "auxiliary_loss_clip": 0.01086931, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.04117477, + "balance_loss_mlp": 1.02240741, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.8118556352717565, + "language_loss": 0.771644, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79287207, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.687690496444702 + }, + { + "auxiliary_loss_clip": 0.01022721, + "auxiliary_loss_mlp": 0.01024938, + "balance_loss_clip": 1.0189662, + "balance_loss_mlp": 1.02289915, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7774993327001082, + "language_loss": 0.64796686, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66844344, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.1672275066375732 + }, + { + "auxiliary_loss_clip": 0.01082054, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.04027152, + "balance_loss_mlp": 1.02289271, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 5.573749009146938, + "language_loss": 0.87127018, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89245939, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 2.6836493015289307 + }, + { + "auxiliary_loss_clip": 0.01026658, + "auxiliary_loss_mlp": 0.01005073, + "balance_loss_clip": 1.01326799, + "balance_loss_mlp": 1.00357723, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8004822168075717, + "language_loss": 0.62054145, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64085877, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 4.68317985534668 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.04078746, + "balance_loss_mlp": 1.02261472, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 3.258507743308758, + "language_loss": 0.87828994, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.89987797, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 2.5880372524261475 + }, + { + "auxiliary_loss_clip": 0.01105258, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.04312944, + "balance_loss_mlp": 1.02131844, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 2.8629644482282792, + "language_loss": 0.62865704, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65005445, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.6288464069366455 + }, + { + "auxiliary_loss_clip": 0.01087374, + "auxiliary_loss_mlp": 0.00749971, + "balance_loss_clip": 1.03657997, + "balance_loss_mlp": 1.00046015, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.8408279393832656, + "language_loss": 0.75156635, + "learning_rate": 2.632166041703586e-06, + "loss": 0.76993978, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 2.673712968826294 + }, + { + "auxiliary_loss_clip": 0.01060946, + "auxiliary_loss_mlp": 0.01043778, + "balance_loss_clip": 1.03449988, + "balance_loss_mlp": 1.02859104, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.9060443603536688, + "language_loss": 0.87688667, + "learning_rate": 2.631796535141458e-06, + "loss": 0.8979339, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 2.7211313247680664 + }, + { + "auxiliary_loss_clip": 0.01082796, + "auxiliary_loss_mlp": 0.01043974, + "balance_loss_clip": 1.03848517, + "balance_loss_mlp": 1.02916241, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.6993144553764066, + "language_loss": 0.71055973, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73182738, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 4.167760848999023 + }, + { + "auxiliary_loss_clip": 0.01120207, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.04174936, + "balance_loss_mlp": 1.01882875, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.4137949444398816, + "language_loss": 0.71867263, + "learning_rate": 2.631057450157852e-06, + "loss": 0.74021333, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 2.755105972290039 + }, + { + "auxiliary_loss_clip": 0.01090531, + "auxiliary_loss_mlp": 0.01034568, + "balance_loss_clip": 1.03799963, + "balance_loss_mlp": 1.02054906, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.4339506959899777, + "language_loss": 0.81079137, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83204246, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 4.2413880825042725 + }, + { + "auxiliary_loss_clip": 0.01107684, + "auxiliary_loss_mlp": 0.01037175, + "balance_loss_clip": 1.04278398, + "balance_loss_mlp": 1.02171993, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.4396844539811853, + "language_loss": 0.70546269, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72691131, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 2.7641212940216064 + }, + { + "auxiliary_loss_clip": 0.01093737, + "auxiliary_loss_mlp": 0.0103859, + "balance_loss_clip": 1.03943932, + "balance_loss_mlp": 1.02320623, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 2.1770209431186576, + "language_loss": 0.8137638, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83508706, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.636934518814087 + }, + { + "auxiliary_loss_clip": 0.01098002, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_clip": 1.04202402, + "balance_loss_mlp": 1.02413774, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 2.9754396534996332, + "language_loss": 0.65593147, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67730355, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 2.669854164123535 + }, + { + "auxiliary_loss_clip": 0.01092908, + "auxiliary_loss_mlp": 0.01043982, + "balance_loss_clip": 1.03771114, + "balance_loss_mlp": 1.02874684, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.2583692866713556, + "language_loss": 0.80576551, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82713437, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 2.683373212814331 + }, + { + "auxiliary_loss_clip": 0.01097167, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.04208159, + "balance_loss_mlp": 1.01951659, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.6133046045636754, + "language_loss": 0.67581952, + "learning_rate": 2.628839621341247e-06, + "loss": 0.6971209, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 2.78347110748291 + }, + { + "auxiliary_loss_clip": 0.01083059, + "auxiliary_loss_mlp": 0.0105115, + "balance_loss_clip": 1.03792262, + "balance_loss_mlp": 1.03503335, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 2.2483227267335084, + "language_loss": 0.75888991, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78023201, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 2.7121291160583496 + }, + { + "auxiliary_loss_clip": 0.01118549, + "auxiliary_loss_mlp": 0.0103662, + "balance_loss_clip": 1.04049921, + "balance_loss_mlp": 1.02230835, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.7318812164951531, + "language_loss": 0.7286948, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75024652, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.4966928958892822 + }, + { + "auxiliary_loss_clip": 0.01089032, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.03532624, + "balance_loss_mlp": 1.02086115, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 2.025250175403586, + "language_loss": 0.840168, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86140388, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 2.534564733505249 + }, + { + "auxiliary_loss_clip": 0.01087485, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.04081166, + "balance_loss_mlp": 1.02631676, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.6188455887666227, + "language_loss": 0.86716473, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88843101, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 2.6279356479644775 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.01045525, + "balance_loss_clip": 1.03969145, + "balance_loss_mlp": 1.03058183, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 2.3881052380234773, + "language_loss": 0.72329766, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74481505, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 2.5129640102386475 + }, + { + "auxiliary_loss_clip": 0.01089808, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.03820181, + "balance_loss_mlp": 1.02684379, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 2.0890033540370077, + "language_loss": 0.77997917, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80128604, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": 2.5642902851104736 + }, + { + "auxiliary_loss_clip": 0.01115294, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.03941238, + "balance_loss_mlp": 1.0180707, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 1.941135671828736, + "language_loss": 0.70994133, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73141015, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.4506120681762695 + }, + { + "auxiliary_loss_clip": 0.01091262, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.03810382, + "balance_loss_mlp": 1.02284956, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.8419067622697538, + "language_loss": 0.81090838, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83218753, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 2.4794297218322754 + }, + { + "auxiliary_loss_clip": 0.01056075, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.03112876, + "balance_loss_mlp": 1.0241828, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.8560662392179696, + "language_loss": 0.78932083, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81026983, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.6037707328796387 + }, + { + "auxiliary_loss_clip": 0.01091435, + "auxiliary_loss_mlp": 0.00749859, + "balance_loss_clip": 1.03711796, + "balance_loss_mlp": 1.00034094, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 3.359934438042754, + "language_loss": 0.81756002, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83597302, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.7249882221221924 + }, + { + "auxiliary_loss_clip": 0.01115183, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.03718591, + "balance_loss_mlp": 1.01925123, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 1.6836710107841428, + "language_loss": 0.76957381, + "learning_rate": 2.624771374460121e-06, + "loss": 0.7910701, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.506741523742676 + }, + { + "auxiliary_loss_clip": 0.0110638, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.04014659, + "balance_loss_mlp": 1.01933169, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.8398758111413915, + "language_loss": 0.67168784, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69308585, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.5475165843963623 + }, + { + "auxiliary_loss_clip": 0.01090214, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_clip": 1.03965425, + "balance_loss_mlp": 1.02852416, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.7316244360165167, + "language_loss": 0.73520076, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75653094, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.5881824493408203 + }, + { + "auxiliary_loss_clip": 0.01100181, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.03777385, + "balance_loss_mlp": 1.02147007, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.8576871778337642, + "language_loss": 0.73796308, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.75931644, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.5047333240509033 + }, + { + "auxiliary_loss_clip": 0.01090542, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.03758478, + "balance_loss_mlp": 1.02301955, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.4563016777156257, + "language_loss": 0.84226131, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86352849, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 2.6269750595092773 + }, + { + "auxiliary_loss_clip": 0.01091278, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.03945518, + "balance_loss_mlp": 1.02079964, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.2359419862309666, + "language_loss": 0.74363947, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76490569, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.6914784908294678 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01037753, + "balance_loss_clip": 1.0363462, + "balance_loss_mlp": 1.02347183, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.842480686263559, + "language_loss": 0.74921572, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77061498, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.6064155101776123 + }, + { + "auxiliary_loss_clip": 0.01114086, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.03881931, + "balance_loss_mlp": 1.02404356, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 2.8986699635524227, + "language_loss": 0.71310443, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73461527, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 2.5490171909332275 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01040975, + "balance_loss_clip": 1.03927839, + "balance_loss_mlp": 1.02703333, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 2.1528667612044607, + "language_loss": 0.73438859, + "learning_rate": 2.621810847844104e-06, + "loss": 0.75581622, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 2.574183464050293 + }, + { + "auxiliary_loss_clip": 0.01076342, + "auxiliary_loss_mlp": 0.01045471, + "balance_loss_clip": 1.03647578, + "balance_loss_mlp": 1.03053427, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.1696900175599434, + "language_loss": 0.72288173, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74409986, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 4.259183406829834 + }, + { + "auxiliary_loss_clip": 0.01084423, + "auxiliary_loss_mlp": 0.00749971, + "balance_loss_clip": 1.03817487, + "balance_loss_mlp": 1.0003401, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 2.517412306720798, + "language_loss": 0.63939601, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65773994, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.7092254161834717 + }, + { + "auxiliary_loss_clip": 0.01079869, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.03443909, + "balance_loss_mlp": 1.02504218, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.3997249664171105, + "language_loss": 0.69990706, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72109455, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.6695849895477295 + }, + { + "auxiliary_loss_clip": 0.0106562, + "auxiliary_loss_mlp": 0.01055898, + "balance_loss_clip": 1.03018069, + "balance_loss_mlp": 1.03821945, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.7128757216127846, + "language_loss": 0.81158769, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83280289, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.660238265991211 + }, + { + "auxiliary_loss_clip": 0.01100624, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.03856349, + "balance_loss_mlp": 1.0254091, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.3975450733365165, + "language_loss": 0.77339667, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79479289, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.6176633834838867 + }, + { + "auxiliary_loss_clip": 0.0111544, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.03896403, + "balance_loss_mlp": 1.01921856, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 1.5956042640431145, + "language_loss": 0.71649754, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73798561, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.6666972637176514 + }, + { + "auxiliary_loss_clip": 0.01096833, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.03476667, + "balance_loss_mlp": 1.01720738, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.4769184655088716, + "language_loss": 0.77053428, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79180837, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 2.6116533279418945 + }, + { + "auxiliary_loss_clip": 0.01095593, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.04052234, + "balance_loss_mlp": 1.02260566, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.725833575802467, + "language_loss": 0.81571865, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.83704633, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 2.613490343093872 + }, + { + "auxiliary_loss_clip": 0.01084309, + "auxiliary_loss_mlp": 0.00749435, + "balance_loss_clip": 1.03871763, + "balance_loss_mlp": 1.00037861, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.3559905067021873, + "language_loss": 0.76134092, + "learning_rate": 2.618478451956007e-06, + "loss": 0.7796784, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 4.253625392913818 + }, + { + "auxiliary_loss_clip": 0.01068968, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.03644013, + "balance_loss_mlp": 1.01924205, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.8002152382602938, + "language_loss": 0.73463678, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75566685, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.6568074226379395 + }, + { + "auxiliary_loss_clip": 0.01100845, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.04186213, + "balance_loss_mlp": 1.02071345, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 2.3145493241608013, + "language_loss": 0.71609056, + "learning_rate": 2.617737661195593e-06, + "loss": 0.73743737, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 2.54823637008667 + }, + { + "auxiliary_loss_clip": 0.010992, + "auxiliary_loss_mlp": 0.0103664, + "balance_loss_clip": 1.03745198, + "balance_loss_mlp": 1.02180445, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.5810237641581761, + "language_loss": 0.75909686, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78045523, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 2.548962354660034 + }, + { + "auxiliary_loss_clip": 0.01069921, + "auxiliary_loss_mlp": 0.01040209, + "balance_loss_clip": 1.03501844, + "balance_loss_mlp": 1.02529609, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.254590263683234, + "language_loss": 0.84473819, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86583948, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 2.682917833328247 + }, + { + "auxiliary_loss_clip": 0.01103772, + "auxiliary_loss_mlp": 0.01038519, + "balance_loss_clip": 1.03986859, + "balance_loss_mlp": 1.02528071, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.6662767537633434, + "language_loss": 0.83279824, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85422117, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 4.091214895248413 + }, + { + "auxiliary_loss_clip": 0.0107479, + "auxiliary_loss_mlp": 0.01038717, + "balance_loss_clip": 1.036327, + "balance_loss_mlp": 1.02405477, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 1.8962558834341738, + "language_loss": 0.71552444, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73665947, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.611112356185913 + }, + { + "auxiliary_loss_clip": 0.01082108, + "auxiliary_loss_mlp": 0.01042456, + "balance_loss_clip": 1.03785372, + "balance_loss_mlp": 1.02928948, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 1.8620531128318603, + "language_loss": 0.75661433, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77785993, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 4.091585397720337 + }, + { + "auxiliary_loss_clip": 0.01059006, + "auxiliary_loss_mlp": 0.00749759, + "balance_loss_clip": 1.03119624, + "balance_loss_mlp": 1.00043297, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.9919032258285907, + "language_loss": 0.77109945, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78918707, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 2.747201919555664 + }, + { + "auxiliary_loss_clip": 0.0106916, + "auxiliary_loss_mlp": 0.0074972, + "balance_loss_clip": 1.03469777, + "balance_loss_mlp": 1.00033057, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 2.2320170505234804, + "language_loss": 0.77102727, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78921604, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 2.6278491020202637 + }, + { + "auxiliary_loss_clip": 0.01081375, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.03771198, + "balance_loss_mlp": 1.0226562, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 2.5233772233034153, + "language_loss": 0.75550294, + "learning_rate": 2.614773562290835e-06, + "loss": 0.77667201, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.6372530460357666 + }, + { + "auxiliary_loss_clip": 0.01003709, + "auxiliary_loss_mlp": 0.01003538, + "balance_loss_clip": 1.01312256, + "balance_loss_mlp": 1.00220251, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7786213659551632, + "language_loss": 0.54673636, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56680882, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 3.1143345832824707 + }, + { + "auxiliary_loss_clip": 0.01103346, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.04016685, + "balance_loss_mlp": 1.02268291, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.8213793330830357, + "language_loss": 0.85641229, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87780988, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 2.5605034828186035 + }, + { + "auxiliary_loss_clip": 0.0108914, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.03918469, + "balance_loss_mlp": 1.01938736, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.5749338929204815, + "language_loss": 0.70319247, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72441208, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.739440441131592 + }, + { + "auxiliary_loss_clip": 0.01111153, + "auxiliary_loss_mlp": 0.01036583, + "balance_loss_clip": 1.03912592, + "balance_loss_mlp": 1.02343965, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 17.284563932917514, + "language_loss": 0.71260124, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73407859, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 2.6817121505737305 + }, + { + "auxiliary_loss_clip": 0.01060994, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.03482151, + "balance_loss_mlp": 1.02271032, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.5624201724630744, + "language_loss": 0.72115225, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74211001, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 2.653524398803711 + }, + { + "auxiliary_loss_clip": 0.01107403, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.03819108, + "balance_loss_mlp": 1.02266228, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.008343471449221, + "language_loss": 0.70589256, + "learning_rate": 2.612549508603375e-06, + "loss": 0.72733879, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.7007827758789062 + }, + { + "auxiliary_loss_clip": 0.01022183, + "auxiliary_loss_mlp": 0.01000924, + "balance_loss_clip": 1.00815141, + "balance_loss_mlp": 0.99946386, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6656415596723406, + "language_loss": 0.4617784, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48200947, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.109201192855835 + }, + { + "auxiliary_loss_clip": 0.01105481, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.03759193, + "balance_loss_mlp": 1.02544594, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.7548638259151161, + "language_loss": 0.75076485, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77221799, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.6069412231445312 + }, + { + "auxiliary_loss_clip": 0.01091967, + "auxiliary_loss_mlp": 0.01037538, + "balance_loss_clip": 1.03763771, + "balance_loss_mlp": 1.0248301, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 1.6746601504369765, + "language_loss": 0.80508846, + "learning_rate": 2.611437167992705e-06, + "loss": 0.82638347, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.693763017654419 + }, + { + "auxiliary_loss_clip": 0.01101953, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.03894234, + "balance_loss_mlp": 1.02505314, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 2.028098706781727, + "language_loss": 0.82990795, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.85131884, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.6490893363952637 + }, + { + "auxiliary_loss_clip": 0.01088508, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.03847277, + "balance_loss_mlp": 1.02580392, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 2.050820352029375, + "language_loss": 0.75045514, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.77174675, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.6438493728637695 + }, + { + "auxiliary_loss_clip": 0.0107903, + "auxiliary_loss_mlp": 0.01038357, + "balance_loss_clip": 1.03205657, + "balance_loss_mlp": 1.0248152, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.4645982279801677, + "language_loss": 0.72640425, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74757814, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 2.683990716934204 + }, + { + "auxiliary_loss_clip": 0.01084699, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.04120386, + "balance_loss_mlp": 1.02492023, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 1.7383595431762806, + "language_loss": 0.74916673, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77039969, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.609405755996704 + }, + { + "auxiliary_loss_clip": 0.01100271, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.03587961, + "balance_loss_mlp": 1.02527809, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 1.6952432356292415, + "language_loss": 0.72616255, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74755001, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.5378878116607666 + }, + { + "auxiliary_loss_clip": 0.01093223, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.0371995, + "balance_loss_mlp": 1.02271307, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.5305206213431952, + "language_loss": 0.80868542, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.82997948, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.643066167831421 + }, + { + "auxiliary_loss_clip": 0.01077271, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.03269732, + "balance_loss_mlp": 1.02008688, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 1.8938304534157908, + "language_loss": 0.67168272, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.69280231, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 2.6177589893341064 + }, + { + "auxiliary_loss_clip": 0.01104604, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.039482, + "balance_loss_mlp": 1.02393126, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.475330931001082, + "language_loss": 0.80784154, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.82925659, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 2.5711727142333984 + }, + { + "auxiliary_loss_clip": 0.01115662, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.0382061, + "balance_loss_mlp": 1.02278697, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 2.2489751770334445, + "language_loss": 0.82395399, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84548223, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.686680316925049 + }, + { + "auxiliary_loss_clip": 0.01110372, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.03613889, + "balance_loss_mlp": 1.02199209, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 2.1015065608837626, + "language_loss": 0.83771706, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85917175, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 2.5160460472106934 + }, + { + "auxiliary_loss_clip": 0.01115733, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.03880858, + "balance_loss_mlp": 1.02035284, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 4.756452900396812, + "language_loss": 0.79125094, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81274366, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 2.6336703300476074 + }, + { + "auxiliary_loss_clip": 0.01078189, + "auxiliary_loss_mlp": 0.0102831, + "balance_loss_clip": 1.03630435, + "balance_loss_mlp": 1.01556611, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 2.7875088839546827, + "language_loss": 0.84199142, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86305642, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.6868464946746826 + }, + { + "auxiliary_loss_clip": 0.01101096, + "auxiliary_loss_mlp": 0.01037125, + "balance_loss_clip": 1.03777242, + "balance_loss_mlp": 1.02255201, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 6.041819196809342, + "language_loss": 0.56549931, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58688152, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 4.193434476852417 + }, + { + "auxiliary_loss_clip": 0.01102942, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.03938043, + "balance_loss_mlp": 1.02468693, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 3.061740201183425, + "language_loss": 0.8211261, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84253263, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.5223076343536377 + }, + { + "auxiliary_loss_clip": 0.01092547, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.03655958, + "balance_loss_mlp": 1.01791787, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.6861605406510336, + "language_loss": 0.79376256, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81500804, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.495410919189453 + }, + { + "auxiliary_loss_clip": 0.01117786, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.03810573, + "balance_loss_mlp": 1.02083528, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.8202422886724607, + "language_loss": 0.78220367, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80373245, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 2.513197183609009 + }, + { + "auxiliary_loss_clip": 0.01088139, + "auxiliary_loss_mlp": 0.01030073, + "balance_loss_clip": 1.03579509, + "balance_loss_mlp": 1.01817632, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.4830296834388907, + "language_loss": 0.72447497, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74565709, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.6125328540802 + }, + { + "auxiliary_loss_clip": 0.01086571, + "auxiliary_loss_mlp": 0.00750019, + "balance_loss_clip": 1.03639531, + "balance_loss_mlp": 1.00040936, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4554144736264851, + "language_loss": 0.74983299, + "learning_rate": 2.604758755512104e-06, + "loss": 0.76819885, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 2.680499315261841 + }, + { + "auxiliary_loss_clip": 0.01105069, + "auxiliary_loss_mlp": 0.01036335, + "balance_loss_clip": 1.03764212, + "balance_loss_mlp": 1.02193475, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.565921130488263, + "language_loss": 0.74347317, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76488721, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 2.5660736560821533 + }, + { + "auxiliary_loss_clip": 0.01093138, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.03720498, + "balance_loss_mlp": 1.01985633, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 1.9935381661367455, + "language_loss": 0.71051073, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73177797, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 2.6285345554351807 + }, + { + "auxiliary_loss_clip": 0.01020947, + "auxiliary_loss_mlp": 0.00746543, + "balance_loss_clip": 1.01308346, + "balance_loss_mlp": 0.99977952, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8280007884516823, + "language_loss": 0.6046015, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62227631, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 4.554027080535889 + }, + { + "auxiliary_loss_clip": 0.01116436, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.03958273, + "balance_loss_mlp": 1.0232985, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.5966915867174187, + "language_loss": 0.82969987, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85123974, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 2.6017441749572754 + }, + { + "auxiliary_loss_clip": 0.01029997, + "auxiliary_loss_mlp": 0.01003103, + "balance_loss_clip": 1.00621295, + "balance_loss_mlp": 1.0018096, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.838508136076821, + "language_loss": 0.65498197, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.675313, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 3.047762632369995 + }, + { + "auxiliary_loss_clip": 0.01118951, + "auxiliary_loss_mlp": 0.01039035, + "balance_loss_clip": 1.03911734, + "balance_loss_mlp": 1.02327573, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 1.9195444574971963, + "language_loss": 0.82945883, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85103869, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.503544330596924 + }, + { + "auxiliary_loss_clip": 0.01103736, + "auxiliary_loss_mlp": 0.00749763, + "balance_loss_clip": 1.03965712, + "balance_loss_mlp": 1.00040734, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.6493421152061551, + "language_loss": 0.78266591, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80120087, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 2.6175289154052734 + }, + { + "auxiliary_loss_clip": 0.01073176, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.03509271, + "balance_loss_mlp": 1.01586795, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.5098011973725307, + "language_loss": 0.80041432, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82143736, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 4.171665668487549 + }, + { + "auxiliary_loss_clip": 0.01095795, + "auxiliary_loss_mlp": 0.00749766, + "balance_loss_clip": 1.03901124, + "balance_loss_mlp": 1.00043225, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 1.982704744041552, + "language_loss": 0.756634, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77508962, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 2.533259153366089 + }, + { + "auxiliary_loss_clip": 0.01112392, + "auxiliary_loss_mlp": 0.01039748, + "balance_loss_clip": 1.03584957, + "balance_loss_mlp": 1.02630699, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 2.026314994729233, + "language_loss": 0.7546339, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77615535, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 2.494354248046875 + }, + { + "auxiliary_loss_clip": 0.01118232, + "auxiliary_loss_mlp": 0.01040505, + "balance_loss_clip": 1.04080117, + "balance_loss_mlp": 1.02627182, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.6130149214233558, + "language_loss": 0.75761855, + "learning_rate": 2.60067384046869e-06, + "loss": 0.77920592, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 4.082204580307007 + }, + { + "auxiliary_loss_clip": 0.01060882, + "auxiliary_loss_mlp": 0.010413, + "balance_loss_clip": 1.03341639, + "balance_loss_mlp": 1.02652979, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 1.8410463759116333, + "language_loss": 0.63967264, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66069448, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 2.644211530685425 + }, + { + "auxiliary_loss_clip": 0.01072885, + "auxiliary_loss_mlp": 0.01040663, + "balance_loss_clip": 1.03640938, + "balance_loss_mlp": 1.02642941, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.6154145134609084, + "language_loss": 0.76074672, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78188217, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.689768075942993 + }, + { + "auxiliary_loss_clip": 0.01076174, + "auxiliary_loss_mlp": 0.00749706, + "balance_loss_clip": 1.03826821, + "balance_loss_mlp": 1.0004046, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.5178609190854473, + "language_loss": 0.86760187, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88586068, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.593092203140259 + }, + { + "auxiliary_loss_clip": 0.01077268, + "auxiliary_loss_mlp": 0.01039783, + "balance_loss_clip": 1.03728056, + "balance_loss_mlp": 1.02680659, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 1.9039308364756937, + "language_loss": 0.67948866, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.70065916, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 2.7566826343536377 + }, + { + "auxiliary_loss_clip": 0.011164, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.03908324, + "balance_loss_mlp": 1.02359545, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 1.9654226650247513, + "language_loss": 0.77371413, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79525757, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 2.669015407562256 + }, + { + "auxiliary_loss_clip": 0.01112395, + "auxiliary_loss_mlp": 0.01039439, + "balance_loss_clip": 1.039271, + "balance_loss_mlp": 1.02500224, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.85089302881022, + "language_loss": 0.68556911, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70708746, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.563056707382202 + }, + { + "auxiliary_loss_clip": 0.01101482, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.0374198, + "balance_loss_mlp": 1.02307045, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 1.895876058327834, + "language_loss": 0.73127866, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.75265539, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.6145355701446533 + }, + { + "auxiliary_loss_clip": 0.01115209, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.03855503, + "balance_loss_mlp": 1.02128148, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.654744923919612, + "language_loss": 0.71197855, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.73347974, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.5809497833251953 + }, + { + "auxiliary_loss_clip": 0.01088101, + "auxiliary_loss_mlp": 0.0074968, + "balance_loss_clip": 1.03594673, + "balance_loss_mlp": 1.00046611, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.9234719390440842, + "language_loss": 0.82447696, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84285474, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.590696334838867 + }, + { + "auxiliary_loss_clip": 0.01075489, + "auxiliary_loss_mlp": 0.01043643, + "balance_loss_clip": 1.03643084, + "balance_loss_mlp": 1.03010058, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 2.1856400959993048, + "language_loss": 0.71630991, + "learning_rate": 2.596957889196831e-06, + "loss": 0.73750126, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.7230162620544434 + }, + { + "auxiliary_loss_clip": 0.01113863, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.03762484, + "balance_loss_mlp": 1.01726842, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.3651069380519307, + "language_loss": 0.65148205, + "learning_rate": 2.596586169335243e-06, + "loss": 0.67293423, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.5546412467956543 + }, + { + "auxiliary_loss_clip": 0.01074405, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.03458428, + "balance_loss_mlp": 1.02326071, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.6722676309576372, + "language_loss": 0.72586632, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74697733, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.7876017093658447 + }, + { + "auxiliary_loss_clip": 0.010211, + "auxiliary_loss_mlp": 0.01011909, + "balance_loss_clip": 1.00760746, + "balance_loss_mlp": 1.01055002, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.7911855650891312, + "language_loss": 0.54366481, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56399488, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 3.0897796154022217 + }, + { + "auxiliary_loss_clip": 0.01103017, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.03792, + "balance_loss_mlp": 1.02164423, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.6579730475000238, + "language_loss": 0.78582895, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80721462, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.6095588207244873 + }, + { + "auxiliary_loss_clip": 0.01113073, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.03678584, + "balance_loss_mlp": 1.02450037, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 3.600825273936137, + "language_loss": 0.81097555, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83249414, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.518486499786377 + }, + { + "auxiliary_loss_clip": 0.0109869, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.03509402, + "balance_loss_mlp": 1.01852369, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.68881071937041, + "language_loss": 0.77518618, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79649144, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.5943474769592285 + }, + { + "auxiliary_loss_clip": 0.01114266, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.03821981, + "balance_loss_mlp": 1.02175987, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.3341613893176314, + "language_loss": 0.82020688, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84170377, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.4980812072753906 + }, + { + "auxiliary_loss_clip": 0.01073842, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.03495884, + "balance_loss_mlp": 1.02060509, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.828125435399779, + "language_loss": 0.67864037, + "learning_rate": 2.593983497660586e-06, + "loss": 0.69972205, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.6172780990600586 + }, + { + "auxiliary_loss_clip": 0.0102177, + "auxiliary_loss_mlp": 0.01002846, + "balance_loss_clip": 1.00835514, + "balance_loss_mlp": 1.00146282, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6908349272519706, + "language_loss": 0.5942809, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61452711, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.1582508087158203 + }, + { + "auxiliary_loss_clip": 0.01096368, + "auxiliary_loss_mlp": 0.01033113, + "balance_loss_clip": 1.0346725, + "balance_loss_mlp": 1.01949322, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.919236523715104, + "language_loss": 0.75496578, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77626055, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 2.547869920730591 + }, + { + "auxiliary_loss_clip": 0.01088947, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.03670025, + "balance_loss_mlp": 1.01864696, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 2.0215919311961223, + "language_loss": 0.6936295, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71485144, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.566974401473999 + }, + { + "auxiliary_loss_clip": 0.01090183, + "auxiliary_loss_mlp": 0.00749648, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.00043094, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.7010536715429165, + "language_loss": 0.81180274, + "learning_rate": 2.592495760867347e-06, + "loss": 0.83020103, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.74886417388916 + }, + { + "auxiliary_loss_clip": 0.01032206, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_clip": 1.02776921, + "balance_loss_mlp": 1.02665484, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.5303614355109436, + "language_loss": 0.69720542, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.71795368, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.857102394104004 + }, + { + "auxiliary_loss_clip": 0.01096241, + "auxiliary_loss_mlp": 0.0102839, + "balance_loss_clip": 1.03663039, + "balance_loss_mlp": 1.01661813, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.5350906173225838, + "language_loss": 0.67326236, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69450867, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 2.6183552742004395 + }, + { + "auxiliary_loss_clip": 0.01080942, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.03655148, + "balance_loss_mlp": 1.0315876, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.9915949491972487, + "language_loss": 0.69433504, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71562117, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 4.206719636917114 + }, + { + "auxiliary_loss_clip": 0.01113387, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.03887093, + "balance_loss_mlp": 1.01928496, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.8575584256316862, + "language_loss": 0.76608694, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78754801, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 2.5442211627960205 + }, + { + "auxiliary_loss_clip": 0.01077277, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.03512859, + "balance_loss_mlp": 1.02308941, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.7734861734613487, + "language_loss": 0.79642498, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81755877, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.5831456184387207 + }, + { + "auxiliary_loss_clip": 0.01031644, + "auxiliary_loss_mlp": 0.01000263, + "balance_loss_clip": 1.00833082, + "balance_loss_mlp": 0.99877876, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7190387099373902, + "language_loss": 0.61934745, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.6396665, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.1943962574005127 + }, + { + "auxiliary_loss_clip": 0.0111166, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.03863573, + "balance_loss_mlp": 1.01917028, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 2.298185244420881, + "language_loss": 0.70686984, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.72830939, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 2.553063154220581 + }, + { + "auxiliary_loss_clip": 0.010869, + "auxiliary_loss_mlp": 0.01039657, + "balance_loss_clip": 1.03578329, + "balance_loss_mlp": 1.02605498, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 2.0416539079950704, + "language_loss": 0.8205862, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84185171, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.6376535892486572 + }, + { + "auxiliary_loss_clip": 0.01069508, + "auxiliary_loss_mlp": 0.01037851, + "balance_loss_clip": 1.03641927, + "balance_loss_mlp": 1.02372432, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 1.737540961889184, + "language_loss": 0.74571693, + "learning_rate": 2.589147040109424e-06, + "loss": 0.76679051, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 2.672435760498047 + }, + { + "auxiliary_loss_clip": 0.01111288, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.0362916, + "balance_loss_mlp": 1.02045429, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 2.0517250127495155, + "language_loss": 0.86749482, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88895392, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 4.011544942855835 + }, + { + "auxiliary_loss_clip": 0.01102525, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.03836989, + "balance_loss_mlp": 1.02162313, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 2.082745441165637, + "language_loss": 0.73015952, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75154543, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 2.586400032043457 + }, + { + "auxiliary_loss_clip": 0.01079721, + "auxiliary_loss_mlp": 0.01040959, + "balance_loss_clip": 1.03374362, + "balance_loss_mlp": 1.02706528, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.6103405491600935, + "language_loss": 0.70258474, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.7237916, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 2.6419010162353516 + }, + { + "auxiliary_loss_clip": 0.01085587, + "auxiliary_loss_mlp": 0.00749773, + "balance_loss_clip": 1.03540897, + "balance_loss_mlp": 1.00041473, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.8681946397656386, + "language_loss": 0.90239769, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92075133, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 2.579545259475708 + }, + { + "auxiliary_loss_clip": 0.01089615, + "auxiliary_loss_mlp": 0.01035063, + "balance_loss_clip": 1.03611612, + "balance_loss_mlp": 1.02219462, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.5468935752882518, + "language_loss": 0.77253872, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79378545, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 2.685133934020996 + }, + { + "auxiliary_loss_clip": 0.01100303, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.03807616, + "balance_loss_mlp": 1.02606034, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.8312014281576212, + "language_loss": 0.82277656, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84417963, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 4.026791572570801 + }, + { + "auxiliary_loss_clip": 0.01087702, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.0377878, + "balance_loss_mlp": 1.02308691, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 2.105188586059279, + "language_loss": 0.70487511, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72610909, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 2.6203675270080566 + }, + { + "auxiliary_loss_clip": 0.01073523, + "auxiliary_loss_mlp": 0.00749866, + "balance_loss_clip": 1.03598869, + "balance_loss_mlp": 1.00044513, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.7641738059524767, + "language_loss": 0.77775061, + "learning_rate": 2.586168879961155e-06, + "loss": 0.79598451, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 2.6240692138671875 + }, + { + "auxiliary_loss_clip": 0.01076119, + "auxiliary_loss_mlp": 0.01043718, + "balance_loss_clip": 1.03956652, + "balance_loss_mlp": 1.02851939, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.2469839948772243, + "language_loss": 0.6598382, + "learning_rate": 2.585796509770259e-06, + "loss": 0.68103653, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 4.133596181869507 + }, + { + "auxiliary_loss_clip": 0.01106412, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.03796768, + "balance_loss_mlp": 1.02263772, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 2.9587376804787286, + "language_loss": 0.7592383, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78067386, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.5665676593780518 + }, + { + "auxiliary_loss_clip": 0.01102364, + "auxiliary_loss_mlp": 0.01032994, + "balance_loss_clip": 1.03715742, + "balance_loss_mlp": 1.01933861, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.9966820565290033, + "language_loss": 0.65220976, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.6735633, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 2.630166530609131 + }, + { + "auxiliary_loss_clip": 0.01082921, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.0348599, + "balance_loss_mlp": 1.01932502, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.8139147902722053, + "language_loss": 0.73772621, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.75888711, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 2.7773752212524414 + }, + { + "auxiliary_loss_clip": 0.01098393, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.03718495, + "balance_loss_mlp": 1.02297664, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3040463719791133, + "language_loss": 0.82070816, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84204954, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.603959321975708 + }, + { + "auxiliary_loss_clip": 0.01093111, + "auxiliary_loss_mlp": 0.01040268, + "balance_loss_clip": 1.04065168, + "balance_loss_mlp": 1.02558744, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.189788184442444, + "language_loss": 0.65577233, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67710614, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.5564699172973633 + }, + { + "auxiliary_loss_clip": 0.01095581, + "auxiliary_loss_mlp": 0.01049459, + "balance_loss_clip": 1.03717017, + "balance_loss_mlp": 1.03383076, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.6145869080620354, + "language_loss": 0.75216508, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77361548, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.6798503398895264 + }, + { + "auxiliary_loss_clip": 0.01074743, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.03730404, + "balance_loss_mlp": 1.03517199, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.073020145680011, + "language_loss": 0.80612773, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82737768, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.591959238052368 + }, + { + "auxiliary_loss_clip": 0.01027245, + "auxiliary_loss_mlp": 0.01042196, + "balance_loss_clip": 1.03739882, + "balance_loss_mlp": 1.02807605, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.741993788803636, + "language_loss": 0.76819623, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.7888906, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 2.809436321258545 + }, + { + "auxiliary_loss_clip": 0.01113295, + "auxiliary_loss_mlp": 0.01032026, + "balance_loss_clip": 1.04117095, + "balance_loss_mlp": 1.01953316, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 2.1115736843366637, + "language_loss": 0.68067461, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70212781, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.8404135704040527 + }, + { + "auxiliary_loss_clip": 0.01098084, + "auxiliary_loss_mlp": 0.01038366, + "balance_loss_clip": 1.03665257, + "balance_loss_mlp": 1.02362609, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 2.0531318645513386, + "language_loss": 0.77841932, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.79978383, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.54310941696167 + }, + { + "auxiliary_loss_clip": 0.01107793, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.04118097, + "balance_loss_mlp": 1.02819586, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.6780405203607818, + "language_loss": 0.82207537, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.8435744, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.564124584197998 + }, + { + "auxiliary_loss_clip": 0.01113387, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.03705466, + "balance_loss_mlp": 1.02049208, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.009171155311937, + "language_loss": 0.73560697, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75708157, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.4604885578155518 + }, + { + "auxiliary_loss_clip": 0.01080148, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.03662181, + "balance_loss_mlp": 1.02247667, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.4963439387632913, + "language_loss": 0.86150014, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.882653, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.6734697818756104 + }, + { + "auxiliary_loss_clip": 0.01089569, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.0360043, + "balance_loss_mlp": 1.02957344, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.4050961863378768, + "language_loss": 0.72451717, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74584502, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.605205774307251 + }, + { + "auxiliary_loss_clip": 0.01077578, + "auxiliary_loss_mlp": 0.00749871, + "balance_loss_clip": 1.03811049, + "balance_loss_mlp": 1.00051379, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.555919925651105, + "language_loss": 0.82437527, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84264982, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.648423433303833 + }, + { + "auxiliary_loss_clip": 0.0101953, + "auxiliary_loss_mlp": 0.01005383, + "balance_loss_clip": 1.00657797, + "balance_loss_mlp": 1.00404203, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7970419763860261, + "language_loss": 0.60410213, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62435126, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.051243305206299 + }, + { + "auxiliary_loss_clip": 0.01114246, + "auxiliary_loss_mlp": 0.01042719, + "balance_loss_clip": 1.03848827, + "balance_loss_mlp": 1.02841353, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.745232526667914, + "language_loss": 0.77340269, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.7949723, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.474721670150757 + }, + { + "auxiliary_loss_clip": 0.01106852, + "auxiliary_loss_mlp": 0.01039712, + "balance_loss_clip": 1.03851295, + "balance_loss_mlp": 1.02394032, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 2.1253626920925672, + "language_loss": 0.84146583, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86293149, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.581082820892334 + }, + { + "auxiliary_loss_clip": 0.01080318, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.03880882, + "balance_loss_mlp": 1.02018332, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 2.7284177036060697, + "language_loss": 0.82992375, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85107267, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 2.7951037883758545 + }, + { + "auxiliary_loss_clip": 0.01089328, + "auxiliary_loss_mlp": 0.00749652, + "balance_loss_clip": 1.03988922, + "balance_loss_mlp": 1.0004735, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.7973910573440497, + "language_loss": 0.80322468, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.8216145, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 2.617220163345337 + }, + { + "auxiliary_loss_clip": 0.01113889, + "auxiliary_loss_mlp": 0.0103767, + "balance_loss_clip": 1.03741527, + "balance_loss_mlp": 1.02295923, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 2.356186159760306, + "language_loss": 0.7023896, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72390521, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.4889657497406006 + }, + { + "auxiliary_loss_clip": 0.01106384, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.04045248, + "balance_loss_mlp": 1.01715159, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.9721843241260537, + "language_loss": 0.76383305, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78520298, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.60489559173584 + }, + { + "auxiliary_loss_clip": 0.01103326, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.0375756, + "balance_loss_mlp": 1.02327538, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 2.6292204027800774, + "language_loss": 0.72039354, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.74180651, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.01086391, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.03638935, + "balance_loss_mlp": 1.02665138, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 3.709008274198984, + "language_loss": 0.66111577, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68237972, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.5486819744110107 + }, + { + "auxiliary_loss_clip": 0.01076378, + "auxiliary_loss_mlp": 0.007497, + "balance_loss_clip": 1.03477144, + "balance_loss_mlp": 1.00046587, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.7501626700411792, + "language_loss": 0.78486097, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80312175, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 4.318702459335327 + }, + { + "auxiliary_loss_clip": 0.01112764, + "auxiliary_loss_mlp": 0.01037919, + "balance_loss_clip": 1.03693354, + "balance_loss_mlp": 1.02360809, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 4.0769913088203795, + "language_loss": 0.75489318, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77640003, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.5992894172668457 + }, + { + "auxiliary_loss_clip": 0.01103163, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.03900719, + "balance_loss_mlp": 1.01913953, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.7344962190667987, + "language_loss": 0.72483307, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74620008, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 2.588242769241333 + }, + { + "auxiliary_loss_clip": 0.01080792, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.03693414, + "balance_loss_mlp": 1.02354121, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 2.0047988418071414, + "language_loss": 0.80527908, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.82647288, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 2.6040985584259033 + }, + { + "auxiliary_loss_clip": 0.01029442, + "auxiliary_loss_mlp": 0.01001187, + "balance_loss_clip": 1.0056479, + "balance_loss_mlp": 0.99979192, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9188342574375338, + "language_loss": 0.6351186, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65542489, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 3.016124963760376 + }, + { + "auxiliary_loss_clip": 0.01113935, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.03854251, + "balance_loss_mlp": 1.02215171, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.9058981412502534, + "language_loss": 0.7217164, + "learning_rate": 2.574615138284361e-06, + "loss": 0.7432282, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 2.5050060749053955 + }, + { + "auxiliary_loss_clip": 0.01116787, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.03991592, + "balance_loss_mlp": 1.01918459, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 2.1704899816170014, + "language_loss": 0.79606432, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.8175782, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 2.4930665493011475 + }, + { + "auxiliary_loss_clip": 0.01101549, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.03726208, + "balance_loss_mlp": 1.01711702, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.938125050644703, + "language_loss": 0.70170796, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72303832, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 4.043215751647949 + }, + { + "auxiliary_loss_clip": 0.01113255, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.03799343, + "balance_loss_mlp": 1.02040339, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.3366598260805893, + "language_loss": 0.71133447, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73280525, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 2.5377981662750244 + }, + { + "auxiliary_loss_clip": 0.0106763, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.03444695, + "balance_loss_mlp": 1.02530503, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 2.089096169681774, + "language_loss": 0.81600118, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83706892, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 2.6198666095733643 + }, + { + "auxiliary_loss_clip": 0.01097488, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.03677714, + "balance_loss_mlp": 1.01572168, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.9466978370002517, + "language_loss": 0.89947844, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.92073715, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 2.5217807292938232 + }, + { + "auxiliary_loss_clip": 0.01105326, + "auxiliary_loss_mlp": 0.00749968, + "balance_loss_clip": 1.03709602, + "balance_loss_mlp": 1.00046241, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 2.4340266663152295, + "language_loss": 0.63981879, + "learning_rate": 2.572376498508805e-06, + "loss": 0.65837175, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 2.568467378616333 + }, + { + "auxiliary_loss_clip": 0.01075833, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.03540778, + "balance_loss_mlp": 1.01554513, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.5179367359909572, + "language_loss": 0.7384274, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75946689, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 4.075500726699829 + }, + { + "auxiliary_loss_clip": 0.01084126, + "auxiliary_loss_mlp": 0.01043182, + "balance_loss_clip": 1.0349189, + "balance_loss_mlp": 1.02900171, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 2.0363912442023384, + "language_loss": 0.78768647, + "learning_rate": 2.571630111462766e-06, + "loss": 0.8089596, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.601072072982788 + }, + { + "auxiliary_loss_clip": 0.01085837, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.03616524, + "balance_loss_mlp": 1.01854599, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6643259212103128, + "language_loss": 0.72725457, + "learning_rate": 2.571256885418265e-06, + "loss": 0.74841714, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.562828540802002 + }, + { + "auxiliary_loss_clip": 0.01088078, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.04149556, + "balance_loss_mlp": 1.02436972, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.9151767681173528, + "language_loss": 0.79708314, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.818335, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 2.5660603046417236 + }, + { + "auxiliary_loss_clip": 0.01101097, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.04028857, + "balance_loss_mlp": 1.01833785, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.6117653903631453, + "language_loss": 0.72033632, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.741651, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 4.384556293487549 + }, + { + "auxiliary_loss_clip": 0.01110614, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.03683686, + "balance_loss_mlp": 1.02334547, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.215600764039618, + "language_loss": 0.80370748, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82517427, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.01081312, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.03539407, + "balance_loss_mlp": 1.01763821, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.8708681405722467, + "language_loss": 0.81289852, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83401519, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.546405076980591 + }, + { + "auxiliary_loss_clip": 0.01101679, + "auxiliary_loss_mlp": 0.01040983, + "balance_loss_clip": 1.03813338, + "balance_loss_mlp": 1.02693462, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.7153234893678024, + "language_loss": 0.69510543, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71653199, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.5821478366851807 + }, + { + "auxiliary_loss_clip": 0.01020438, + "auxiliary_loss_mlp": 0.01011797, + "balance_loss_clip": 1.00619483, + "balance_loss_mlp": 1.01037216, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8741259489770977, + "language_loss": 0.6705637, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69088602, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.2196543216705322 + }, + { + "auxiliary_loss_clip": 0.01101073, + "auxiliary_loss_mlp": 0.0104473, + "balance_loss_clip": 1.03842306, + "balance_loss_mlp": 1.03026366, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 7.765502672713221, + "language_loss": 0.78170657, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.8031646, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 2.5347325801849365 + }, + { + "auxiliary_loss_clip": 0.0110304, + "auxiliary_loss_mlp": 0.01039574, + "balance_loss_clip": 1.03953695, + "balance_loss_mlp": 1.02438617, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.183799474164956, + "language_loss": 0.76053399, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78196013, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.5300393104553223 + }, + { + "auxiliary_loss_clip": 0.01088767, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.03625727, + "balance_loss_mlp": 1.02232254, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 3.2113790927846066, + "language_loss": 0.80464309, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82589233, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.527265787124634 + }, + { + "auxiliary_loss_clip": 0.01093096, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.0381124, + "balance_loss_mlp": 1.02097964, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.9812210744066483, + "language_loss": 0.65711057, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.67839181, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.6156442165374756 + }, + { + "auxiliary_loss_clip": 0.01054731, + "auxiliary_loss_mlp": 0.01038883, + "balance_loss_clip": 1.03418446, + "balance_loss_mlp": 1.02535868, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 2.982023534131216, + "language_loss": 0.68809235, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70902848, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.6733238697052 + }, + { + "auxiliary_loss_clip": 0.01064807, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.03438759, + "balance_loss_mlp": 1.02019095, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.5485801169175617, + "language_loss": 0.72831994, + "learning_rate": 2.566776487287525e-06, + "loss": 0.74930298, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.6425397396087646 + }, + { + "auxiliary_loss_clip": 0.01092383, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.03655517, + "balance_loss_mlp": 1.02396393, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.8487967350612673, + "language_loss": 0.75096327, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77226341, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.678412914276123 + }, + { + "auxiliary_loss_clip": 0.01061531, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.03597939, + "balance_loss_mlp": 1.02230883, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 2.0673426023397132, + "language_loss": 0.82232726, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84328288, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.6921792030334473 + }, + { + "auxiliary_loss_clip": 0.01091048, + "auxiliary_loss_mlp": 0.0103987, + "balance_loss_clip": 1.03586507, + "balance_loss_mlp": 1.02648234, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6234142571929442, + "language_loss": 0.74001819, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76132739, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.6230034828186035 + }, + { + "auxiliary_loss_clip": 0.01102422, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.03706145, + "balance_loss_mlp": 1.0212779, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.372667369572602, + "language_loss": 0.69642627, + "learning_rate": 2.565282332284532e-06, + "loss": 0.71779799, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.592761993408203 + }, + { + "auxiliary_loss_clip": 0.01080362, + "auxiliary_loss_mlp": 0.01036878, + "balance_loss_clip": 1.03807425, + "balance_loss_mlp": 1.02319908, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.6786558395693911, + "language_loss": 0.81316268, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83433509, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 2.651202440261841 + }, + { + "auxiliary_loss_clip": 0.01113006, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.03785717, + "balance_loss_mlp": 1.02333224, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 2.3176819223146063, + "language_loss": 0.80752194, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82902658, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.528623580932617 + }, + { + "auxiliary_loss_clip": 0.0110618, + "auxiliary_loss_mlp": 0.0103461, + "balance_loss_clip": 1.03950775, + "balance_loss_mlp": 1.021384, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 2.2062921257226322, + "language_loss": 0.65402681, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67543471, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.624119281768799 + }, + { + "auxiliary_loss_clip": 0.01079348, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.03656042, + "balance_loss_mlp": 1.01668715, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.6650600352542066, + "language_loss": 0.74591005, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76699758, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 2.619326114654541 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.0102919, + "balance_loss_clip": 1.03659153, + "balance_loss_mlp": 1.01697087, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 2.139009100466997, + "language_loss": 0.75102448, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77232075, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.531593084335327 + }, + { + "auxiliary_loss_clip": 0.0108951, + "auxiliary_loss_mlp": 0.0104112, + "balance_loss_clip": 1.03558195, + "balance_loss_mlp": 1.02720881, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 2.6412363020997733, + "language_loss": 0.82640892, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.8477152, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.581652879714966 + }, + { + "auxiliary_loss_clip": 0.01091238, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.03702044, + "balance_loss_mlp": 1.01805818, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.3174477511261842, + "language_loss": 0.82106221, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84228313, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.586580991744995 + }, + { + "auxiliary_loss_clip": 0.0111515, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.03789806, + "balance_loss_mlp": 1.01546025, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 2.274224347798672, + "language_loss": 0.73152506, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.7529614, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 2.4313652515411377 + }, + { + "auxiliary_loss_clip": 0.01098322, + "auxiliary_loss_mlp": 0.01029097, + "balance_loss_clip": 1.03606129, + "balance_loss_mlp": 1.01619267, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.8444508883731707, + "language_loss": 0.83017313, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85144734, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.4924941062927246 + }, + { + "auxiliary_loss_clip": 0.01082194, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.03322101, + "balance_loss_mlp": 1.02488685, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.5114030764344384, + "language_loss": 0.73446333, + "learning_rate": 2.561545446271294e-06, + "loss": 0.75569308, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 4.071154594421387 + }, + { + "auxiliary_loss_clip": 0.01095072, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.03613949, + "balance_loss_mlp": 1.02253866, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.6569650029433576, + "language_loss": 0.74910349, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77040988, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 2.6614010334014893 + }, + { + "auxiliary_loss_clip": 0.01114784, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.03904665, + "balance_loss_mlp": 1.02247655, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.9072117514413798, + "language_loss": 0.76519716, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78669727, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 2.52761173248291 + }, + { + "auxiliary_loss_clip": 0.01087802, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.0345602, + "balance_loss_mlp": 1.0189178, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.7756660557601809, + "language_loss": 0.7989713, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82016182, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 2.5575647354125977 + }, + { + "auxiliary_loss_clip": 0.01057967, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.0336045, + "balance_loss_mlp": 1.02695346, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.3638836913927284, + "language_loss": 0.67909747, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70008636, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 2.6774003505706787 + }, + { + "auxiliary_loss_clip": 0.01087053, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.03772771, + "balance_loss_mlp": 1.02240729, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.7688612159597672, + "language_loss": 0.71283889, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73405826, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 2.5533270835876465 + }, + { + "auxiliary_loss_clip": 0.0109613, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.03524446, + "balance_loss_mlp": 1.02305532, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 1.926323713811297, + "language_loss": 0.64377868, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66511965, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 2.5893521308898926 + }, + { + "auxiliary_loss_clip": 0.01113221, + "auxiliary_loss_mlp": 0.00749697, + "balance_loss_clip": 1.03742993, + "balance_loss_mlp": 1.0004437, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 2.185440064521467, + "language_loss": 0.76608264, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78471178, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.550572395324707 + }, + { + "auxiliary_loss_clip": 0.01081737, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.0369606, + "balance_loss_mlp": 1.01508141, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.8815346204671395, + "language_loss": 0.73126578, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75237238, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 4.108644962310791 + }, + { + "auxiliary_loss_clip": 0.01080619, + "auxiliary_loss_mlp": 0.01036992, + "balance_loss_clip": 1.03294504, + "balance_loss_mlp": 1.0240941, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.5994497966005188, + "language_loss": 0.71520019, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73637629, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 2.606637954711914 + }, + { + "auxiliary_loss_clip": 0.01103631, + "auxiliary_loss_mlp": 0.01041879, + "balance_loss_clip": 1.03831005, + "balance_loss_mlp": 1.02829528, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 2.9486262682881645, + "language_loss": 0.6200192, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64147431, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 2.5698530673980713 + }, + { + "auxiliary_loss_clip": 0.01108124, + "auxiliary_loss_mlp": 0.01040572, + "balance_loss_clip": 1.03963423, + "balance_loss_mlp": 1.02501535, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.656936942012808, + "language_loss": 0.64838892, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66987586, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.5857996940612793 + }, + { + "auxiliary_loss_clip": 0.01084886, + "auxiliary_loss_mlp": 0.01035812, + "balance_loss_clip": 1.03478646, + "balance_loss_mlp": 1.02333105, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.5235523638605044, + "language_loss": 0.73879182, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.7599988, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 4.1175267696380615 + }, + { + "auxiliary_loss_clip": 0.01079716, + "auxiliary_loss_mlp": 0.01039508, + "balance_loss_clip": 1.03278983, + "balance_loss_mlp": 1.02675295, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 3.029173042902113, + "language_loss": 0.69221377, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71340597, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 2.6670844554901123 + }, + { + "auxiliary_loss_clip": 0.01086844, + "auxiliary_loss_mlp": 0.01039648, + "balance_loss_clip": 1.03713763, + "balance_loss_mlp": 1.0263741, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.8178478370471085, + "language_loss": 0.69074738, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.71201229, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 2.5460152626037598 + }, + { + "auxiliary_loss_clip": 0.01066763, + "auxiliary_loss_mlp": 0.01042267, + "balance_loss_clip": 1.03327215, + "balance_loss_mlp": 1.02826607, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 6.901084066717033, + "language_loss": 0.74358928, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76467955, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 4.193904638290405 + }, + { + "auxiliary_loss_clip": 0.01045424, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.0329957, + "balance_loss_mlp": 1.01975119, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 2.2949703846598233, + "language_loss": 0.74706757, + "learning_rate": 2.555562005426573e-06, + "loss": 0.76784688, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.660344362258911 + }, + { + "auxiliary_loss_clip": 0.01089272, + "auxiliary_loss_mlp": 0.00749516, + "balance_loss_clip": 1.03761005, + "balance_loss_mlp": 1.00047231, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.949113392909361, + "language_loss": 0.76813591, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.78652376, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 2.61806321144104 + }, + { + "auxiliary_loss_clip": 0.01087788, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.03571391, + "balance_loss_mlp": 1.02228284, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 2.0110289647984017, + "language_loss": 0.85863721, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87986153, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.6029727458953857 + }, + { + "auxiliary_loss_clip": 0.01055024, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.03247356, + "balance_loss_mlp": 1.01951444, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 3.013702444100794, + "language_loss": 0.81314838, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83402222, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.698617458343506 + }, + { + "auxiliary_loss_clip": 0.01070869, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.03919435, + "balance_loss_mlp": 1.02162826, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.6150990520873192, + "language_loss": 0.81077629, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83182323, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.631476879119873 + }, + { + "auxiliary_loss_clip": 0.01096578, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.03344131, + "balance_loss_mlp": 1.02153945, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.7533574038952575, + "language_loss": 0.80427742, + "learning_rate": 2.553691071416498e-06, + "loss": 0.8255924, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.5794196128845215 + }, + { + "auxiliary_loss_clip": 0.01108248, + "auxiliary_loss_mlp": 0.00749305, + "balance_loss_clip": 1.03767526, + "balance_loss_mlp": 1.00032735, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.9634795462259726, + "language_loss": 0.7485109, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76708639, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.5212199687957764 + }, + { + "auxiliary_loss_clip": 0.01099408, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.03688645, + "balance_loss_mlp": 1.01826346, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 1.8183739533926262, + "language_loss": 0.81296206, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83426929, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 2.594464063644409 + }, + { + "auxiliary_loss_clip": 0.01064969, + "auxiliary_loss_mlp": 0.01037328, + "balance_loss_clip": 1.03529239, + "balance_loss_mlp": 1.02434576, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.8651455559422325, + "language_loss": 0.7607671, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78179002, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.603912115097046 + }, + { + "auxiliary_loss_clip": 0.01066015, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.03570676, + "balance_loss_mlp": 1.02282298, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.9590416870216991, + "language_loss": 0.73717296, + "learning_rate": 2.552193946194937e-06, + "loss": 0.75819284, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.707615375518799 + }, + { + "auxiliary_loss_clip": 0.0110054, + "auxiliary_loss_mlp": 0.00749513, + "balance_loss_clip": 1.03821659, + "balance_loss_mlp": 1.00044608, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 2.002461493770674, + "language_loss": 0.77674866, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79524916, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.5586864948272705 + }, + { + "auxiliary_loss_clip": 0.01092075, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.03921938, + "balance_loss_mlp": 1.02067852, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 2.2405831546371555, + "language_loss": 0.73464167, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75590175, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.5492491722106934 + }, + { + "auxiliary_loss_clip": 0.01088998, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.03664851, + "balance_loss_mlp": 1.02175117, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.270758108143972, + "language_loss": 0.77436072, + "learning_rate": 2.551070882366973e-06, + "loss": 0.79559839, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.589339017868042 + }, + { + "auxiliary_loss_clip": 0.01069823, + "auxiliary_loss_mlp": 0.00749541, + "balance_loss_clip": 1.03607345, + "balance_loss_mlp": 1.00031614, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.6619623078079182, + "language_loss": 0.78415143, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80234504, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 2.6437904834747314 + }, + { + "auxiliary_loss_clip": 0.01086011, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.03588367, + "balance_loss_mlp": 1.02305722, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 6.924821753889545, + "language_loss": 0.74572694, + "learning_rate": 2.550322068641355e-06, + "loss": 0.76694357, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 2.5119640827178955 + }, + { + "auxiliary_loss_clip": 0.01086674, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.03175092, + "balance_loss_mlp": 1.0229727, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 1.8325204912846413, + "language_loss": 0.84198374, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.8632068, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.465824604034424 + }, + { + "auxiliary_loss_clip": 0.01032662, + "auxiliary_loss_mlp": 0.01040259, + "balance_loss_clip": 1.02952886, + "balance_loss_mlp": 1.02601337, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 2.0017375613312196, + "language_loss": 0.74901646, + "learning_rate": 2.549573171442666e-06, + "loss": 0.76974565, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.697199821472168 + }, + { + "auxiliary_loss_clip": 0.01097256, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.03610885, + "balance_loss_mlp": 1.02328706, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 2.0992636651062972, + "language_loss": 0.78746009, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.80879372, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.548846960067749 + }, + { + "auxiliary_loss_clip": 0.01114475, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.03919101, + "balance_loss_mlp": 1.02023625, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 14.006084812302134, + "language_loss": 0.7643472, + "learning_rate": 2.548824190884499e-06, + "loss": 0.7858305, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 2.5253021717071533 + }, + { + "auxiliary_loss_clip": 0.01011735, + "auxiliary_loss_mlp": 0.01000861, + "balance_loss_clip": 1.00796342, + "balance_loss_mlp": 0.9992637, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7706540941994623, + "language_loss": 0.561948, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58207399, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 3.015300989151001 + }, + { + "auxiliary_loss_clip": 0.01105971, + "auxiliary_loss_mlp": 0.00749217, + "balance_loss_clip": 1.03616309, + "balance_loss_mlp": 1.00030422, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6362712458108004, + "language_loss": 0.81044537, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.82899719, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.5219321250915527 + }, + { + "auxiliary_loss_clip": 0.01097807, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.03528452, + "balance_loss_mlp": 1.01640534, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 2.3481464201011684, + "language_loss": 0.81607747, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.83734643, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 2.5295820236206055 + }, + { + "auxiliary_loss_clip": 0.01100836, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.03932595, + "balance_loss_mlp": 1.02516603, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.6764239319765286, + "language_loss": 0.86649787, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88789308, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 2.535024642944336 + }, + { + "auxiliary_loss_clip": 0.01086237, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.03915167, + "balance_loss_mlp": 1.01831865, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.9072750757252952, + "language_loss": 0.78115451, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80232298, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 2.6172704696655273 + }, + { + "auxiliary_loss_clip": 0.0105827, + "auxiliary_loss_mlp": 0.01038238, + "balance_loss_clip": 1.0356741, + "balance_loss_mlp": 1.02508307, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.0073906990963826, + "language_loss": 0.76911455, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.79007959, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 4.083094120025635 + }, + { + "auxiliary_loss_clip": 0.0107821, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.03594255, + "balance_loss_mlp": 1.01790452, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 1.5770479565544178, + "language_loss": 0.73332512, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75441849, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 2.669144630432129 + }, + { + "auxiliary_loss_clip": 0.01098236, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.03545642, + "balance_loss_mlp": 1.0171653, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 2.047649859613928, + "language_loss": 0.79021639, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81149894, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 2.563136339187622 + }, + { + "auxiliary_loss_clip": 0.01094024, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.03470099, + "balance_loss_mlp": 1.01891363, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 2.0276081852322436, + "language_loss": 0.82940698, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85065448, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 2.596242666244507 + }, + { + "auxiliary_loss_clip": 0.01096922, + "auxiliary_loss_mlp": 0.01041272, + "balance_loss_clip": 1.03934002, + "balance_loss_mlp": 1.02758646, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 2.187695295307433, + "language_loss": 0.87193221, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89331412, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 2.6370251178741455 + }, + { + "auxiliary_loss_clip": 0.01080391, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.03475845, + "balance_loss_mlp": 1.01973557, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5705464499965822, + "language_loss": 0.77719522, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.79831672, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 2.654066324234009 + }, + { + "auxiliary_loss_clip": 0.01061196, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.03118539, + "balance_loss_mlp": 1.02258873, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.6248264066941835, + "language_loss": 0.7974118, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81837738, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 2.6245696544647217 + }, + { + "auxiliary_loss_clip": 0.01103419, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.03847313, + "balance_loss_mlp": 1.02499235, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 4.149759442324059, + "language_loss": 0.75037491, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77180737, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 2.4672346115112305 + }, + { + "auxiliary_loss_clip": 0.01061058, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.03232884, + "balance_loss_mlp": 1.02047777, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 3.110888106655765, + "language_loss": 0.70207536, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72302997, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 4.1824119091033936 + }, + { + "auxiliary_loss_clip": 0.01084479, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.03476381, + "balance_loss_mlp": 1.0208199, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.637184360521112, + "language_loss": 0.7128644, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73404384, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 2.744908571243286 + }, + { + "auxiliary_loss_clip": 0.01092776, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.03437614, + "balance_loss_mlp": 1.01942253, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 1.9718015009250385, + "language_loss": 0.78442091, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80567491, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.516576051712036 + }, + { + "auxiliary_loss_clip": 0.01074804, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.03358376, + "balance_loss_mlp": 1.01877666, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.837881486360053, + "language_loss": 0.78863722, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80971014, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 2.5882554054260254 + }, + { + "auxiliary_loss_clip": 0.01083219, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.0352993, + "balance_loss_mlp": 1.01925278, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 2.17395952474454, + "language_loss": 0.88543677, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90658188, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 4.091665029525757 + }, + { + "auxiliary_loss_clip": 0.01111198, + "auxiliary_loss_mlp": 0.0103286, + "balance_loss_clip": 1.03601539, + "balance_loss_mlp": 1.019526, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.8272331569336786, + "language_loss": 0.82794869, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84938931, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.54856014251709 + }, + { + "auxiliary_loss_clip": 0.01113385, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.03810191, + "balance_loss_mlp": 1.02164054, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 2.4346865225673526, + "language_loss": 0.71599567, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.73747814, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.512024402618408 + }, + { + "auxiliary_loss_clip": 0.01098727, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.03627801, + "balance_loss_mlp": 1.02059579, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.0716255015392138, + "language_loss": 0.82306921, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.84438694, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 4.05840802192688 + }, + { + "auxiliary_loss_clip": 0.01078838, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.03262043, + "balance_loss_mlp": 1.02366567, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.661291175122568, + "language_loss": 0.82796234, + "learning_rate": 2.54057993551933e-06, + "loss": 0.84911776, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 2.5893971920013428 + }, + { + "auxiliary_loss_clip": 0.01102788, + "auxiliary_loss_mlp": 0.01037823, + "balance_loss_clip": 1.0378592, + "balance_loss_mlp": 1.0228498, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 2.206219034890555, + "language_loss": 0.76899755, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79040366, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.522132635116577 + }, + { + "auxiliary_loss_clip": 0.01093893, + "auxiliary_loss_mlp": 0.01035241, + "balance_loss_clip": 1.03445089, + "balance_loss_mlp": 1.02235436, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.0992554794324576, + "language_loss": 0.72942346, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75071478, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.5749576091766357 + }, + { + "auxiliary_loss_clip": 0.01003099, + "auxiliary_loss_mlp": 0.00746442, + "balance_loss_clip": 1.01386118, + "balance_loss_mlp": 0.999722, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 1.1290921944793557, + "language_loss": 0.59019715, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.60769254, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 3.0327537059783936 + }, + { + "auxiliary_loss_clip": 0.01078315, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.03234279, + "balance_loss_mlp": 1.02422202, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.6463191769256649, + "language_loss": 0.79006636, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.8112303, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.604759454727173 + }, + { + "auxiliary_loss_clip": 0.01111196, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.03520858, + "balance_loss_mlp": 1.02779448, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.942727091026285, + "language_loss": 0.67154372, + "learning_rate": 2.538704852009177e-06, + "loss": 0.6930635, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.53416109085083 + }, + { + "auxiliary_loss_clip": 0.01082593, + "auxiliary_loss_mlp": 0.00749624, + "balance_loss_clip": 1.0364337, + "balance_loss_mlp": 1.00041366, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 2.3561475144720476, + "language_loss": 0.75165451, + "learning_rate": 2.538329773967034e-06, + "loss": 0.76997674, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 2.549668312072754 + }, + { + "auxiliary_loss_clip": 0.01097289, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.03694367, + "balance_loss_mlp": 1.02066302, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.7443424442050406, + "language_loss": 0.7182411, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73953551, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 2.5433406829833984 + }, + { + "auxiliary_loss_clip": 0.01081758, + "auxiliary_loss_mlp": 0.00749483, + "balance_loss_clip": 1.03589308, + "balance_loss_mlp": 1.00053716, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.554198352399414, + "language_loss": 0.78510952, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80342191, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.5888593196868896 + }, + { + "auxiliary_loss_clip": 0.01087963, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.03654492, + "balance_loss_mlp": 1.0245496, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.7079763030111623, + "language_loss": 0.81867278, + "learning_rate": 2.537204417416387e-06, + "loss": 0.83992869, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.559938430786133 + }, + { + "auxiliary_loss_clip": 0.01011586, + "auxiliary_loss_mlp": 0.01002173, + "balance_loss_clip": 1.00769949, + "balance_loss_mlp": 1.00083792, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6747109944524956, + "language_loss": 0.60792601, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62806356, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.25180983543396 + }, + { + "auxiliary_loss_clip": 0.0110786, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.03483295, + "balance_loss_mlp": 1.01787138, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.8243794589732805, + "language_loss": 0.762586, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78396392, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 2.530576467514038 + }, + { + "auxiliary_loss_clip": 0.01095809, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.03515995, + "balance_loss_mlp": 1.0204941, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.5423914930006897, + "language_loss": 0.77745008, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79873097, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 2.5894994735717773 + }, + { + "auxiliary_loss_clip": 0.01079028, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.03236437, + "balance_loss_mlp": 1.02623641, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.7538168709645279, + "language_loss": 0.77267367, + "learning_rate": 2.535703656890086e-06, + "loss": 0.79388189, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.5589725971221924 + }, + { + "auxiliary_loss_clip": 0.01107863, + "auxiliary_loss_mlp": 0.00749635, + "balance_loss_clip": 1.03663802, + "balance_loss_mlp": 1.00039256, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.6305776648117376, + "language_loss": 0.7685312, + "learning_rate": 2.5353284159381e-06, + "loss": 0.78710616, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.5081064701080322 + }, + { + "auxiliary_loss_clip": 0.01110184, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.03617394, + "balance_loss_mlp": 1.01789021, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.548953135521228, + "language_loss": 0.8261373, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84755838, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 2.4583852291107178 + }, + { + "auxiliary_loss_clip": 0.01060813, + "auxiliary_loss_mlp": 0.0104515, + "balance_loss_clip": 1.03198051, + "balance_loss_mlp": 1.02978373, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.348729227082061, + "language_loss": 0.74519074, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.76625031, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.5505144596099854 + }, + { + "auxiliary_loss_clip": 0.01099717, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.03484762, + "balance_loss_mlp": 1.01688766, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.5448666249732805, + "language_loss": 0.73327309, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75456738, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 2.545125722885132 + }, + { + "auxiliary_loss_clip": 0.01094564, + "auxiliary_loss_mlp": 0.01036213, + "balance_loss_clip": 1.03531241, + "balance_loss_mlp": 1.02109087, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.7838753265893852, + "language_loss": 0.81167054, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83297825, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.573540687561035 + }, + { + "auxiliary_loss_clip": 0.01086658, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.03757763, + "balance_loss_mlp": 1.02008009, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.660318232414768, + "language_loss": 0.84099209, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86218178, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.620605945587158 + }, + { + "auxiliary_loss_clip": 0.01085055, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.03408945, + "balance_loss_mlp": 1.01706123, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 1.652825737594499, + "language_loss": 0.75157082, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77271909, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 2.552797317504883 + }, + { + "auxiliary_loss_clip": 0.01079918, + "auxiliary_loss_mlp": 0.00750255, + "balance_loss_clip": 1.03134751, + "balance_loss_mlp": 1.00058568, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.6382030753122496, + "language_loss": 0.81497049, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83327228, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 2.519415855407715 + }, + { + "auxiliary_loss_clip": 0.01085689, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.03516293, + "balance_loss_mlp": 1.02280807, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.6169430161651759, + "language_loss": 0.88762343, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90884715, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 2.5567259788513184 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.00749446, + "balance_loss_clip": 1.03751683, + "balance_loss_mlp": 1.00050831, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.6766163805289032, + "language_loss": 0.75751007, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77599126, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.5699026584625244 + }, + { + "auxiliary_loss_clip": 0.0109767, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.02028632, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.573242399825947, + "language_loss": 0.77607787, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.7973876, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 4.14955472946167 + }, + { + "auxiliary_loss_clip": 0.01076926, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.03252947, + "balance_loss_mlp": 1.01907361, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 2.07600328357548, + "language_loss": 0.73365217, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75473535, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 2.686923027038574 + }, + { + "auxiliary_loss_clip": 0.0109467, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.03751862, + "balance_loss_mlp": 1.02388668, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.3100118386320867, + "language_loss": 0.74932444, + "learning_rate": 2.530823945207421e-06, + "loss": 0.77064204, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 2.694573402404785 + }, + { + "auxiliary_loss_clip": 0.01074436, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.03478122, + "balance_loss_mlp": 1.01897454, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 4.111135789962614, + "language_loss": 0.75911945, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78017783, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 2.6296002864837646 + }, + { + "auxiliary_loss_clip": 0.01005999, + "auxiliary_loss_mlp": 0.01005336, + "balance_loss_clip": 1.01794004, + "balance_loss_mlp": 1.00371504, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.869007337578814, + "language_loss": 0.68234044, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70245379, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 3.1860828399658203 + }, + { + "auxiliary_loss_clip": 0.01076417, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.03506708, + "balance_loss_mlp": 1.02225125, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.7203910217890144, + "language_loss": 0.77934593, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80045509, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 2.561582088470459 + }, + { + "auxiliary_loss_clip": 0.01057352, + "auxiliary_loss_mlp": 0.01042787, + "balance_loss_clip": 1.0331583, + "balance_loss_mlp": 1.02893507, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 2.582610336159456, + "language_loss": 0.71364999, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73465145, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.6818349361419678 + }, + { + "auxiliary_loss_clip": 0.01081121, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.03203452, + "balance_loss_mlp": 1.02429962, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.4465244432364854, + "language_loss": 0.7964952, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81767178, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 2.6285390853881836 + }, + { + "auxiliary_loss_clip": 0.01062323, + "auxiliary_loss_mlp": 0.01030972, + "balance_loss_clip": 1.03407252, + "balance_loss_mlp": 1.01899719, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.8453956025329517, + "language_loss": 0.75232267, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77325571, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 4.228353261947632 + }, + { + "auxiliary_loss_clip": 0.01058709, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.03281355, + "balance_loss_mlp": 1.02632201, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 1.7585991350843402, + "language_loss": 0.79291034, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81389761, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 2.658498764038086 + }, + { + "auxiliary_loss_clip": 0.01089843, + "auxiliary_loss_mlp": 0.01041292, + "balance_loss_clip": 1.03603244, + "balance_loss_mlp": 1.02832234, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.8466695135913624, + "language_loss": 0.75904125, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.78035259, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 2.5344231128692627 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01037091, + "balance_loss_clip": 1.03831947, + "balance_loss_mlp": 1.02417421, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 2.392866976773754, + "language_loss": 0.59188414, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.61335504, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.7107269763946533 + }, + { + "auxiliary_loss_clip": 0.01087007, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.0350039, + "balance_loss_mlp": 1.022012, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 1.9550612105197682, + "language_loss": 0.64905405, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67027879, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 4.102670669555664 + }, + { + "auxiliary_loss_clip": 0.01113298, + "auxiliary_loss_mlp": 0.01039283, + "balance_loss_clip": 1.03723633, + "balance_loss_mlp": 1.026021, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 2.860020766603568, + "language_loss": 0.72809547, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74962127, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.513925790786743 + }, + { + "auxiliary_loss_clip": 0.01096861, + "auxiliary_loss_mlp": 0.01043836, + "balance_loss_clip": 1.03768039, + "balance_loss_mlp": 1.03099072, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 2.831712985914583, + "language_loss": 0.73036969, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.7517767, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.5746543407440186 + }, + { + "auxiliary_loss_clip": 0.01072218, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.03328133, + "balance_loss_mlp": 1.0178721, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.370981931042748, + "language_loss": 0.81199598, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83301723, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 4.335066795349121 + }, + { + "auxiliary_loss_clip": 0.01089077, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.0367887, + "balance_loss_mlp": 1.02160668, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.032582522272104, + "language_loss": 0.6842249, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70545465, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.6159603595733643 + }, + { + "auxiliary_loss_clip": 0.0108694, + "auxiliary_loss_mlp": 0.00749502, + "balance_loss_clip": 1.03546083, + "balance_loss_mlp": 1.00031126, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 2.068740460178069, + "language_loss": 0.87361676, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89198112, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.604137659072876 + }, + { + "auxiliary_loss_clip": 0.01062498, + "auxiliary_loss_mlp": 0.01041084, + "balance_loss_clip": 1.03989911, + "balance_loss_mlp": 1.02592063, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 3.1880285311734347, + "language_loss": 0.64350051, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66453636, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.7683544158935547 + }, + { + "auxiliary_loss_clip": 0.01054481, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.03245902, + "balance_loss_mlp": 1.01695192, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 2.6830081149544633, + "language_loss": 0.82288867, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.84371591, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.687251091003418 + }, + { + "auxiliary_loss_clip": 0.01075824, + "auxiliary_loss_mlp": 0.0104657, + "balance_loss_clip": 1.03381991, + "balance_loss_mlp": 1.03336191, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.8261730110082959, + "language_loss": 0.812585, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.8338089, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.6499218940734863 + }, + { + "auxiliary_loss_clip": 0.0108619, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.0344224, + "balance_loss_mlp": 1.01925576, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.9851701753628257, + "language_loss": 0.73738003, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75855446, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.5995495319366455 + }, + { + "auxiliary_loss_clip": 0.01108726, + "auxiliary_loss_mlp": 0.00749407, + "balance_loss_clip": 1.03835177, + "balance_loss_mlp": 1.00043201, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.6820575203135417, + "language_loss": 0.75280631, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77138764, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.7076566219329834 + }, + { + "auxiliary_loss_clip": 0.01057955, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.03433609, + "balance_loss_mlp": 1.02109635, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 1.631735894140753, + "language_loss": 0.78794569, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80886495, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.667781114578247 + }, + { + "auxiliary_loss_clip": 0.01079678, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.03685641, + "balance_loss_mlp": 1.02300978, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.9501541711415784, + "language_loss": 0.80808282, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.82925069, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.6025354862213135 + }, + { + "auxiliary_loss_clip": 0.01088393, + "auxiliary_loss_mlp": 0.01034305, + "balance_loss_clip": 1.03654051, + "balance_loss_mlp": 1.02162731, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.531270249260138, + "language_loss": 0.7012189, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72244585, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.5712006092071533 + }, + { + "auxiliary_loss_clip": 0.01095499, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.0350343, + "balance_loss_mlp": 1.02009177, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.5842568070482386, + "language_loss": 0.81478107, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83606619, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 2.574427604675293 + }, + { + "auxiliary_loss_clip": 0.01085514, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.03592098, + "balance_loss_mlp": 1.02291822, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.7109209928698361, + "language_loss": 0.82001638, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84121901, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.5930838584899902 + }, + { + "auxiliary_loss_clip": 0.01095782, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.03444338, + "balance_loss_mlp": 1.0233047, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 1.7962668234368857, + "language_loss": 0.74735731, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76866162, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.609156608581543 + }, + { + "auxiliary_loss_clip": 0.01084692, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.03721237, + "balance_loss_mlp": 1.01937103, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.6941702554843108, + "language_loss": 0.76593089, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78708762, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.6077377796173096 + }, + { + "auxiliary_loss_clip": 0.01101093, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.03813648, + "balance_loss_mlp": 1.01972842, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.6225418794686508, + "language_loss": 0.65112734, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67245948, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 2.6438066959381104 + }, + { + "auxiliary_loss_clip": 0.01083174, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.03440344, + "balance_loss_mlp": 1.01968372, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.802912745077673, + "language_loss": 0.71895552, + "learning_rate": 2.519926222304191e-06, + "loss": 0.74010485, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.71077036857605 + }, + { + "auxiliary_loss_clip": 0.0107659, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.03398108, + "balance_loss_mlp": 1.0226891, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 1.7600637018389722, + "language_loss": 0.75051862, + "learning_rate": 2.519550141025255e-06, + "loss": 0.7716471, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 2.5374398231506348 + }, + { + "auxiliary_loss_clip": 0.01090801, + "auxiliary_loss_mlp": 0.01036334, + "balance_loss_clip": 1.03702641, + "balance_loss_mlp": 1.02177286, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.663057248925409, + "language_loss": 0.75407028, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77534163, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 2.561546802520752 + }, + { + "auxiliary_loss_clip": 0.01071573, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.03431392, + "balance_loss_mlp": 1.02391207, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 3.080896509732689, + "language_loss": 0.74035388, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76144117, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 2.6648881435394287 + }, + { + "auxiliary_loss_clip": 0.01083645, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.03730035, + "balance_loss_mlp": 1.01681149, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.8056672306631354, + "language_loss": 0.68814766, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.70927793, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.6377363204956055 + }, + { + "auxiliary_loss_clip": 0.01078257, + "auxiliary_loss_mlp": 0.01038974, + "balance_loss_clip": 1.03530216, + "balance_loss_mlp": 1.02564597, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 2.1157177222338306, + "language_loss": 0.77308911, + "learning_rate": 2.518045619038202e-06, + "loss": 0.7942614, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 2.615825653076172 + }, + { + "auxiliary_loss_clip": 0.01044714, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.03451192, + "balance_loss_mlp": 1.02134848, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 2.805338633242822, + "language_loss": 0.69462132, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71541661, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 2.691633701324463 + }, + { + "auxiliary_loss_clip": 0.01100994, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.03602791, + "balance_loss_mlp": 1.02410054, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 1.7446125282956217, + "language_loss": 0.65278023, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67415535, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 2.5667452812194824 + }, + { + "auxiliary_loss_clip": 0.01075169, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.03389633, + "balance_loss_mlp": 1.01610255, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 2.356803693993331, + "language_loss": 0.72962141, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75067043, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.558957576751709 + }, + { + "auxiliary_loss_clip": 0.01111499, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.03572869, + "balance_loss_mlp": 1.01643968, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.986308782583909, + "language_loss": 0.94010341, + "learning_rate": 2.516540782741694e-06, + "loss": 0.9615165, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 3.9732775688171387 + }, + { + "auxiliary_loss_clip": 0.0106833, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.03354836, + "balance_loss_mlp": 1.01994944, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.6015742577368062, + "language_loss": 0.61074239, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63175631, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 2.66680645942688 + }, + { + "auxiliary_loss_clip": 0.01078647, + "auxiliary_loss_mlp": 0.00749706, + "balance_loss_clip": 1.03424275, + "balance_loss_mlp": 1.00043941, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 2.260840759815655, + "language_loss": 0.77578467, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79406822, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 2.5992777347564697 + }, + { + "auxiliary_loss_clip": 0.01098567, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.03752315, + "balance_loss_mlp": 1.01787162, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.5197397497921237, + "language_loss": 0.84335816, + "learning_rate": 2.515411949802964e-06, + "loss": 0.8646487, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 2.580590009689331 + }, + { + "auxiliary_loss_clip": 0.01094105, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.03629375, + "balance_loss_mlp": 1.01973629, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 2.1944365078879122, + "language_loss": 0.76413691, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78541452, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 2.6150054931640625 + }, + { + "auxiliary_loss_clip": 0.01068089, + "auxiliary_loss_mlp": 0.01037712, + "balance_loss_clip": 1.03829813, + "balance_loss_mlp": 1.02414584, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.7546104339520197, + "language_loss": 0.80387485, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82493281, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 2.7621517181396484 + }, + { + "auxiliary_loss_clip": 0.01096656, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.03538644, + "balance_loss_mlp": 1.0235064, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 1.7960550014381287, + "language_loss": 0.81767929, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.83901697, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 2.6644644737243652 + }, + { + "auxiliary_loss_clip": 0.01096888, + "auxiliary_loss_mlp": 0.01042007, + "balance_loss_clip": 1.03861237, + "balance_loss_mlp": 1.0282681, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.1743846187136833, + "language_loss": 0.76120138, + "learning_rate": 2.513906565661973e-06, + "loss": 0.78259027, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 2.535358428955078 + }, + { + "auxiliary_loss_clip": 0.01061963, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.03594446, + "balance_loss_mlp": 1.02097225, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.4219597050600317, + "language_loss": 0.68524837, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70619971, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 4.201461315155029 + }, + { + "auxiliary_loss_clip": 0.01073972, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.0336597, + "balance_loss_mlp": 1.01672697, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.7136961360995955, + "language_loss": 0.72235179, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74339747, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.7647862434387207 + }, + { + "auxiliary_loss_clip": 0.01051852, + "auxiliary_loss_mlp": 0.01040484, + "balance_loss_clip": 1.03515506, + "balance_loss_mlp": 1.02541637, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 2.190704878826229, + "language_loss": 0.74602246, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76694578, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 2.7445530891418457 + }, + { + "auxiliary_loss_clip": 0.01090629, + "auxiliary_loss_mlp": 0.01040857, + "balance_loss_clip": 1.03692579, + "balance_loss_mlp": 1.02658212, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 2.0882925065172806, + "language_loss": 0.58823872, + "learning_rate": 2.512400869722782e-06, + "loss": 0.60955358, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.6730401515960693 + }, + { + "auxiliary_loss_clip": 0.01038175, + "auxiliary_loss_mlp": 0.01042328, + "balance_loss_clip": 1.02985358, + "balance_loss_mlp": 1.02622294, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.4086092702874538, + "language_loss": 0.77651107, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79731613, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 4.234563589096069 + }, + { + "auxiliary_loss_clip": 0.01109766, + "auxiliary_loss_mlp": 0.01031458, + "balance_loss_clip": 1.0388732, + "balance_loss_mlp": 1.01820827, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.608409049623279, + "language_loss": 0.81288958, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83430183, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.612528085708618 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.03687358, + "balance_loss_mlp": 1.02139854, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 1.6065392122881408, + "language_loss": 0.63137782, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65270293, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.554144859313965 + }, + { + "auxiliary_loss_clip": 0.0107414, + "auxiliary_loss_mlp": 0.00749844, + "balance_loss_clip": 1.03439069, + "balance_loss_mlp": 1.00055432, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.6558124521358046, + "language_loss": 0.86011243, + "learning_rate": 2.510894862898928e-06, + "loss": 0.87835228, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 4.078108072280884 + }, + { + "auxiliary_loss_clip": 0.01089986, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.03663039, + "balance_loss_mlp": 1.01814103, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.7602586053109421, + "language_loss": 0.72497553, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74619031, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.5724143981933594 + }, + { + "auxiliary_loss_clip": 0.01082227, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.03970504, + "balance_loss_mlp": 1.01986933, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.8654274007896747, + "language_loss": 0.81845194, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83961105, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.7689802646636963 + }, + { + "auxiliary_loss_clip": 0.01083518, + "auxiliary_loss_mlp": 0.00749954, + "balance_loss_clip": 1.03864813, + "balance_loss_mlp": 1.00050843, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 2.895034573679526, + "language_loss": 0.79155862, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.80989337, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.624873161315918 + }, + { + "auxiliary_loss_clip": 0.01082718, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.03325713, + "balance_loss_mlp": 1.01994455, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 4.411335203138791, + "language_loss": 0.68710005, + "learning_rate": 2.509388546104138e-06, + "loss": 0.7082727, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.6190686225891113 + }, + { + "auxiliary_loss_clip": 0.01046895, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.03454065, + "balance_loss_mlp": 1.02127194, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6186614321282915, + "language_loss": 0.81424749, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83505785, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.696471691131592 + }, + { + "auxiliary_loss_clip": 0.01052797, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.03573775, + "balance_loss_mlp": 1.01816082, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.650663091555799, + "language_loss": 0.73439074, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75522888, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.718777894973755 + }, + { + "auxiliary_loss_clip": 0.01054873, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.03656602, + "balance_loss_mlp": 1.02314401, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.5514432215013507, + "language_loss": 0.76754624, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78845638, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.7236571311950684 + }, + { + "auxiliary_loss_clip": 0.01096566, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.03530359, + "balance_loss_mlp": 1.02545691, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 2.1536401387240276, + "language_loss": 0.85789037, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87924445, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.6352028846740723 + }, + { + "auxiliary_loss_clip": 0.01112648, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.0393858, + "balance_loss_mlp": 1.02615702, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.7378810263135651, + "language_loss": 0.72578079, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74729341, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 2.5838959217071533 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.03787982, + "balance_loss_mlp": 1.02002573, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.7101685297716882, + "language_loss": 0.8711549, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89248681, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.567996025085449 + }, + { + "auxiliary_loss_clip": 0.01093871, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.03826737, + "balance_loss_mlp": 1.02685106, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.9085679844691936, + "language_loss": 0.81630534, + "learning_rate": 2.506751748594683e-06, + "loss": 0.83763736, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.5968101024627686 + }, + { + "auxiliary_loss_clip": 0.01104281, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.04100823, + "balance_loss_mlp": 1.02296853, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.9987770736805301, + "language_loss": 0.84789538, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86930013, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.609450101852417 + }, + { + "auxiliary_loss_clip": 0.01089265, + "auxiliary_loss_mlp": 0.01040904, + "balance_loss_clip": 1.03514218, + "balance_loss_mlp": 1.02709317, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.8977685449390176, + "language_loss": 0.69222295, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71352464, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.5756025314331055 + }, + { + "auxiliary_loss_clip": 0.01081315, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03605199, + "balance_loss_mlp": 1.02129149, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.6945745317217107, + "language_loss": 0.83664304, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85781527, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.6084372997283936 + }, + { + "auxiliary_loss_clip": 0.01099597, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.03835046, + "balance_loss_mlp": 1.02491021, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.5231446116514855, + "language_loss": 0.70339584, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72477412, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.567626714706421 + }, + { + "auxiliary_loss_clip": 0.01086263, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.03655708, + "balance_loss_mlp": 1.02183449, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 1.783031625791951, + "language_loss": 0.81116915, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83237994, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.5979714393615723 + }, + { + "auxiliary_loss_clip": 0.0111179, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.03810811, + "balance_loss_mlp": 1.02344608, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.881313498574892, + "language_loss": 0.77895409, + "learning_rate": 2.504490886831089e-06, + "loss": 0.80043662, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 2.5198912620544434 + }, + { + "auxiliary_loss_clip": 0.01111159, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.03982794, + "balance_loss_mlp": 1.02241874, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.5764816852583152, + "language_loss": 0.76054281, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78200507, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 2.506956100463867 + }, + { + "auxiliary_loss_clip": 0.01100176, + "auxiliary_loss_mlp": 0.01036884, + "balance_loss_clip": 1.03670061, + "balance_loss_mlp": 1.02300811, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.7785566052171418, + "language_loss": 0.73036635, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75173694, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 2.569981575012207 + }, + { + "auxiliary_loss_clip": 0.01089001, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.03690732, + "balance_loss_mlp": 1.02121735, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 1.9638090087702083, + "language_loss": 0.76689506, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78812611, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 2.629204750061035 + }, + { + "auxiliary_loss_clip": 0.01011024, + "auxiliary_loss_mlp": 0.01002533, + "balance_loss_clip": 1.01330757, + "balance_loss_mlp": 1.00101328, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.742552228692538, + "language_loss": 0.57044363, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59057915, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.117873430252075 + }, + { + "auxiliary_loss_clip": 0.01088032, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.02928829, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 1.705860006641882, + "language_loss": 0.71013713, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.73145044, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 2.697279453277588 + }, + { + "auxiliary_loss_clip": 0.01059396, + "auxiliary_loss_mlp": 0.01042136, + "balance_loss_clip": 1.03307366, + "balance_loss_mlp": 1.02657914, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 2.356084004466775, + "language_loss": 0.69209981, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.7131151, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.639239549636841 + }, + { + "auxiliary_loss_clip": 0.01044639, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.01759982, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.6247359772674443, + "language_loss": 0.79902798, + "learning_rate": 2.501852344559726e-06, + "loss": 0.81976193, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 2.703165292739868 + }, + { + "auxiliary_loss_clip": 0.01071526, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_clip": 1.03671277, + "balance_loss_mlp": 1.03062844, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 2.0218026462079695, + "language_loss": 0.75312424, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77427995, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 4.229674339294434 + }, + { + "auxiliary_loss_clip": 0.01054533, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.03438663, + "balance_loss_mlp": 1.02180576, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 2.3144450282703097, + "language_loss": 0.6199376, + "learning_rate": 2.501098303852298e-06, + "loss": 0.64083719, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 2.8192827701568604 + }, + { + "auxiliary_loss_clip": 0.01086092, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.03588045, + "balance_loss_mlp": 1.01736164, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.274722079270998, + "language_loss": 0.72762513, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.74878907, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 2.603358030319214 + }, + { + "auxiliary_loss_clip": 0.01087186, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.03817463, + "balance_loss_mlp": 1.02092123, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.481188693868121, + "language_loss": 0.81956941, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.84078062, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 2.608814239501953 + }, + { + "auxiliary_loss_clip": 0.01107048, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.03625751, + "balance_loss_mlp": 1.02016783, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 46.86783392307645, + "language_loss": 0.74280083, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76419359, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 2.5303919315338135 + }, + { + "auxiliary_loss_clip": 0.01113971, + "auxiliary_loss_mlp": 0.01035307, + "balance_loss_clip": 1.03819203, + "balance_loss_mlp": 1.02131212, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 4.114171882491203, + "language_loss": 0.79832625, + "learning_rate": 2.499589994531454e-06, + "loss": 0.81981897, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 2.495576858520508 + }, + { + "auxiliary_loss_clip": 0.01092644, + "auxiliary_loss_mlp": 0.01037465, + "balance_loss_clip": 1.04031122, + "balance_loss_mlp": 1.02441764, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 1.812741334459428, + "language_loss": 0.74932218, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77062321, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 2.6106722354888916 + }, + { + "auxiliary_loss_clip": 0.01047892, + "auxiliary_loss_mlp": 0.01040931, + "balance_loss_clip": 1.03329444, + "balance_loss_mlp": 1.02593446, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 2.0056513535489082, + "language_loss": 0.7956481, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81653631, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.823648691177368 + }, + { + "auxiliary_loss_clip": 0.01023184, + "auxiliary_loss_mlp": 0.01007122, + "balance_loss_clip": 1.01041865, + "balance_loss_mlp": 1.00562561, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6986292496052652, + "language_loss": 0.54928124, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56958431, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.2139835357666016 + }, + { + "auxiliary_loss_clip": 0.01112699, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.03844416, + "balance_loss_mlp": 1.02426004, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.692060548093215, + "language_loss": 0.69808692, + "learning_rate": 2.498081382098581e-06, + "loss": 0.71959317, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 4.086917400360107 + }, + { + "auxiliary_loss_clip": 0.01085703, + "auxiliary_loss_mlp": 0.01051866, + "balance_loss_clip": 1.03663325, + "balance_loss_mlp": 1.03700054, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.9055331210196225, + "language_loss": 0.75044662, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77182233, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.738328218460083 + }, + { + "auxiliary_loss_clip": 0.01094538, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.03637755, + "balance_loss_mlp": 1.01732945, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.951269310392309, + "language_loss": 0.8036781, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82490814, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 2.592803955078125 + }, + { + "auxiliary_loss_clip": 0.01089154, + "auxiliary_loss_mlp": 0.010347, + "balance_loss_clip": 1.04019928, + "balance_loss_mlp": 1.02231455, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.049585591627213, + "language_loss": 0.79948723, + "learning_rate": 2.496949724407266e-06, + "loss": 0.8207258, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 4.235965967178345 + }, + { + "auxiliary_loss_clip": 0.01095868, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.04156661, + "balance_loss_mlp": 1.01834869, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.7526068574763727, + "language_loss": 0.72569203, + "learning_rate": 2.496572467468988e-06, + "loss": 0.74697077, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.639389753341675 + }, + { + "auxiliary_loss_clip": 0.01086894, + "auxiliary_loss_mlp": 0.00749553, + "balance_loss_clip": 1.03612983, + "balance_loss_mlp": 1.00043178, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.815719654456028, + "language_loss": 0.72546852, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74383295, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 2.6681082248687744 + }, + { + "auxiliary_loss_clip": 0.01073189, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.03682268, + "balance_loss_mlp": 1.0231849, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.6440185257105748, + "language_loss": 0.66207898, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68316376, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 4.071149110794067 + }, + { + "auxiliary_loss_clip": 0.01115715, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.03957582, + "balance_loss_mlp": 1.02082992, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.6896548174243147, + "language_loss": 0.81550777, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.83700478, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.5326340198516846 + }, + { + "auxiliary_loss_clip": 0.0108526, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.035743, + "balance_loss_mlp": 1.02198243, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.8642364884237048, + "language_loss": 0.77073127, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.79192734, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.5724399089813232 + }, + { + "auxiliary_loss_clip": 0.01082957, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.03422141, + "balance_loss_mlp": 1.02189898, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.7474651052087473, + "language_loss": 0.7548911, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77606344, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.5888054370880127 + }, + { + "auxiliary_loss_clip": 0.01067566, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.03383923, + "balance_loss_mlp": 1.02349854, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.670155874569771, + "language_loss": 0.85026741, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87130737, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.637902021408081 + }, + { + "auxiliary_loss_clip": 0.010881, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.03665709, + "balance_loss_mlp": 1.021698, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.8154540641181067, + "language_loss": 0.80478489, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82601738, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.6239633560180664 + }, + { + "auxiliary_loss_clip": 0.01100405, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.03741193, + "balance_loss_mlp": 1.02517939, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.7317132860574296, + "language_loss": 0.80152023, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82289827, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.565767526626587 + }, + { + "auxiliary_loss_clip": 0.01099035, + "auxiliary_loss_mlp": 0.0103094, + "balance_loss_clip": 1.0373807, + "balance_loss_mlp": 1.01812494, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 2.3335169559685736, + "language_loss": 0.7528621, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77416182, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.6177139282226562 + }, + { + "auxiliary_loss_clip": 0.01074237, + "auxiliary_loss_mlp": 0.01028482, + "balance_loss_clip": 1.033777, + "balance_loss_mlp": 1.01526213, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.5292247889946105, + "language_loss": 0.73898304, + "learning_rate": 2.492798864792712e-06, + "loss": 0.76001018, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 2.657742738723755 + }, + { + "auxiliary_loss_clip": 0.01089841, + "auxiliary_loss_mlp": 0.01040826, + "balance_loss_clip": 1.0368427, + "balance_loss_mlp": 1.02735531, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 2.967115310516266, + "language_loss": 0.82271743, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84402406, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.6041901111602783 + }, + { + "auxiliary_loss_clip": 0.01077241, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.03431761, + "balance_loss_mlp": 1.01998997, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.3598230219941363, + "language_loss": 0.84224695, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86335266, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.6668028831481934 + }, + { + "auxiliary_loss_clip": 0.01073104, + "auxiliary_loss_mlp": 0.01046365, + "balance_loss_clip": 1.03168106, + "balance_loss_mlp": 1.03130841, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.652611249498577, + "language_loss": 0.78393793, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80513263, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.608016014099121 + }, + { + "auxiliary_loss_clip": 0.01110023, + "auxiliary_loss_mlp": 0.01040042, + "balance_loss_clip": 1.03759122, + "balance_loss_mlp": 1.0274595, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 1.7840873199475682, + "language_loss": 0.77663696, + "learning_rate": 2.491288899685288e-06, + "loss": 0.79813755, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.539029359817505 + }, + { + "auxiliary_loss_clip": 0.01071075, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.03285313, + "balance_loss_mlp": 1.01723528, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.6628932672033494, + "language_loss": 0.64867836, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.66969442, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.6972358226776123 + }, + { + "auxiliary_loss_clip": 0.0109219, + "auxiliary_loss_mlp": 0.01036025, + "balance_loss_clip": 1.03402472, + "balance_loss_mlp": 1.02177954, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.7071640549381688, + "language_loss": 0.7453016, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76658368, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.5758087635040283 + }, + { + "auxiliary_loss_clip": 0.01077449, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.03794789, + "balance_loss_mlp": 1.02229929, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.3144183242151, + "language_loss": 0.78688776, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80801803, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 2.7111027240753174 + }, + { + "auxiliary_loss_clip": 0.01068348, + "auxiliary_loss_mlp": 0.0104161, + "balance_loss_clip": 1.03412735, + "balance_loss_mlp": 1.02817535, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.8316902960710704, + "language_loss": 0.73020911, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75130868, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 2.663105010986328 + }, + { + "auxiliary_loss_clip": 0.01070569, + "auxiliary_loss_mlp": 0.01044449, + "balance_loss_clip": 1.03583193, + "balance_loss_mlp": 1.0288806, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 1.9036745417610863, + "language_loss": 0.75152689, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77267706, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 2.5951833724975586 + }, + { + "auxiliary_loss_clip": 0.01100378, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.037781, + "balance_loss_mlp": 1.02001476, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.5942934968277669, + "language_loss": 0.69196141, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71329594, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 2.622520685195923 + }, + { + "auxiliary_loss_clip": 0.0108885, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_clip": 1.03440523, + "balance_loss_mlp": 1.01895785, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.3536201013839977, + "language_loss": 0.70731461, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72851837, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.7204551696777344 + }, + { + "auxiliary_loss_clip": 0.01098231, + "auxiliary_loss_mlp": 0.01029426, + "balance_loss_clip": 1.03727233, + "balance_loss_mlp": 1.01663482, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.6254028814898205, + "language_loss": 0.72579962, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74707621, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.7298636436462402 + }, + { + "auxiliary_loss_clip": 0.01077311, + "auxiliary_loss_mlp": 0.00749963, + "balance_loss_clip": 1.03496444, + "balance_loss_mlp": 1.00042462, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.9146501293453437, + "language_loss": 0.76898491, + "learning_rate": 2.487890389750719e-06, + "loss": 0.78725767, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 2.796560049057007 + }, + { + "auxiliary_loss_clip": 0.01087473, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.0355798, + "balance_loss_mlp": 1.01883924, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.6760906715022603, + "language_loss": 0.71023726, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.73142612, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 2.688729763031006 + }, + { + "auxiliary_loss_clip": 0.0106282, + "auxiliary_loss_mlp": 0.01041652, + "balance_loss_clip": 1.03567994, + "balance_loss_mlp": 1.0258925, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.93490254192826, + "language_loss": 0.70873725, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72978199, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 2.719078540802002 + }, + { + "auxiliary_loss_clip": 0.01089798, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.03856659, + "balance_loss_mlp": 1.02193737, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.853371004625556, + "language_loss": 0.82321703, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84445751, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 4.34220552444458 + }, + { + "auxiliary_loss_clip": 0.01096078, + "auxiliary_loss_mlp": 0.01046623, + "balance_loss_clip": 1.03712463, + "balance_loss_mlp": 1.03199625, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 4.684512023769436, + "language_loss": 0.68615758, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.7075845, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 2.776022434234619 + }, + { + "auxiliary_loss_clip": 0.01089263, + "auxiliary_loss_mlp": 0.00749487, + "balance_loss_clip": 1.03713167, + "balance_loss_mlp": 1.00043917, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.594978889137911, + "language_loss": 0.7780019, + "learning_rate": 2.486001680477873e-06, + "loss": 0.79638946, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 2.7031805515289307 + }, + { + "auxiliary_loss_clip": 0.01088347, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.03802693, + "balance_loss_mlp": 1.01809025, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.766134362363848, + "language_loss": 0.69078648, + "learning_rate": 2.485623883278308e-06, + "loss": 0.71198201, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 2.6638522148132324 + }, + { + "auxiliary_loss_clip": 0.01069623, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.03517008, + "balance_loss_mlp": 1.01666677, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.534638728557512, + "language_loss": 0.62778091, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.6487788, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.6471614837646484 + }, + { + "auxiliary_loss_clip": 0.01114181, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.03887725, + "balance_loss_mlp": 1.01761639, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 1.973549166015403, + "language_loss": 0.7193318, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74077964, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 2.5355496406555176 + }, + { + "auxiliary_loss_clip": 0.01093327, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.03639257, + "balance_loss_mlp": 1.01904941, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.7556861984523986, + "language_loss": 0.76905835, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79031539, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 2.6157073974609375 + }, + { + "auxiliary_loss_clip": 0.01096825, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.03722298, + "balance_loss_mlp": 1.01478815, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.8584619708454377, + "language_loss": 0.7113561, + "learning_rate": 2.484112510474251e-06, + "loss": 0.73259109, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.5549490451812744 + }, + { + "auxiliary_loss_clip": 0.01079, + "auxiliary_loss_mlp": 0.00749611, + "balance_loss_clip": 1.03430176, + "balance_loss_mlp": 1.00051129, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 3.8123630139212104, + "language_loss": 0.76061916, + "learning_rate": 2.483734621343429e-06, + "loss": 0.77890527, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.595566987991333 + }, + { + "auxiliary_loss_clip": 0.01100284, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.03692651, + "balance_loss_mlp": 1.01951814, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 1.960821154789137, + "language_loss": 0.8135547, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83487463, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 2.5201754570007324 + }, + { + "auxiliary_loss_clip": 0.01078366, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.03573048, + "balance_loss_mlp": 1.02180409, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.2433748682810455, + "language_loss": 0.84980983, + "learning_rate": 2.482978788066318e-06, + "loss": 0.8709408, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.6325771808624268 + }, + { + "auxiliary_loss_clip": 0.01085802, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.03411293, + "balance_loss_mlp": 1.01635647, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 1.9501499259712687, + "language_loss": 0.6750952, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69624436, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 4.226130247116089 + }, + { + "auxiliary_loss_clip": 0.01094334, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.03872311, + "balance_loss_mlp": 1.01856375, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 2.7895429333016293, + "language_loss": 0.76345229, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.78471231, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 4.190731048583984 + }, + { + "auxiliary_loss_clip": 0.01087856, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.03643751, + "balance_loss_mlp": 1.01617098, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.194439402552814, + "language_loss": 0.74539077, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76656085, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.599090576171875 + }, + { + "auxiliary_loss_clip": 0.01078372, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.04047728, + "balance_loss_mlp": 1.02304018, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 4.341709633950047, + "language_loss": 0.64670742, + "learning_rate": 2.481466901851506e-06, + "loss": 0.66784775, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 2.645918846130371 + }, + { + "auxiliary_loss_clip": 0.01085228, + "auxiliary_loss_mlp": 0.01035347, + "balance_loss_clip": 1.03710234, + "balance_loss_mlp": 1.02222848, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.7764992189128834, + "language_loss": 0.79825473, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.81946045, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 2.58337664604187 + }, + { + "auxiliary_loss_clip": 0.01061262, + "auxiliary_loss_mlp": 0.01043885, + "balance_loss_clip": 1.03181088, + "balance_loss_mlp": 1.02943683, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.6635253515149464, + "language_loss": 0.79467624, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81572771, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 4.166500568389893 + }, + { + "auxiliary_loss_clip": 0.01093713, + "auxiliary_loss_mlp": 0.01039998, + "balance_loss_clip": 1.03590322, + "balance_loss_mlp": 1.02602077, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.8100576573502993, + "language_loss": 0.79863513, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.81997228, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.578448534011841 + }, + { + "auxiliary_loss_clip": 0.01073199, + "auxiliary_loss_mlp": 0.01037358, + "balance_loss_clip": 1.03576791, + "balance_loss_mlp": 1.02534175, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.8521320102671368, + "language_loss": 0.69649512, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.7176007, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 2.6523215770721436 + }, + { + "auxiliary_loss_clip": 0.0099075, + "auxiliary_loss_mlp": 0.01001646, + "balance_loss_clip": 1.00852847, + "balance_loss_mlp": 0.99979872, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8786574757208613, + "language_loss": 0.56903237, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.5889563, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.2530038356781006 + }, + { + "auxiliary_loss_clip": 0.01053306, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.03074265, + "balance_loss_mlp": 1.02947128, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.6936747776305043, + "language_loss": 0.76193208, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78289038, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.660614013671875 + }, + { + "auxiliary_loss_clip": 0.01103594, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.03812909, + "balance_loss_mlp": 1.02224731, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.8546374477834071, + "language_loss": 0.80681366, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82820499, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.6112029552459717 + }, + { + "auxiliary_loss_clip": 0.01004377, + "auxiliary_loss_mlp": 0.01003491, + "balance_loss_clip": 1.01045489, + "balance_loss_mlp": 1.00188148, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.6697843902660422, + "language_loss": 0.54587549, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56595421, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 3.192188262939453 + }, + { + "auxiliary_loss_clip": 0.01110953, + "auxiliary_loss_mlp": 0.01026258, + "balance_loss_clip": 1.04044771, + "balance_loss_mlp": 1.01496267, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.422869774846538, + "language_loss": 0.69656813, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71794015, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.547046661376953 + }, + { + "auxiliary_loss_clip": 0.01067882, + "auxiliary_loss_mlp": 0.01026082, + "balance_loss_clip": 1.03385973, + "balance_loss_mlp": 1.01360667, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.5423545134952896, + "language_loss": 0.76416886, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78510845, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.654346227645874 + }, + { + "auxiliary_loss_clip": 0.01081889, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.03426921, + "balance_loss_mlp": 1.01741791, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 2.8978611251211652, + "language_loss": 0.8402186, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86134219, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.773176431655884 + }, + { + "auxiliary_loss_clip": 0.01087976, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.03730178, + "balance_loss_mlp": 1.01608121, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.1975328423175284, + "language_loss": 0.77077734, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79194766, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.6518800258636475 + }, + { + "auxiliary_loss_clip": 0.01092658, + "auxiliary_loss_mlp": 0.01030668, + "balance_loss_clip": 1.03424716, + "balance_loss_mlp": 1.01793671, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 3.486371035590166, + "language_loss": 0.73326993, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75450319, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.6562581062316895 + }, + { + "auxiliary_loss_clip": 0.01087951, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.03680778, + "balance_loss_mlp": 1.02153158, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.9129245035279365, + "language_loss": 0.74991304, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77112734, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.5969510078430176 + }, + { + "auxiliary_loss_clip": 0.01057033, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.0338099, + "balance_loss_mlp": 1.02163172, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.4154751202652913, + "language_loss": 0.76069164, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78160614, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.697350025177002 + }, + { + "auxiliary_loss_clip": 0.01085899, + "auxiliary_loss_mlp": 0.01037333, + "balance_loss_clip": 1.03739595, + "balance_loss_mlp": 1.02553082, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 2.1197006860619014, + "language_loss": 0.73207796, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75331026, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 2.54931902885437 + }, + { + "auxiliary_loss_clip": 0.01075628, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.03891599, + "balance_loss_mlp": 1.02078342, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 3.167926400128596, + "language_loss": 0.79784858, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81893557, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 2.6418144702911377 + }, + { + "auxiliary_loss_clip": 0.01082463, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.0355047, + "balance_loss_mlp": 1.02276838, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.9821350080942834, + "language_loss": 0.75424719, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77545613, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 2.649338722229004 + }, + { + "auxiliary_loss_clip": 0.0108531, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.03723824, + "balance_loss_mlp": 1.02075922, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 2.0508042525587005, + "language_loss": 0.72270334, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74389064, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.5818769931793213 + }, + { + "auxiliary_loss_clip": 0.01102128, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.03669071, + "balance_loss_mlp": 1.02499175, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.516230263168616, + "language_loss": 0.62659907, + "learning_rate": 2.473903107384165e-06, + "loss": 0.64800376, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.623134136199951 + }, + { + "auxiliary_loss_clip": 0.01014099, + "auxiliary_loss_mlp": 0.00746443, + "balance_loss_clip": 1.00989699, + "balance_loss_mlp": 0.99964869, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7622838203011074, + "language_loss": 0.52673984, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54434526, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 3.204042911529541 + }, + { + "auxiliary_loss_clip": 0.01088273, + "auxiliary_loss_mlp": 0.01044669, + "balance_loss_clip": 1.03421509, + "balance_loss_mlp": 1.02976799, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 2.0392528044871683, + "language_loss": 0.70727813, + "learning_rate": 2.473146330693997e-06, + "loss": 0.72860759, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 2.591604709625244 + }, + { + "auxiliary_loss_clip": 0.01036358, + "auxiliary_loss_mlp": 0.01045125, + "balance_loss_clip": 1.03166413, + "balance_loss_mlp": 1.03164864, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.6191151988567547, + "language_loss": 0.69935155, + "learning_rate": 2.472767915429105e-06, + "loss": 0.72016633, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 2.7556605339050293 + }, + { + "auxiliary_loss_clip": 0.01014194, + "auxiliary_loss_mlp": 0.01000079, + "balance_loss_clip": 1.01081717, + "balance_loss_mlp": 0.99869615, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8952536310565569, + "language_loss": 0.63958526, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.65972793, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 2.9855213165283203 + }, + { + "auxiliary_loss_clip": 0.01074198, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.03426015, + "balance_loss_mlp": 1.02333343, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.083734850901821, + "language_loss": 0.73530269, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75640935, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 2.7518277168273926 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01036822, + "balance_loss_clip": 1.03531647, + "balance_loss_mlp": 1.02311301, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 2.5310739760726575, + "language_loss": 0.79294002, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81438518, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 4.062046051025391 + }, + { + "auxiliary_loss_clip": 0.01072448, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.020576, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.695101560190375, + "language_loss": 0.76377183, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78482682, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 2.671910285949707 + }, + { + "auxiliary_loss_clip": 0.01010815, + "auxiliary_loss_mlp": 0.01001401, + "balance_loss_clip": 1.0080409, + "balance_loss_mlp": 0.99988699, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.795349265673374, + "language_loss": 0.63854748, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65866965, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 2.897848606109619 + }, + { + "auxiliary_loss_clip": 0.01113091, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.03847003, + "balance_loss_mlp": 1.02099514, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 1.751216054856658, + "language_loss": 0.85945976, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88093644, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.6000568866729736 + }, + { + "auxiliary_loss_clip": 0.0110002, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.03645456, + "balance_loss_mlp": 1.02484703, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 3.1385875995707244, + "language_loss": 0.80677426, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82816637, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 2.5786325931549072 + }, + { + "auxiliary_loss_clip": 0.01088764, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.03948104, + "balance_loss_mlp": 1.02172518, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 3.0909613505481373, + "language_loss": 0.82861006, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.84985018, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.6506917476654053 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.03919888, + "balance_loss_mlp": 1.02178812, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 2.0291720486508322, + "language_loss": 0.70350039, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72488809, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 2.5942318439483643 + }, + { + "auxiliary_loss_clip": 0.0107905, + "auxiliary_loss_mlp": 0.01037589, + "balance_loss_clip": 1.03504431, + "balance_loss_mlp": 1.02369475, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.9787015630173126, + "language_loss": 0.74325216, + "learning_rate": 2.468982779140819e-06, + "loss": 0.7644186, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.637941360473633 + }, + { + "auxiliary_loss_clip": 0.01112773, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.03893113, + "balance_loss_mlp": 1.02420855, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 2.0203537642493115, + "language_loss": 0.80399609, + "learning_rate": 2.468604167463827e-06, + "loss": 0.8254956, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.4803576469421387 + }, + { + "auxiliary_loss_clip": 0.01055361, + "auxiliary_loss_mlp": 0.00749316, + "balance_loss_clip": 1.02994466, + "balance_loss_mlp": 1.00032806, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.74108835672766, + "language_loss": 0.73596799, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75401485, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 2.6768147945404053 + }, + { + "auxiliary_loss_clip": 0.0108689, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.03855324, + "balance_loss_mlp": 1.01839066, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.782147129297691, + "language_loss": 0.87300533, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89419174, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.582839250564575 + }, + { + "auxiliary_loss_clip": 0.01112795, + "auxiliary_loss_mlp": 0.01038156, + "balance_loss_clip": 1.03944027, + "balance_loss_mlp": 1.02632451, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 1.839195749577277, + "language_loss": 0.75678611, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.77829564, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 5.5285539627075195 + }, + { + "auxiliary_loss_clip": 0.01070831, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.03517699, + "balance_loss_mlp": 1.02337098, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.7758529485213692, + "language_loss": 0.64700794, + "learning_rate": 2.467089543204268e-06, + "loss": 0.66806418, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 2.860074520111084 + }, + { + "auxiliary_loss_clip": 0.01114013, + "auxiliary_loss_mlp": 0.01035055, + "balance_loss_clip": 1.0379709, + "balance_loss_mlp": 1.02126861, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.9902265047106082, + "language_loss": 0.78307354, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80456424, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 2.535404920578003 + }, + { + "auxiliary_loss_clip": 0.01089262, + "auxiliary_loss_mlp": 0.00749704, + "balance_loss_clip": 1.03661454, + "balance_loss_mlp": 1.00050259, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5346791310020362, + "language_loss": 0.77156812, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.78995776, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 2.6083133220672607 + }, + { + "auxiliary_loss_clip": 0.01082381, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.03500354, + "balance_loss_mlp": 1.02454019, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.6112715301342426, + "language_loss": 0.73270839, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75391209, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.6716856956481934 + }, + { + "auxiliary_loss_clip": 0.01090501, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.03945744, + "balance_loss_mlp": 1.02105629, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.7983729409479554, + "language_loss": 0.75460666, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77584612, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.64768123626709 + }, + { + "auxiliary_loss_clip": 0.01086044, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.03648543, + "balance_loss_mlp": 1.01949489, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.851935180562039, + "language_loss": 0.70142335, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72261071, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 4.156805992126465 + }, + { + "auxiliary_loss_clip": 0.01088689, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.03905249, + "balance_loss_mlp": 1.0204705, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.280699062960188, + "language_loss": 0.69450617, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71572924, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.5683953762054443 + }, + { + "auxiliary_loss_clip": 0.01083179, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.03373885, + "balance_loss_mlp": 1.02158284, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 2.442982726758179, + "language_loss": 0.82346565, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84465277, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.628164768218994 + }, + { + "auxiliary_loss_clip": 0.01082853, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.03777647, + "balance_loss_mlp": 1.02523065, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 2.615252447269854, + "language_loss": 0.74738848, + "learning_rate": 2.464059445424366e-06, + "loss": 0.7686103, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 2.598710060119629 + }, + { + "auxiliary_loss_clip": 0.00990192, + "auxiliary_loss_mlp": 0.01007207, + "balance_loss_clip": 1.00795352, + "balance_loss_mlp": 1.00584817, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6771437980919255, + "language_loss": 0.55706418, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57703817, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 3.3399055004119873 + }, + { + "auxiliary_loss_clip": 0.01086942, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03534579, + "balance_loss_mlp": 1.0220437, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 2.5714930082879137, + "language_loss": 0.74513704, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76635063, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.670245885848999 + }, + { + "auxiliary_loss_clip": 0.01077713, + "auxiliary_loss_mlp": 0.01040189, + "balance_loss_clip": 1.03329182, + "balance_loss_mlp": 1.02630687, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 2.0548552974856156, + "language_loss": 0.74474752, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.7659266, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.574218988418579 + }, + { + "auxiliary_loss_clip": 0.01089029, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.03635824, + "balance_loss_mlp": 1.0195539, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 1.8143740513289592, + "language_loss": 0.73196507, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75318551, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.57395076751709 + }, + { + "auxiliary_loss_clip": 0.01111474, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.03855491, + "balance_loss_mlp": 1.0226078, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.3798702444446498, + "language_loss": 0.73513663, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.75660479, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.638887405395508 + }, + { + "auxiliary_loss_clip": 0.01079918, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.0350188, + "balance_loss_mlp": 1.02048182, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.671489239186387, + "language_loss": 0.80064601, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82177329, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 2.558112621307373 + }, + { + "auxiliary_loss_clip": 0.01070749, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.03394556, + "balance_loss_mlp": 1.01784313, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.9874119992484285, + "language_loss": 0.72097445, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74198377, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 2.7181060314178467 + }, + { + "auxiliary_loss_clip": 0.01110009, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.03645158, + "balance_loss_mlp": 1.02001524, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.902959173059746, + "language_loss": 0.70438194, + "learning_rate": 2.461028221425126e-06, + "loss": 0.7258119, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 2.5778915882110596 + }, + { + "auxiliary_loss_clip": 0.01097276, + "auxiliary_loss_mlp": 0.01028201, + "balance_loss_clip": 1.03637922, + "balance_loss_mlp": 1.01703131, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.0939535871903145, + "language_loss": 0.67874521, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70000005, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 2.567370653152466 + }, + { + "auxiliary_loss_clip": 0.01078459, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.0352819, + "balance_loss_mlp": 1.02143455, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 2.9380793109238668, + "language_loss": 0.83563441, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.8567766, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 2.623469829559326 + }, + { + "auxiliary_loss_clip": 0.01021479, + "auxiliary_loss_mlp": 0.01000255, + "balance_loss_clip": 1.00777507, + "balance_loss_mlp": 0.99883044, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.7607996809290236, + "language_loss": 0.55226183, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57247913, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 3.1993894577026367 + }, + { + "auxiliary_loss_clip": 0.01061491, + "auxiliary_loss_mlp": 0.01045006, + "balance_loss_clip": 1.03565025, + "balance_loss_mlp": 1.03129053, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 6.434636851251297, + "language_loss": 0.82464647, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.84571141, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 2.6066722869873047 + }, + { + "auxiliary_loss_clip": 0.01110443, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.03717959, + "balance_loss_mlp": 1.0151242, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 1.8943459741972852, + "language_loss": 0.84066355, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86204696, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.590543031692505 + }, + { + "auxiliary_loss_clip": 0.01087026, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.03619242, + "balance_loss_mlp": 1.02152765, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 2.442000578421798, + "language_loss": 0.77080524, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79201341, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 2.560314178466797 + }, + { + "auxiliary_loss_clip": 0.01094371, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.03767991, + "balance_loss_mlp": 1.01647496, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 2.6218119371308366, + "language_loss": 0.75852233, + "learning_rate": 2.458374982357057e-06, + "loss": 0.77975452, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 2.574453353881836 + }, + { + "auxiliary_loss_clip": 0.01082066, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.03494, + "balance_loss_mlp": 1.03360629, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 1.9672398315218742, + "language_loss": 0.6918512, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71315032, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 2.553079605102539 + }, + { + "auxiliary_loss_clip": 0.01039282, + "auxiliary_loss_mlp": 0.01042536, + "balance_loss_clip": 1.02967811, + "balance_loss_mlp": 1.0273782, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 6.748485217642967, + "language_loss": 0.72840303, + "learning_rate": 2.457616757401656e-06, + "loss": 0.74922121, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 2.7388904094696045 + }, + { + "auxiliary_loss_clip": 0.01089356, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.03747475, + "balance_loss_mlp": 1.02007508, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.4824607916009995, + "language_loss": 0.64849734, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66971254, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.640220880508423 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01038802, + "balance_loss_clip": 1.03961897, + "balance_loss_mlp": 1.02575445, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.7615303216059663, + "language_loss": 0.7991401, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82055283, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 2.5571441650390625 + }, + { + "auxiliary_loss_clip": 0.01105451, + "auxiliary_loss_mlp": 0.01045084, + "balance_loss_clip": 1.04140711, + "balance_loss_mlp": 1.03243566, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 2.8440592300336753, + "language_loss": 0.65188485, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67339027, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 4.157005071640015 + }, + { + "auxiliary_loss_clip": 0.01086309, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.03705716, + "balance_loss_mlp": 1.02568567, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 2.6211148517988083, + "language_loss": 0.75808752, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77934062, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 2.630612850189209 + }, + { + "auxiliary_loss_clip": 0.01114, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.03878736, + "balance_loss_mlp": 1.02302611, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.6801011864442088, + "language_loss": 0.81379932, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83530343, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.584014892578125 + }, + { + "auxiliary_loss_clip": 0.01063246, + "auxiliary_loss_mlp": 0.01038382, + "balance_loss_clip": 1.03408456, + "balance_loss_mlp": 1.024387, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.6493963903517543, + "language_loss": 0.81535047, + "learning_rate": 2.455341666526582e-06, + "loss": 0.83636677, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.6865832805633545 + }, + { + "auxiliary_loss_clip": 0.01071819, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.03545511, + "balance_loss_mlp": 1.02678025, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.8298527454305076, + "language_loss": 0.70236224, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.72349125, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 2.9954946041107178 + }, + { + "auxiliary_loss_clip": 0.01048739, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.03666329, + "balance_loss_mlp": 1.02926278, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 2.018026946523131, + "language_loss": 0.71770132, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73861885, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 2.668531656265259 + }, + { + "auxiliary_loss_clip": 0.0110227, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.03738737, + "balance_loss_mlp": 1.02305508, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.5558462724181896, + "language_loss": 0.69064134, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71203196, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.6542277336120605 + }, + { + "auxiliary_loss_clip": 0.011011, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.03775907, + "balance_loss_mlp": 1.01766038, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 2.055485041292993, + "language_loss": 0.74918902, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77050662, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.7384872436523438 + }, + { + "auxiliary_loss_clip": 0.01092348, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.03636599, + "balance_loss_mlp": 1.02426255, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.138100836189157, + "language_loss": 0.81350851, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83482021, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.570192337036133 + }, + { + "auxiliary_loss_clip": 0.01082207, + "auxiliary_loss_mlp": 0.01040386, + "balance_loss_clip": 1.03833747, + "balance_loss_mlp": 1.02617621, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 2.133702814663394, + "language_loss": 0.74027717, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.7615031, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 2.5850069522857666 + }, + { + "auxiliary_loss_clip": 0.01097588, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.03560829, + "balance_loss_mlp": 1.01764572, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.5614486797173226, + "language_loss": 0.79580671, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81708497, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.6170904636383057 + }, + { + "auxiliary_loss_clip": 0.01102515, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.0366503, + "balance_loss_mlp": 1.02127028, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 2.94446982322254, + "language_loss": 0.8087939, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83016396, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.666630506515503 + }, + { + "auxiliary_loss_clip": 0.01083269, + "auxiliary_loss_mlp": 0.0104245, + "balance_loss_clip": 1.03545547, + "balance_loss_mlp": 1.03011799, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 2.1126284892731237, + "language_loss": 0.79678315, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81804037, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 5.701903581619263 + }, + { + "auxiliary_loss_clip": 0.01083964, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.03505206, + "balance_loss_mlp": 1.02136755, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.8680520349253815, + "language_loss": 0.68546259, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70664203, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 2.6162796020507812 + }, + { + "auxiliary_loss_clip": 0.01097049, + "auxiliary_loss_mlp": 0.00749723, + "balance_loss_clip": 1.03748226, + "balance_loss_mlp": 1.00048196, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 3.662081798782745, + "language_loss": 0.80533153, + "learning_rate": 2.451169054403126e-06, + "loss": 0.82379925, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.511997938156128 + }, + { + "auxiliary_loss_clip": 0.01100551, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.03826118, + "balance_loss_mlp": 1.01877332, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.9202519445001744, + "language_loss": 0.67499489, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69631445, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.636758327484131 + }, + { + "auxiliary_loss_clip": 0.0107483, + "auxiliary_loss_mlp": 0.01039263, + "balance_loss_clip": 1.03462434, + "balance_loss_mlp": 1.0265615, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.8655828153082064, + "language_loss": 0.69837677, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71951771, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.662787675857544 + }, + { + "auxiliary_loss_clip": 0.01074592, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.03361297, + "balance_loss_mlp": 1.02160406, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 2.525868724839345, + "language_loss": 0.72359979, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.74468821, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 4.132734775543213 + }, + { + "auxiliary_loss_clip": 0.01063033, + "auxiliary_loss_mlp": 0.00749452, + "balance_loss_clip": 1.03639627, + "balance_loss_mlp": 1.00038147, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.6360790456970256, + "language_loss": 0.85249603, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87062085, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.672449827194214 + }, + { + "auxiliary_loss_clip": 0.01086386, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.03679919, + "balance_loss_mlp": 1.02387559, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.6313210258140696, + "language_loss": 0.83179796, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85301983, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 2.5951790809631348 + }, + { + "auxiliary_loss_clip": 0.01089318, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.03743243, + "balance_loss_mlp": 1.02072072, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.852238110057419, + "language_loss": 0.77047288, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.79170102, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 2.5552170276641846 + }, + { + "auxiliary_loss_clip": 0.01013429, + "auxiliary_loss_mlp": 0.01003355, + "balance_loss_clip": 1.01340604, + "balance_loss_mlp": 1.0019362, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.9905370878644474, + "language_loss": 0.6004737, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62064153, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.0933566093444824 + }, + { + "auxiliary_loss_clip": 0.01083973, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_clip": 1.03476822, + "balance_loss_mlp": 1.03012264, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 4.959557887129901, + "language_loss": 0.82012975, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84141791, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.5872817039489746 + }, + { + "auxiliary_loss_clip": 0.01086528, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.03555584, + "balance_loss_mlp": 1.01678705, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.5634970757945361, + "language_loss": 0.75221729, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77337503, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.5942158699035645 + }, + { + "auxiliary_loss_clip": 0.01076224, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.03625572, + "balance_loss_mlp": 1.01858497, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.8175055642132072, + "language_loss": 0.6550256, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67609322, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 2.6808156967163086 + }, + { + "auxiliary_loss_clip": 0.01082057, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.03693557, + "balance_loss_mlp": 1.02031815, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.6855057667525524, + "language_loss": 0.67494047, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.6960898, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.5915188789367676 + }, + { + "auxiliary_loss_clip": 0.01110264, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.03589642, + "balance_loss_mlp": 1.02088308, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.6349219203131804, + "language_loss": 0.71968615, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74113405, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.667647361755371 + }, + { + "auxiliary_loss_clip": 0.01089298, + "auxiliary_loss_mlp": 0.01036218, + "balance_loss_clip": 1.03616548, + "balance_loss_mlp": 1.02197886, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 2.135755962303849, + "language_loss": 0.65425706, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67551225, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 2.635002374649048 + }, + { + "auxiliary_loss_clip": 0.01085166, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.03460836, + "balance_loss_mlp": 1.02164364, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.0556771830465124, + "language_loss": 0.73935491, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76055515, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 2.5988423824310303 + }, + { + "auxiliary_loss_clip": 0.01046459, + "auxiliary_loss_mlp": 0.01033281, + "balance_loss_clip": 1.03790283, + "balance_loss_mlp": 1.0203408, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.7256416055417183, + "language_loss": 0.78499395, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.80579138, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 2.670997142791748 + }, + { + "auxiliary_loss_clip": 0.01090208, + "auxiliary_loss_mlp": 0.01031705, + "balance_loss_clip": 1.03453183, + "balance_loss_mlp": 1.01847291, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 2.055734152144617, + "language_loss": 0.7973361, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.81855524, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 2.5965700149536133 + }, + { + "auxiliary_loss_clip": 0.01096651, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.03592336, + "balance_loss_mlp": 1.0178926, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 2.0859806385000272, + "language_loss": 0.76342833, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.78469473, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.569169759750366 + }, + { + "auxiliary_loss_clip": 0.01077598, + "auxiliary_loss_mlp": 0.01036298, + "balance_loss_clip": 1.03406286, + "balance_loss_mlp": 1.02242208, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.5620663608345788, + "language_loss": 0.83545005, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85658896, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 2.6254231929779053 + }, + { + "auxiliary_loss_clip": 0.01106861, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.03478956, + "balance_loss_mlp": 1.02486944, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.733343661021812, + "language_loss": 0.83975846, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86119884, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 2.578963279724121 + }, + { + "auxiliary_loss_clip": 0.01074812, + "auxiliary_loss_mlp": 0.01035361, + "balance_loss_clip": 1.03442824, + "balance_loss_mlp": 1.02244508, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.643484522719592, + "language_loss": 0.8086828, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.82978457, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 2.6780707836151123 + }, + { + "auxiliary_loss_clip": 0.01087161, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.03407264, + "balance_loss_mlp": 1.02760875, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.891094404538099, + "language_loss": 0.80796504, + "learning_rate": 2.443197426237077e-06, + "loss": 0.82924521, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 2.6098217964172363 + }, + { + "auxiliary_loss_clip": 0.01101804, + "auxiliary_loss_mlp": 0.00749558, + "balance_loss_clip": 1.03761315, + "balance_loss_mlp": 1.00027895, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.6545117495829713, + "language_loss": 0.77501893, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79353249, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.6627893447875977 + }, + { + "auxiliary_loss_clip": 0.01073427, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.03363347, + "balance_loss_mlp": 1.02155828, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 2.431851636900031, + "language_loss": 0.72113723, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74220705, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 2.608414649963379 + }, + { + "auxiliary_loss_clip": 0.0108835, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.03646576, + "balance_loss_mlp": 1.01794744, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.5707941307594286, + "language_loss": 0.75000006, + "learning_rate": 2.442058014084156e-06, + "loss": 0.7711941, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.6571524143218994 + }, + { + "auxiliary_loss_clip": 0.01039689, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.03243768, + "balance_loss_mlp": 1.02539575, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.8054816257932356, + "language_loss": 0.7626223, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78340173, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 4.271399259567261 + }, + { + "auxiliary_loss_clip": 0.01109666, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.03752708, + "balance_loss_mlp": 1.02242756, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.422778105446891, + "language_loss": 0.6538862, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67532957, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 2.5202417373657227 + }, + { + "auxiliary_loss_clip": 0.0108245, + "auxiliary_loss_mlp": 0.01031657, + "balance_loss_clip": 1.03598928, + "balance_loss_mlp": 1.02029669, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.651924091371113, + "language_loss": 0.79114419, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81228518, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.5658421516418457 + }, + { + "auxiliary_loss_clip": 0.01094237, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.03680587, + "balance_loss_mlp": 1.02028036, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.6616105485478083, + "language_loss": 0.80224603, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.823506, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.576258659362793 + }, + { + "auxiliary_loss_clip": 0.01097091, + "auxiliary_loss_mlp": 0.0102798, + "balance_loss_clip": 1.03647637, + "balance_loss_mlp": 1.01620829, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 1.706964477467142, + "language_loss": 0.77508962, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79634035, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 2.5458383560180664 + }, + { + "auxiliary_loss_clip": 0.01084135, + "auxiliary_loss_mlp": 0.00749556, + "balance_loss_clip": 1.03619838, + "balance_loss_mlp": 1.00036526, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.8626475699515685, + "language_loss": 0.65205699, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.67039388, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 2.614274024963379 + }, + { + "auxiliary_loss_clip": 0.0109508, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03860426, + "balance_loss_mlp": 1.01943326, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.7917274013009814, + "language_loss": 0.75123972, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77250463, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 2.5443077087402344 + }, + { + "auxiliary_loss_clip": 0.01073756, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.03416955, + "balance_loss_mlp": 1.02097726, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.8589233048386296, + "language_loss": 0.77651894, + "learning_rate": 2.439018845165806e-06, + "loss": 0.7975924, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 2.5772695541381836 + }, + { + "auxiliary_loss_clip": 0.01103327, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.03944945, + "balance_loss_mlp": 1.02267456, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 7.3702715501354366, + "language_loss": 0.90894586, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93033481, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.5611045360565186 + }, + { + "auxiliary_loss_clip": 0.01091531, + "auxiliary_loss_mlp": 0.00749861, + "balance_loss_clip": 1.03852582, + "balance_loss_mlp": 1.00038052, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.621672422623194, + "language_loss": 0.79517138, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81358528, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 2.6161820888519287 + }, + { + "auxiliary_loss_clip": 0.01092402, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03896189, + "balance_loss_mlp": 1.01930094, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 1.8447951995876615, + "language_loss": 0.80289459, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82414281, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.5255637168884277 + }, + { + "auxiliary_loss_clip": 0.01077506, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.0364542, + "balance_loss_mlp": 1.01918173, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 1.7825807355636327, + "language_loss": 0.76350051, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78458512, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.6419548988342285 + }, + { + "auxiliary_loss_clip": 0.01090839, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.03603959, + "balance_loss_mlp": 1.0211103, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.7669613285770267, + "language_loss": 0.77012193, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79134166, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 4.209443807601929 + }, + { + "auxiliary_loss_clip": 0.01101494, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.039433, + "balance_loss_mlp": 1.01793122, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.6833225516624175, + "language_loss": 0.64615649, + "learning_rate": 2.436738768872905e-06, + "loss": 0.6674698, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 4.309649467468262 + }, + { + "auxiliary_loss_clip": 0.01095041, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.04009199, + "balance_loss_mlp": 1.01720417, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.72695950224477, + "language_loss": 0.8366307, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85787833, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.6169941425323486 + }, + { + "auxiliary_loss_clip": 0.0105246, + "auxiliary_loss_mlp": 0.0104375, + "balance_loss_clip": 1.03336549, + "balance_loss_mlp": 1.02796626, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.6102100200804368, + "language_loss": 0.79615557, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81711769, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.6956915855407715 + }, + { + "auxiliary_loss_clip": 0.01064592, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.03638422, + "balance_loss_mlp": 1.02147126, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.685808051448876, + "language_loss": 0.71893632, + "learning_rate": 2.435598506956009e-06, + "loss": 0.73991501, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 2.621821403503418 + }, + { + "auxiliary_loss_clip": 0.0106654, + "auxiliary_loss_mlp": 0.01036511, + "balance_loss_clip": 1.03600836, + "balance_loss_mlp": 1.02367187, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 2.030098193067846, + "language_loss": 0.66902882, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.6900593, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 2.7180256843566895 + }, + { + "auxiliary_loss_clip": 0.01081985, + "auxiliary_loss_mlp": 0.01037469, + "balance_loss_clip": 1.03442836, + "balance_loss_mlp": 1.02330697, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.5456947116457043, + "language_loss": 0.73897028, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.7601648, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 4.209929943084717 + }, + { + "auxiliary_loss_clip": 0.0104933, + "auxiliary_loss_mlp": 0.01044832, + "balance_loss_clip": 1.02881956, + "balance_loss_mlp": 1.03151035, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 2.0099444640925115, + "language_loss": 0.74415851, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76510006, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.7398760318756104 + }, + { + "auxiliary_loss_clip": 0.01080108, + "auxiliary_loss_mlp": 0.01035191, + "balance_loss_clip": 1.03873158, + "balance_loss_mlp": 1.02190495, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.8074011993411463, + "language_loss": 0.7510165, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.77216953, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.6967689990997314 + }, + { + "auxiliary_loss_clip": 0.0111277, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.03702283, + "balance_loss_mlp": 1.02009487, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 2.066261247535442, + "language_loss": 0.74265063, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76410937, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.619455575942993 + }, + { + "auxiliary_loss_clip": 0.01080289, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.03103757, + "balance_loss_mlp": 1.01919961, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 2.6321329480422473, + "language_loss": 0.7780385, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.7991668, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.5920610427856445 + }, + { + "auxiliary_loss_clip": 0.01091885, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.03616953, + "balance_loss_mlp": 1.01937127, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.269778133310171, + "language_loss": 0.8484394, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.86967671, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 2.5343968868255615 + }, + { + "auxiliary_loss_clip": 0.01066773, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.03299463, + "balance_loss_mlp": 1.02773607, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 2.9992800915052995, + "language_loss": 0.63845128, + "learning_rate": 2.432557082778765e-06, + "loss": 0.65954971, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.6474618911743164 + }, + { + "auxiliary_loss_clip": 0.01020334, + "auxiliary_loss_mlp": 0.01002776, + "balance_loss_clip": 1.00743103, + "balance_loss_mlp": 1.00160205, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7368311657434862, + "language_loss": 0.50207323, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52230436, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 2.9584038257598877 + }, + { + "auxiliary_loss_clip": 0.01031807, + "auxiliary_loss_mlp": 0.00998101, + "balance_loss_clip": 1.00858247, + "balance_loss_mlp": 0.99675429, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7508882700041851, + "language_loss": 0.59289521, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61319429, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 3.112664222717285 + }, + { + "auxiliary_loss_clip": 0.01078398, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.03646457, + "balance_loss_mlp": 1.02203584, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 2.149164443372181, + "language_loss": 0.59014392, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61126691, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 2.850841760635376 + }, + { + "auxiliary_loss_clip": 0.01078279, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.03492188, + "balance_loss_mlp": 1.01903105, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.817653209980318, + "language_loss": 0.79612774, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.81722224, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 2.573878526687622 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.03766274, + "balance_loss_mlp": 1.02915466, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.8543220336619717, + "language_loss": 0.79627347, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81779158, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 2.4926044940948486 + }, + { + "auxiliary_loss_clip": 0.00990825, + "auxiliary_loss_mlp": 0.01012673, + "balance_loss_clip": 1.00871587, + "balance_loss_mlp": 1.01125467, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8346920708312312, + "language_loss": 0.62829816, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64833307, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 3.3103864192962646 + }, + { + "auxiliary_loss_clip": 0.011116, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.03759909, + "balance_loss_mlp": 1.02345872, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 2.0483146608144014, + "language_loss": 0.6255458, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64702785, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 2.5301921367645264 + }, + { + "auxiliary_loss_clip": 0.01013565, + "auxiliary_loss_mlp": 0.01008446, + "balance_loss_clip": 1.00932932, + "balance_loss_mlp": 1.00707483, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7526537122068546, + "language_loss": 0.57093132, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59115142, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 3.0092544555664062 + }, + { + "auxiliary_loss_clip": 0.01079652, + "auxiliary_loss_mlp": 0.01041445, + "balance_loss_clip": 1.03287733, + "balance_loss_mlp": 1.02750969, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.1188971367479983, + "language_loss": 0.75325525, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.77446628, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.5399320125579834 + }, + { + "auxiliary_loss_clip": 0.01088548, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.03714442, + "balance_loss_mlp": 1.02069688, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.6717737851687844, + "language_loss": 0.76209259, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78330463, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 2.6998331546783447 + }, + { + "auxiliary_loss_clip": 0.01113359, + "auxiliary_loss_mlp": 0.01035335, + "balance_loss_clip": 1.04142833, + "balance_loss_mlp": 1.02342653, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 2.1983935554251994, + "language_loss": 0.76327056, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78475749, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 2.5222692489624023 + }, + { + "auxiliary_loss_clip": 0.01095656, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.03680754, + "balance_loss_mlp": 1.02169394, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 1.9615293431907483, + "language_loss": 0.68237865, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70368832, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.4962222576141357 + }, + { + "auxiliary_loss_clip": 0.0107278, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.03499961, + "balance_loss_mlp": 1.01826835, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.6023880960114898, + "language_loss": 0.71894014, + "learning_rate": 2.427612532815961e-06, + "loss": 0.73998547, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 2.5851659774780273 + }, + { + "auxiliary_loss_clip": 0.0108902, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.0341084, + "balance_loss_mlp": 1.01975822, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 2.1472865823935816, + "language_loss": 0.69761741, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71883595, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.5519771575927734 + }, + { + "auxiliary_loss_clip": 0.01111854, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.03850484, + "balance_loss_mlp": 1.02597654, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.8443155283976265, + "language_loss": 0.77115929, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79266393, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 2.4922218322753906 + }, + { + "auxiliary_loss_clip": 0.01110837, + "auxiliary_loss_mlp": 0.01034208, + "balance_loss_clip": 1.03585911, + "balance_loss_mlp": 1.02136886, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 2.336226645984358, + "language_loss": 0.6790995, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70054996, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 2.524905204772949 + }, + { + "auxiliary_loss_clip": 0.01030046, + "auxiliary_loss_mlp": 0.01000513, + "balance_loss_clip": 1.00682473, + "balance_loss_mlp": 0.99907649, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7471386253395158, + "language_loss": 0.54465669, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56496233, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 4.67633056640625 + }, + { + "auxiliary_loss_clip": 0.01098, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.03525281, + "balance_loss_mlp": 1.01730919, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 2.3596298622720333, + "language_loss": 0.75617862, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.77745289, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.6090810298919678 + }, + { + "auxiliary_loss_clip": 0.01097237, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.03716743, + "balance_loss_mlp": 1.02057445, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.904790881297708, + "language_loss": 0.74277806, + "learning_rate": 2.425329506653441e-06, + "loss": 0.76406986, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 2.5686194896698 + }, + { + "auxiliary_loss_clip": 0.01092638, + "auxiliary_loss_mlp": 0.01039817, + "balance_loss_clip": 1.03933954, + "balance_loss_mlp": 1.0255599, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 2.163071451466227, + "language_loss": 0.80520624, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82653081, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.663374900817871 + }, + { + "auxiliary_loss_clip": 0.01092921, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.03814089, + "balance_loss_mlp": 1.02304685, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.3390275848072295, + "language_loss": 0.80678433, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82807338, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 2.568141222000122 + }, + { + "auxiliary_loss_clip": 0.01061464, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.03765059, + "balance_loss_mlp": 1.01869118, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 2.4911701210823867, + "language_loss": 0.75120282, + "learning_rate": 2.424187775642129e-06, + "loss": 0.77212596, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.679267644882202 + }, + { + "auxiliary_loss_clip": 0.01076613, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.0333277, + "balance_loss_mlp": 1.01492381, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.938975594298219, + "language_loss": 0.70688659, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.72791994, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.631986141204834 + }, + { + "auxiliary_loss_clip": 0.01099386, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.03894067, + "balance_loss_mlp": 1.02440083, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7688912439668205, + "language_loss": 0.72153068, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74290168, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.5619540214538574 + }, + { + "auxiliary_loss_clip": 0.01066172, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.03274345, + "balance_loss_mlp": 1.02018404, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 1.9838492580645917, + "language_loss": 0.76882243, + "learning_rate": 2.423045899863634e-06, + "loss": 0.78982306, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 2.71344256401062 + }, + { + "auxiliary_loss_clip": 0.01110299, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.03780925, + "balance_loss_mlp": 1.02413774, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.6094136342799195, + "language_loss": 0.70193195, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72339737, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.5406064987182617 + }, + { + "auxiliary_loss_clip": 0.01019749, + "auxiliary_loss_mlp": 0.01001658, + "balance_loss_clip": 1.00624835, + "balance_loss_mlp": 1.00026917, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7333190470510538, + "language_loss": 0.61670971, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63692367, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.1414198875427246 + }, + { + "auxiliary_loss_clip": 0.01112423, + "auxiliary_loss_mlp": 0.00749618, + "balance_loss_clip": 1.03831601, + "balance_loss_mlp": 1.00025249, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 3.5637601113764132, + "language_loss": 0.78211397, + "learning_rate": 2.421903879707657e-06, + "loss": 0.8007344, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 3.9427642822265625 + }, + { + "auxiliary_loss_clip": 0.01057838, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.03340983, + "balance_loss_mlp": 1.02415514, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.6759097016189963, + "language_loss": 0.71965837, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74061155, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 4.13082480430603 + }, + { + "auxiliary_loss_clip": 0.01060613, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.03404033, + "balance_loss_mlp": 1.02324176, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 2.4330758372290155, + "language_loss": 0.76723838, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.7882179, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.7190334796905518 + }, + { + "auxiliary_loss_clip": 0.01102274, + "auxiliary_loss_mlp": 0.00749685, + "balance_loss_clip": 1.03545463, + "balance_loss_mlp": 1.00035536, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.4003098581034488, + "language_loss": 0.71803617, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73655581, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.5304195880889893 + }, + { + "auxiliary_loss_clip": 0.01083296, + "auxiliary_loss_mlp": 0.0104521, + "balance_loss_clip": 1.03430319, + "balance_loss_mlp": 1.03015423, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 1.9504786498306825, + "language_loss": 0.68216479, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70344985, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 2.569310426712036 + }, + { + "auxiliary_loss_clip": 0.01083728, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.03589582, + "balance_loss_mlp": 1.02082098, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.7841700413645274, + "language_loss": 0.89606047, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91722274, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.6500227451324463 + }, + { + "auxiliary_loss_clip": 0.0104823, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.03470349, + "balance_loss_mlp": 1.02491641, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 1.8824781745316916, + "language_loss": 0.75442636, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77529645, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 4.220455646514893 + }, + { + "auxiliary_loss_clip": 0.01074028, + "auxiliary_loss_mlp": 0.01037455, + "balance_loss_clip": 1.03283656, + "balance_loss_mlp": 1.02301276, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.0013740593211344, + "language_loss": 0.79326963, + "learning_rate": 2.419238606731815e-06, + "loss": 0.81438446, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.6032819747924805 + }, + { + "auxiliary_loss_clip": 0.01086095, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.03586793, + "balance_loss_mlp": 1.01906621, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.6866948411589993, + "language_loss": 0.68751913, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70870006, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 2.690506935119629 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01039326, + "balance_loss_clip": 1.0384872, + "balance_loss_mlp": 1.02596903, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.054948250635791, + "language_loss": 0.85014093, + "learning_rate": 2.418476956872571e-06, + "loss": 0.87155581, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.528463840484619 + }, + { + "auxiliary_loss_clip": 0.010746, + "auxiliary_loss_mlp": 0.01049669, + "balance_loss_clip": 1.03383934, + "balance_loss_mlp": 1.03528035, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.8024102344130817, + "language_loss": 0.80345386, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82469654, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.660693883895874 + }, + { + "auxiliary_loss_clip": 0.01052495, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.03073645, + "balance_loss_mlp": 1.01807284, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 3.373066988147151, + "language_loss": 0.75407112, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77492231, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 2.608929395675659 + }, + { + "auxiliary_loss_clip": 0.01013272, + "auxiliary_loss_mlp": 0.01006419, + "balance_loss_clip": 1.00944638, + "balance_loss_mlp": 1.0049473, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7960468689639907, + "language_loss": 0.58664775, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60684466, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 3.2111287117004395 + }, + { + "auxiliary_loss_clip": 0.0109255, + "auxiliary_loss_mlp": 0.0103628, + "balance_loss_clip": 1.03553987, + "balance_loss_mlp": 1.02149773, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.412606559342175, + "language_loss": 0.83266026, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85394859, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.6395907402038574 + }, + { + "auxiliary_loss_clip": 0.01109373, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.03745937, + "balance_loss_mlp": 1.02186894, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.6027251583020448, + "language_loss": 0.77311325, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79455745, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 2.6265180110931396 + }, + { + "auxiliary_loss_clip": 0.01110095, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.0408504, + "balance_loss_mlp": 1.02422917, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.3165774370524623, + "language_loss": 0.72067094, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74215794, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 2.6164932250976562 + }, + { + "auxiliary_loss_clip": 0.01090198, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.03875494, + "balance_loss_mlp": 1.0243293, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 6.0397584394992005, + "language_loss": 0.69340372, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71469676, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 2.5891475677490234 + }, + { + "auxiliary_loss_clip": 0.01009996, + "auxiliary_loss_mlp": 0.01005204, + "balance_loss_clip": 1.01255584, + "balance_loss_mlp": 1.00400627, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7608405576777566, + "language_loss": 0.56679928, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58695126, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 3.149311065673828 + }, + { + "auxiliary_loss_clip": 0.01095147, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.0371021, + "balance_loss_mlp": 1.02113366, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.9921405105971772, + "language_loss": 0.79534674, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81663328, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 2.609321355819702 + }, + { + "auxiliary_loss_clip": 0.01074966, + "auxiliary_loss_mlp": 0.0074991, + "balance_loss_clip": 1.03412533, + "balance_loss_mlp": 1.00041234, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.468832064800384, + "language_loss": 0.93318403, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.95143276, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 2.554919481277466 + }, + { + "auxiliary_loss_clip": 0.01018591, + "auxiliary_loss_mlp": 0.01000648, + "balance_loss_clip": 1.00605512, + "balance_loss_mlp": 0.99919337, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.7986734486221305, + "language_loss": 0.62881219, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64900458, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 3.1371326446533203 + }, + { + "auxiliary_loss_clip": 0.01110848, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.03837717, + "balance_loss_mlp": 1.01691937, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4799075963388455, + "language_loss": 0.8234328, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84483266, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 2.5110771656036377 + }, + { + "auxiliary_loss_clip": 0.01093329, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.03536916, + "balance_loss_mlp": 1.01868117, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.688026492455997, + "language_loss": 0.85334748, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87461925, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 2.7055740356445312 + }, + { + "auxiliary_loss_clip": 0.01113054, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.03875518, + "balance_loss_mlp": 1.01593328, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.0394964448095854, + "language_loss": 0.76072061, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78214288, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 2.534498929977417 + }, + { + "auxiliary_loss_clip": 0.01077748, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.03434372, + "balance_loss_mlp": 1.01943421, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 1.8977397116665322, + "language_loss": 0.74749619, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.76859677, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.6255879402160645 + }, + { + "auxiliary_loss_clip": 0.01113777, + "auxiliary_loss_mlp": 0.01037548, + "balance_loss_clip": 1.03909421, + "balance_loss_mlp": 1.02363682, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 2.197173335366077, + "language_loss": 0.70141745, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72293073, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.625605344772339 + }, + { + "auxiliary_loss_clip": 0.01070104, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.036713, + "balance_loss_mlp": 1.01998353, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.8978295787853463, + "language_loss": 0.77054989, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79158902, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 2.6913528442382812 + }, + { + "auxiliary_loss_clip": 0.01067669, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.03666031, + "balance_loss_mlp": 1.01790237, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 2.0449978146111545, + "language_loss": 0.62579381, + "learning_rate": 2.411619265641992e-06, + "loss": 0.64677817, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 2.7062041759490967 + }, + { + "auxiliary_loss_clip": 0.01114663, + "auxiliary_loss_mlp": 0.0103703, + "balance_loss_clip": 1.03857791, + "balance_loss_mlp": 1.0231899, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 2.220006327728026, + "language_loss": 0.84390974, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86542666, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.5246262550354004 + }, + { + "auxiliary_loss_clip": 0.01086046, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.03700137, + "balance_loss_mlp": 1.01917648, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.401134671439311, + "language_loss": 0.79724824, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81842577, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 4.058683633804321 + }, + { + "auxiliary_loss_clip": 0.01083339, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.03736269, + "balance_loss_mlp": 1.02312696, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 2.1496527405599184, + "language_loss": 0.808644, + "learning_rate": 2.410475823155484e-06, + "loss": 0.8298381, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 2.555081605911255 + }, + { + "auxiliary_loss_clip": 0.01065238, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.03266001, + "balance_loss_mlp": 1.02222347, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 1.711144622740947, + "language_loss": 0.63353127, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.6545226, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.6713736057281494 + }, + { + "auxiliary_loss_clip": 0.01002779, + "auxiliary_loss_mlp": 0.01002239, + "balance_loss_clip": 1.01465046, + "balance_loss_mlp": 1.00093353, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8319422267989283, + "language_loss": 0.58931828, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60936844, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 3.220057249069214 + }, + { + "auxiliary_loss_clip": 0.01058848, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.03524721, + "balance_loss_mlp": 1.02173972, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.6546137100328946, + "language_loss": 0.79286325, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81379908, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.67685604095459 + }, + { + "auxiliary_loss_clip": 0.01071622, + "auxiliary_loss_mlp": 0.01035527, + "balance_loss_clip": 1.03561234, + "balance_loss_mlp": 1.02162743, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 2.36925036336528, + "language_loss": 0.74170792, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76277936, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.6852657794952393 + }, + { + "auxiliary_loss_clip": 0.0109816, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.03841341, + "balance_loss_mlp": 1.02291155, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 2.3108963090728216, + "language_loss": 0.79726481, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81859553, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.5599770545959473 + }, + { + "auxiliary_loss_clip": 0.01110905, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.0391711, + "balance_loss_mlp": 1.02021849, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.7807249283343998, + "language_loss": 0.72949731, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75093019, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 2.607645034790039 + }, + { + "auxiliary_loss_clip": 0.0111168, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.03736401, + "balance_loss_mlp": 1.02019823, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 2.794382392353553, + "language_loss": 0.76717871, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.78863078, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 2.602276086807251 + }, + { + "auxiliary_loss_clip": 0.01100591, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.03698564, + "balance_loss_mlp": 1.02092743, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.6821097282047373, + "language_loss": 0.78746557, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80881232, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.5846376419067383 + }, + { + "auxiliary_loss_clip": 0.01077905, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.03585577, + "balance_loss_mlp": 1.01972532, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 2.189374998892973, + "language_loss": 0.8705495, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89166033, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.6766226291656494 + }, + { + "auxiliary_loss_clip": 0.01093151, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.03668499, + "balance_loss_mlp": 1.01848781, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 3.6226423439362265, + "language_loss": 0.67117959, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69241601, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.6306779384613037 + }, + { + "auxiliary_loss_clip": 0.01098626, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.03900361, + "balance_loss_mlp": 1.01862192, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 1.8898581907789638, + "language_loss": 0.69270992, + "learning_rate": 2.406282005146318e-06, + "loss": 0.7140286, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 4.080031633377075 + }, + { + "auxiliary_loss_clip": 0.01094758, + "auxiliary_loss_mlp": 0.01039038, + "balance_loss_clip": 1.03533947, + "balance_loss_mlp": 1.02473259, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 5.499695700947951, + "language_loss": 0.81234729, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83368528, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 2.499246835708618 + }, + { + "auxiliary_loss_clip": 0.01107563, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.03779149, + "balance_loss_mlp": 1.02043593, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.5653357592047357, + "language_loss": 0.65657222, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67797709, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.5380918979644775 + }, + { + "auxiliary_loss_clip": 0.01073867, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.03682566, + "balance_loss_mlp": 1.01704621, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 3.356052297609247, + "language_loss": 0.62758744, + "learning_rate": 2.405137912257333e-06, + "loss": 0.64861482, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.6047720909118652 + }, + { + "auxiliary_loss_clip": 0.0109847, + "auxiliary_loss_mlp": 0.01038087, + "balance_loss_clip": 1.03748167, + "balance_loss_mlp": 1.02563608, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.4864982064552323, + "language_loss": 0.59176147, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61312699, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.791743040084839 + }, + { + "auxiliary_loss_clip": 0.0109952, + "auxiliary_loss_mlp": 0.01042326, + "balance_loss_clip": 1.03831041, + "balance_loss_mlp": 1.02946913, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.6183212699982465, + "language_loss": 0.72445118, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74586964, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.5471882820129395 + }, + { + "auxiliary_loss_clip": 0.01086272, + "auxiliary_loss_mlp": 0.01036212, + "balance_loss_clip": 1.03507864, + "balance_loss_mlp": 1.02411795, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 2.9999296812160248, + "language_loss": 0.75759351, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77881837, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 4.147113561630249 + }, + { + "auxiliary_loss_clip": 0.01094658, + "auxiliary_loss_mlp": 0.01040718, + "balance_loss_clip": 1.03853452, + "balance_loss_mlp": 1.02731931, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 2.140929275412919, + "language_loss": 0.67508316, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69643694, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.6194334030151367 + }, + { + "auxiliary_loss_clip": 0.01097841, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.03670347, + "balance_loss_mlp": 1.02372539, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.5225589088345053, + "language_loss": 0.60777235, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62911808, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 2.6197421550750732 + }, + { + "auxiliary_loss_clip": 0.01102872, + "auxiliary_loss_mlp": 0.01039763, + "balance_loss_clip": 1.03688371, + "balance_loss_mlp": 1.02582181, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.1108918894828728, + "language_loss": 0.78393078, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80535722, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 2.5127012729644775 + }, + { + "auxiliary_loss_clip": 0.01065701, + "auxiliary_loss_mlp": 0.01034419, + "balance_loss_clip": 1.03769803, + "balance_loss_mlp": 1.02172339, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.5702215438259202, + "language_loss": 0.63830066, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65930188, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.724897861480713 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.03700554, + "balance_loss_mlp": 1.02168703, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.66202712174554, + "language_loss": 0.79282343, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81414831, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 2.584503173828125 + }, + { + "auxiliary_loss_clip": 0.01080188, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.03470063, + "balance_loss_mlp": 1.01823258, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 3.432147318828004, + "language_loss": 0.80806935, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82917857, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.7364017963409424 + }, + { + "auxiliary_loss_clip": 0.01074127, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.03699708, + "balance_loss_mlp": 1.01612997, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.574029180590506, + "language_loss": 0.65409398, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67512095, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 2.7219624519348145 + }, + { + "auxiliary_loss_clip": 0.01082623, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.03474593, + "balance_loss_mlp": 1.01899898, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 2.8604539283071957, + "language_loss": 0.74977005, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77090794, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 2.7118349075317383 + }, + { + "auxiliary_loss_clip": 0.01108027, + "auxiliary_loss_mlp": 0.01033239, + "balance_loss_clip": 1.03699911, + "balance_loss_mlp": 1.02110362, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 23.115685189467573, + "language_loss": 0.72883475, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75024742, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 2.595783233642578 + }, + { + "auxiliary_loss_clip": 0.01074154, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.0354929, + "balance_loss_mlp": 1.01832247, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.7178234685717166, + "language_loss": 0.76257813, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78362155, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 2.6900737285614014 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01036808, + "balance_loss_clip": 1.03549027, + "balance_loss_mlp": 1.02421379, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.4639935684219527, + "language_loss": 0.6665473, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.68795145, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 2.6513195037841797 + }, + { + "auxiliary_loss_clip": 0.01098072, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.03965533, + "balance_loss_mlp": 1.022753, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 1.9206897895538981, + "language_loss": 0.78636122, + "learning_rate": 2.399415381635768e-06, + "loss": 0.80768526, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 2.645188331604004 + }, + { + "auxiliary_loss_clip": 0.01082664, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.03570926, + "balance_loss_mlp": 1.02191508, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.8468916503479766, + "language_loss": 0.82798827, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.84916818, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 2.6656837463378906 + }, + { + "auxiliary_loss_clip": 0.01082402, + "auxiliary_loss_mlp": 0.01037274, + "balance_loss_clip": 1.03749013, + "balance_loss_mlp": 1.0236181, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.6359996065252087, + "language_loss": 0.76651454, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78771132, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.6415324211120605 + }, + { + "auxiliary_loss_clip": 0.01067443, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.03637588, + "balance_loss_mlp": 1.02133155, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5660744376531195, + "language_loss": 0.80680573, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82780647, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.645138740539551 + }, + { + "auxiliary_loss_clip": 0.01076885, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.03347552, + "balance_loss_mlp": 1.01925218, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 2.7672218814282488, + "language_loss": 0.7553274, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.77641338, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.628265142440796 + }, + { + "auxiliary_loss_clip": 0.01098196, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03658545, + "balance_loss_mlp": 1.02064383, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.226738824564536, + "language_loss": 0.757819, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.77912223, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.55351185798645 + }, + { + "auxiliary_loss_clip": 0.01022782, + "auxiliary_loss_mlp": 0.01002042, + "balance_loss_clip": 1.00967824, + "balance_loss_mlp": 1.00053966, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7862364408801116, + "language_loss": 0.62326485, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64351308, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 3.1594784259796143 + }, + { + "auxiliary_loss_clip": 0.01108691, + "auxiliary_loss_mlp": 0.01038438, + "balance_loss_clip": 1.03814948, + "balance_loss_mlp": 1.02651167, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 2.194713409649446, + "language_loss": 0.65609646, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67756772, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.4992382526397705 + }, + { + "auxiliary_loss_clip": 0.01087947, + "auxiliary_loss_mlp": 0.01039462, + "balance_loss_clip": 1.03749394, + "balance_loss_mlp": 1.0257287, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.9713210566008024, + "language_loss": 0.84966004, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87093413, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 2.661799907684326 + }, + { + "auxiliary_loss_clip": 0.01087324, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.03754187, + "balance_loss_mlp": 1.01567495, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7953981107972488, + "language_loss": 0.76599693, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78714538, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.705709457397461 + }, + { + "auxiliary_loss_clip": 0.01083855, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.03516793, + "balance_loss_mlp": 1.01443946, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.778225865329072, + "language_loss": 0.80619889, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82730657, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 4.086630821228027 + }, + { + "auxiliary_loss_clip": 0.01095523, + "auxiliary_loss_mlp": 0.00749521, + "balance_loss_clip": 1.0361917, + "balance_loss_mlp": 1.00031447, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.6458371579276843, + "language_loss": 0.75853485, + "learning_rate": 2.395216690562469e-06, + "loss": 0.77698529, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.5981156826019287 + }, + { + "auxiliary_loss_clip": 0.01082157, + "auxiliary_loss_mlp": 0.01035654, + "balance_loss_clip": 1.03942347, + "balance_loss_mlp": 1.02284455, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.669290426575664, + "language_loss": 0.75286072, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77403879, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 2.6746184825897217 + }, + { + "auxiliary_loss_clip": 0.01087999, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.0369339, + "balance_loss_mlp": 1.01748252, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.5238490247966134, + "language_loss": 0.72476476, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74594569, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.6319708824157715 + }, + { + "auxiliary_loss_clip": 0.01090993, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.03692913, + "balance_loss_mlp": 1.01745343, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.4902523609829148, + "language_loss": 0.75908256, + "learning_rate": 2.394071277466609e-06, + "loss": 0.78030467, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.609412670135498 + }, + { + "auxiliary_loss_clip": 0.01102282, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.03794944, + "balance_loss_mlp": 1.01773489, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.0171771063289023, + "language_loss": 0.69963861, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72096717, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.5266520977020264 + }, + { + "auxiliary_loss_clip": 0.01108538, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.03608191, + "balance_loss_mlp": 1.02356982, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.4372585932091133, + "language_loss": 0.72435057, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74579847, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.564967393875122 + }, + { + "auxiliary_loss_clip": 0.0107409, + "auxiliary_loss_mlp": 0.01027488, + "balance_loss_clip": 1.03417611, + "balance_loss_mlp": 1.01590645, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.6664358044465868, + "language_loss": 0.65173364, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67274946, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 2.697093963623047 + }, + { + "auxiliary_loss_clip": 0.01092717, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.03865874, + "balance_loss_mlp": 1.01965582, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.902143826559322, + "language_loss": 0.68543887, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70668292, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.5685982704162598 + }, + { + "auxiliary_loss_clip": 0.01093934, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.03305674, + "balance_loss_mlp": 1.02120352, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.803819407067179, + "language_loss": 0.79492629, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81620872, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.531219482421875 + }, + { + "auxiliary_loss_clip": 0.01020175, + "auxiliary_loss_mlp": 0.01004801, + "balance_loss_clip": 1.00773108, + "balance_loss_mlp": 1.00340605, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8205230658970509, + "language_loss": 0.57817644, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59842622, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 3.08901047706604 + }, + { + "auxiliary_loss_clip": 0.01047684, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.03637481, + "balance_loss_mlp": 1.02147496, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 2.0113292799381552, + "language_loss": 0.76656568, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78737247, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 2.7226462364196777 + }, + { + "auxiliary_loss_clip": 0.01090508, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.03699303, + "balance_loss_mlp": 1.01863217, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.0874869079918357, + "language_loss": 0.77010345, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79133129, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 4.220656871795654 + }, + { + "auxiliary_loss_clip": 0.01045005, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.03741503, + "balance_loss_mlp": 1.02037489, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.4619523532955878, + "language_loss": 0.7256223, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74639928, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.9506912231445312 + }, + { + "auxiliary_loss_clip": 0.01114722, + "auxiliary_loss_mlp": 0.01034158, + "balance_loss_clip": 1.03916955, + "balance_loss_mlp": 1.02123022, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.8099030026511482, + "language_loss": 0.62923527, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65072411, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 2.8981878757476807 + }, + { + "auxiliary_loss_clip": 0.01011185, + "auxiliary_loss_mlp": 0.01001994, + "balance_loss_clip": 1.00826871, + "balance_loss_mlp": 1.00030136, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6837251035429628, + "language_loss": 0.57566679, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59579861, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 3.085523843765259 + }, + { + "auxiliary_loss_clip": 0.01103136, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.0390867, + "balance_loss_mlp": 1.02200675, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 5.671861496952978, + "language_loss": 0.56844926, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58983982, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.5420546531677246 + }, + { + "auxiliary_loss_clip": 0.01098052, + "auxiliary_loss_mlp": 0.00749516, + "balance_loss_clip": 1.03885984, + "balance_loss_mlp": 1.00041091, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 1.7129325932129609, + "language_loss": 0.72025585, + "learning_rate": 2.389106271642792e-06, + "loss": 0.7387315, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.548600196838379 + }, + { + "auxiliary_loss_clip": 0.01024598, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.03098762, + "balance_loss_mlp": 1.02356017, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 2.9159312571456297, + "language_loss": 0.68716812, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.70778751, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 2.727813243865967 + }, + { + "auxiliary_loss_clip": 0.01086231, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.03655827, + "balance_loss_mlp": 1.02132535, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.5356770174025514, + "language_loss": 0.85022146, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87141031, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 4.108996868133545 + }, + { + "auxiliary_loss_clip": 0.01095687, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.03533304, + "balance_loss_mlp": 1.02036572, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 2.8578419184678596, + "language_loss": 0.89348096, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91476417, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.6249489784240723 + }, + { + "auxiliary_loss_clip": 0.01110785, + "auxiliary_loss_mlp": 0.00749745, + "balance_loss_clip": 1.03719974, + "balance_loss_mlp": 1.00046635, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.8354893405756847, + "language_loss": 0.71198189, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73058724, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 2.656078338623047 + }, + { + "auxiliary_loss_clip": 0.01101799, + "auxiliary_loss_mlp": 0.0103835, + "balance_loss_clip": 1.0363605, + "balance_loss_mlp": 1.02527905, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.6942635193606406, + "language_loss": 0.68117237, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.7025739, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.723763942718506 + }, + { + "auxiliary_loss_clip": 0.0107297, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.03936875, + "balance_loss_mlp": 1.02346504, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.5615624145462765, + "language_loss": 0.80020809, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82130426, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 2.7307512760162354 + }, + { + "auxiliary_loss_clip": 0.01069185, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.03326917, + "balance_loss_mlp": 1.01826739, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.7072943639962976, + "language_loss": 0.73651195, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75752622, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 2.745988368988037 + }, + { + "auxiliary_loss_clip": 0.01073996, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.03370476, + "balance_loss_mlp": 1.02369606, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.4582110542443802, + "language_loss": 0.81235009, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83345747, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 2.760334014892578 + }, + { + "auxiliary_loss_clip": 0.01108773, + "auxiliary_loss_mlp": 0.01050688, + "balance_loss_clip": 1.03985977, + "balance_loss_mlp": 1.03612673, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 2.026503689053318, + "language_loss": 0.79379129, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81538588, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 2.593312978744507 + }, + { + "auxiliary_loss_clip": 0.01103628, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.03838062, + "balance_loss_mlp": 1.01722324, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.419062747064138, + "language_loss": 0.75254691, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77389622, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 2.6262362003326416 + }, + { + "auxiliary_loss_clip": 0.01090659, + "auxiliary_loss_mlp": 0.01040479, + "balance_loss_clip": 1.03914392, + "balance_loss_mlp": 1.02716911, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 2.2941024229396136, + "language_loss": 0.74768925, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76900065, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 2.7149932384490967 + }, + { + "auxiliary_loss_clip": 0.01098296, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.03872216, + "balance_loss_mlp": 1.01908207, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.4702831862944408, + "language_loss": 0.81277132, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83406925, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.6071386337280273 + }, + { + "auxiliary_loss_clip": 0.01095953, + "auxiliary_loss_mlp": 0.01039345, + "balance_loss_clip": 1.03868985, + "balance_loss_mlp": 1.02458096, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 1.7014286691632696, + "language_loss": 0.72644401, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.74779701, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 2.688647508621216 + }, + { + "auxiliary_loss_clip": 0.01105326, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.03961527, + "balance_loss_mlp": 1.02182221, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 2.2620683853510255, + "language_loss": 0.74782968, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76925009, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.7717628479003906 + }, + { + "auxiliary_loss_clip": 0.01103502, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.03899968, + "balance_loss_mlp": 1.02087474, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.585791730091612, + "language_loss": 0.71075869, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73213989, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.586397886276245 + }, + { + "auxiliary_loss_clip": 0.01090375, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.03809607, + "balance_loss_mlp": 1.02250445, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.910734795493457, + "language_loss": 0.73495567, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75622034, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.595510244369507 + }, + { + "auxiliary_loss_clip": 0.01111084, + "auxiliary_loss_mlp": 0.01037911, + "balance_loss_clip": 1.03889823, + "balance_loss_mlp": 1.02457726, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.5521491310675637, + "language_loss": 0.66431069, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68580067, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.515536069869995 + }, + { + "auxiliary_loss_clip": 0.0108575, + "auxiliary_loss_mlp": 0.0104677, + "balance_loss_clip": 1.038311, + "balance_loss_mlp": 1.031744, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 2.127872698351733, + "language_loss": 0.74256164, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76388687, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 2.607149362564087 + }, + { + "auxiliary_loss_clip": 0.01060477, + "auxiliary_loss_mlp": 0.00750056, + "balance_loss_clip": 1.03567624, + "balance_loss_mlp": 1.00047219, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.9529303309085384, + "language_loss": 0.69862431, + "learning_rate": 2.381845247976697e-06, + "loss": 0.71672964, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 2.7130119800567627 + }, + { + "auxiliary_loss_clip": 0.01096566, + "auxiliary_loss_mlp": 0.01036319, + "balance_loss_clip": 1.03516269, + "balance_loss_mlp": 1.02321172, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.6638421092000741, + "language_loss": 0.78279853, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80412734, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.6545047760009766 + }, + { + "auxiliary_loss_clip": 0.01112823, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.04032636, + "balance_loss_mlp": 1.0180186, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.5364051282538551, + "language_loss": 0.68906045, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71050572, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 2.7238948345184326 + }, + { + "auxiliary_loss_clip": 0.0109126, + "auxiliary_loss_mlp": 0.01029024, + "balance_loss_clip": 1.03318739, + "balance_loss_mlp": 1.01582193, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.686060472815791, + "language_loss": 0.7337743, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75497711, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 2.6780779361724854 + }, + { + "auxiliary_loss_clip": 0.01116583, + "auxiliary_loss_mlp": 0.01047307, + "balance_loss_clip": 1.04079282, + "balance_loss_mlp": 1.0325309, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 2.0795848812276447, + "language_loss": 0.72353041, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74516928, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.530867576599121 + }, + { + "auxiliary_loss_clip": 0.01101155, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.03920841, + "balance_loss_mlp": 1.02271163, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.875779378349792, + "language_loss": 0.72733176, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74870956, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 4.283949613571167 + }, + { + "auxiliary_loss_clip": 0.01070794, + "auxiliary_loss_mlp": 0.01041239, + "balance_loss_clip": 1.03587246, + "balance_loss_mlp": 1.02704144, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.4554579916976176, + "language_loss": 0.68045235, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70157266, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 2.8123271465301514 + }, + { + "auxiliary_loss_clip": 0.0111279, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.03969502, + "balance_loss_mlp": 1.01708663, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.3832493069082457, + "language_loss": 0.76175511, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78318405, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.5327038764953613 + }, + { + "auxiliary_loss_clip": 0.01091134, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.03815293, + "balance_loss_mlp": 1.01681733, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.725351594063527, + "language_loss": 0.78482628, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80603075, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.6960296630859375 + }, + { + "auxiliary_loss_clip": 0.01084869, + "auxiliary_loss_mlp": 0.01046607, + "balance_loss_clip": 1.03494298, + "balance_loss_mlp": 1.03268349, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.1088836331244467, + "language_loss": 0.69580126, + "learning_rate": 2.378403985195863e-06, + "loss": 0.717116, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.576014280319214 + }, + { + "auxiliary_loss_clip": 0.01099043, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.03974605, + "balance_loss_mlp": 1.0213356, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.7600569659768865, + "language_loss": 0.79485756, + "learning_rate": 2.378021550725735e-06, + "loss": 0.816185, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.5979366302490234 + }, + { + "auxiliary_loss_clip": 0.01098247, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.03682625, + "balance_loss_mlp": 1.02244008, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.2595440367565724, + "language_loss": 0.62478375, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64612639, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 2.6376092433929443 + }, + { + "auxiliary_loss_clip": 0.01081856, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.03454185, + "balance_loss_mlp": 1.02796698, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 1.9584551561724777, + "language_loss": 0.73049062, + "learning_rate": 2.377256638796135e-06, + "loss": 0.7517184, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 2.664759635925293 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.01041173, + "balance_loss_clip": 1.04155958, + "balance_loss_mlp": 1.0269866, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.248771064294415, + "language_loss": 0.76444161, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.7858215, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 2.563260555267334 + }, + { + "auxiliary_loss_clip": 0.01083611, + "auxiliary_loss_mlp": 0.01036644, + "balance_loss_clip": 1.03496003, + "balance_loss_mlp": 1.02186799, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 2.0690786882357344, + "language_loss": 0.69365865, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71486115, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 2.634253740310669 + }, + { + "auxiliary_loss_clip": 0.01090247, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.03390598, + "balance_loss_mlp": 1.01809001, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 1.8510349582093653, + "language_loss": 0.83779693, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.85899818, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.5695242881774902 + }, + { + "auxiliary_loss_clip": 0.01020163, + "auxiliary_loss_mlp": 0.00747101, + "balance_loss_clip": 1.00755548, + "balance_loss_mlp": 1.00039852, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7962113886507953, + "language_loss": 0.52758175, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54525441, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 6.220596075057983 + }, + { + "auxiliary_loss_clip": 0.01077328, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.03499126, + "balance_loss_mlp": 1.0167346, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.3711497653807636, + "language_loss": 0.87454671, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89562827, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 2.6093342304229736 + }, + { + "auxiliary_loss_clip": 0.01104993, + "auxiliary_loss_mlp": 0.01045411, + "balance_loss_clip": 1.0399251, + "balance_loss_mlp": 1.03223896, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 1.5383011510882356, + "language_loss": 0.77351266, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79501665, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.63224458694458 + }, + { + "auxiliary_loss_clip": 0.01102916, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.03865409, + "balance_loss_mlp": 1.01872802, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.9992825951346693, + "language_loss": 0.78476214, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80611855, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.528414011001587 + }, + { + "auxiliary_loss_clip": 0.01113482, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.03991532, + "balance_loss_mlp": 1.01996446, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.2143709978559714, + "language_loss": 0.71218264, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73364139, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 2.622746229171753 + }, + { + "auxiliary_loss_clip": 0.01078034, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.03517985, + "balance_loss_mlp": 1.02211046, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.8492758930066011, + "language_loss": 0.69835532, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71949005, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.620163679122925 + }, + { + "auxiliary_loss_clip": 0.01052551, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.03763032, + "balance_loss_mlp": 1.03271508, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 2.0118890666856353, + "language_loss": 0.78656971, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80755621, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 2.8068103790283203 + }, + { + "auxiliary_loss_clip": 0.01083383, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.03736949, + "balance_loss_mlp": 1.03076625, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 2.4706523254611845, + "language_loss": 0.71614957, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73742354, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 4.257246732711792 + }, + { + "auxiliary_loss_clip": 0.01095636, + "auxiliary_loss_mlp": 0.01035205, + "balance_loss_clip": 1.03536963, + "balance_loss_mlp": 1.02013755, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 2.136854400073622, + "language_loss": 0.73495781, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75626618, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 2.5906012058258057 + }, + { + "auxiliary_loss_clip": 0.01101611, + "auxiliary_loss_mlp": 0.01042043, + "balance_loss_clip": 1.03867841, + "balance_loss_mlp": 1.02716613, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 2.284805637473174, + "language_loss": 0.83361751, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85505402, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 2.5295181274414062 + }, + { + "auxiliary_loss_clip": 0.01094789, + "auxiliary_loss_mlp": 0.01040592, + "balance_loss_clip": 1.0425446, + "balance_loss_mlp": 1.02621567, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 2.142518732699311, + "language_loss": 0.85804641, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87940025, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 2.595729351043701 + }, + { + "auxiliary_loss_clip": 0.0106358, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.03407633, + "balance_loss_mlp": 1.02425528, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.9832459517990768, + "language_loss": 0.73724645, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75827032, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 2.6420998573303223 + }, + { + "auxiliary_loss_clip": 0.01084538, + "auxiliary_loss_mlp": 0.01039261, + "balance_loss_clip": 1.03971016, + "balance_loss_mlp": 1.0246222, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 2.5218438834642476, + "language_loss": 0.79979181, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82102984, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.637808322906494 + }, + { + "auxiliary_loss_clip": 0.01077016, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.04119813, + "balance_loss_mlp": 1.02606153, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 1.6963057824026881, + "language_loss": 0.81027001, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83144033, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 2.641226053237915 + }, + { + "auxiliary_loss_clip": 0.01088508, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.0349226, + "balance_loss_mlp": 1.026613, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.7666866330021163, + "language_loss": 0.67996001, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70124924, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 2.631610631942749 + }, + { + "auxiliary_loss_clip": 0.0108506, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.04033589, + "balance_loss_mlp": 1.03174329, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.6865619915221228, + "language_loss": 0.80593336, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82723951, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 2.6748359203338623 + }, + { + "auxiliary_loss_clip": 0.01105168, + "auxiliary_loss_mlp": 0.01033545, + "balance_loss_clip": 1.03952742, + "balance_loss_mlp": 1.01921582, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 2.0398902902867824, + "language_loss": 0.82225412, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84364116, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.540186882019043 + }, + { + "auxiliary_loss_clip": 0.01107452, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.0416975, + "balance_loss_mlp": 1.01914704, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 2.318628653986827, + "language_loss": 0.74095505, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76236486, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 2.750148057937622 + }, + { + "auxiliary_loss_clip": 0.01083045, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.03345942, + "balance_loss_mlp": 1.02104878, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 3.217151428838303, + "language_loss": 0.85038769, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87157398, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.6982367038726807 + }, + { + "auxiliary_loss_clip": 0.01071874, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.03388524, + "balance_loss_mlp": 1.0183537, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.1803301625425875, + "language_loss": 0.75972927, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.78076887, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.6497716903686523 + }, + { + "auxiliary_loss_clip": 0.01111094, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.03848386, + "balance_loss_mlp": 1.02175009, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.6711407727670138, + "language_loss": 0.74822366, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76968384, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.575869083404541 + }, + { + "auxiliary_loss_clip": 0.01013721, + "auxiliary_loss_mlp": 0.01003583, + "balance_loss_clip": 1.01019239, + "balance_loss_mlp": 1.001688, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7753470364787988, + "language_loss": 0.57684678, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59701985, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.180851459503174 + }, + { + "auxiliary_loss_clip": 0.01076795, + "auxiliary_loss_mlp": 0.00749901, + "balance_loss_clip": 1.03298223, + "balance_loss_mlp": 1.00058329, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.5740229311820206, + "language_loss": 0.70846647, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.72673345, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.588186502456665 + }, + { + "auxiliary_loss_clip": 0.01114803, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.04076087, + "balance_loss_mlp": 1.02196729, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 2.3537036819072905, + "language_loss": 0.76758254, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78908932, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 2.5360186100006104 + }, + { + "auxiliary_loss_clip": 0.01078331, + "auxiliary_loss_mlp": 0.01042127, + "balance_loss_clip": 1.03778481, + "balance_loss_mlp": 1.02863252, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.6636197098973546, + "language_loss": 0.76655018, + "learning_rate": 2.366541916231585e-06, + "loss": 0.78775477, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.6395483016967773 + }, + { + "auxiliary_loss_clip": 0.01111372, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.04037857, + "balance_loss_mlp": 1.02450395, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 2.825052720853636, + "language_loss": 0.71679318, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73827434, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 2.4941768646240234 + }, + { + "auxiliary_loss_clip": 0.01088958, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.03864753, + "balance_loss_mlp": 1.02029061, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.8269667091781103, + "language_loss": 0.78066885, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80188936, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.8581511974334717 + }, + { + "auxiliary_loss_clip": 0.0102177, + "auxiliary_loss_mlp": 0.01001309, + "balance_loss_clip": 1.00822294, + "balance_loss_mlp": 0.99971151, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7886369819038813, + "language_loss": 0.64987552, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67010641, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 3.141004800796509 + }, + { + "auxiliary_loss_clip": 0.01099044, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.03792048, + "balance_loss_mlp": 1.02029824, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.8073814992608253, + "language_loss": 0.79591841, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81725085, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.6201865673065186 + }, + { + "auxiliary_loss_clip": 0.01057652, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_clip": 1.03234899, + "balance_loss_mlp": 1.03072965, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 2.147733401238538, + "language_loss": 0.70585823, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72688103, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 4.28537917137146 + }, + { + "auxiliary_loss_clip": 0.01079245, + "auxiliary_loss_mlp": 0.01039166, + "balance_loss_clip": 1.0335815, + "balance_loss_mlp": 1.02562964, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.0521599087107707, + "language_loss": 0.72775972, + "learning_rate": 2.364244475667491e-06, + "loss": 0.74894381, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.6536569595336914 + }, + { + "auxiliary_loss_clip": 0.01097179, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.03866434, + "balance_loss_mlp": 1.02702355, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 2.22548892791121, + "language_loss": 0.78268647, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80405116, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.465150833129883 + }, + { + "auxiliary_loss_clip": 0.01114438, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_clip": 1.03887427, + "balance_loss_mlp": 1.02871394, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.7307795024193577, + "language_loss": 0.84620595, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.8677696, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.4917333126068115 + }, + { + "auxiliary_loss_clip": 0.01116373, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.03954256, + "balance_loss_mlp": 1.02752709, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.5770166822186593, + "language_loss": 0.6954906, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71706659, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.521503448486328 + }, + { + "auxiliary_loss_clip": 0.01097573, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.03561401, + "balance_loss_mlp": 1.02300096, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.5135903412375924, + "language_loss": 0.78553927, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.8068763, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.488243341445923 + }, + { + "auxiliary_loss_clip": 0.0109314, + "auxiliary_loss_mlp": 0.01045682, + "balance_loss_clip": 1.03620625, + "balance_loss_mlp": 1.03097737, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.010249612565793, + "language_loss": 0.79329133, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.8146795, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 2.5341646671295166 + }, + { + "auxiliary_loss_clip": 0.01095975, + "auxiliary_loss_mlp": 0.01036874, + "balance_loss_clip": 1.03901124, + "balance_loss_mlp": 1.02319455, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.6938213729370386, + "language_loss": 0.72049475, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.7418232, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.6878254413604736 + }, + { + "auxiliary_loss_clip": 0.01072617, + "auxiliary_loss_mlp": 0.01046714, + "balance_loss_clip": 1.03593159, + "balance_loss_mlp": 1.03149104, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.516458777542942, + "language_loss": 0.70864058, + "learning_rate": 2.361563500108531e-06, + "loss": 0.72983384, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.57033371925354 + }, + { + "auxiliary_loss_clip": 0.01057839, + "auxiliary_loss_mlp": 0.007502, + "balance_loss_clip": 1.03253102, + "balance_loss_mlp": 1.00058222, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.713435247684727, + "language_loss": 0.6962471, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71432751, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 2.624910593032837 + }, + { + "auxiliary_loss_clip": 0.01104421, + "auxiliary_loss_mlp": 0.01040134, + "balance_loss_clip": 1.03925729, + "balance_loss_mlp": 1.02637088, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.578718824811934, + "language_loss": 0.8041752, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82562077, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 2.549638509750366 + }, + { + "auxiliary_loss_clip": 0.01103647, + "auxiliary_loss_mlp": 0.0075011, + "balance_loss_clip": 1.03825259, + "balance_loss_mlp": 1.00064325, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.6886415606356857, + "language_loss": 0.81318259, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83172017, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 4.150937080383301 + }, + { + "auxiliary_loss_clip": 0.01089751, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_clip": 1.03793395, + "balance_loss_mlp": 1.03296733, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.4606274876324585, + "language_loss": 0.6471799, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.66854262, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 4.176829099655151 + }, + { + "auxiliary_loss_clip": 0.01099096, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.03968096, + "balance_loss_mlp": 1.02123141, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.508325098555345, + "language_loss": 0.80441916, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82575393, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 2.5832643508911133 + }, + { + "auxiliary_loss_clip": 0.01078697, + "auxiliary_loss_mlp": 0.01038888, + "balance_loss_clip": 1.03486967, + "balance_loss_mlp": 1.0233072, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.3400641931409836, + "language_loss": 0.75404948, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77522528, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 2.5904595851898193 + }, + { + "auxiliary_loss_clip": 0.01100894, + "auxiliary_loss_mlp": 0.01037499, + "balance_loss_clip": 1.03878462, + "balance_loss_mlp": 1.02358103, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.7728015896077816, + "language_loss": 0.74204588, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76342976, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 2.5415310859680176 + }, + { + "auxiliary_loss_clip": 0.01114264, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.03841341, + "balance_loss_mlp": 1.02266395, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 1.5954405280484731, + "language_loss": 0.67889833, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70040751, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 2.5387837886810303 + }, + { + "auxiliary_loss_clip": 0.01085165, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.0349946, + "balance_loss_mlp": 1.02604949, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.7340270453582005, + "language_loss": 0.75374258, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77499795, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 2.5813891887664795 + }, + { + "auxiliary_loss_clip": 0.01091489, + "auxiliary_loss_mlp": 0.01034885, + "balance_loss_clip": 1.03813815, + "balance_loss_mlp": 1.01988792, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.7744663091799782, + "language_loss": 0.74942315, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77068686, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 2.577240228652954 + }, + { + "auxiliary_loss_clip": 0.01020004, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.01274788, + "balance_loss_mlp": 1.00612569, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8467393385460097, + "language_loss": 0.5825696, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60284388, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 4.448234796524048 + }, + { + "auxiliary_loss_clip": 0.01106587, + "auxiliary_loss_mlp": 0.01038748, + "balance_loss_clip": 1.03708816, + "balance_loss_mlp": 1.02485466, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 2.1913716427817453, + "language_loss": 0.92886263, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95031607, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 2.5789225101470947 + }, + { + "auxiliary_loss_clip": 0.01100735, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.03806698, + "balance_loss_mlp": 1.02365696, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 3.664559906054064, + "language_loss": 0.82932365, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.85070866, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 2.5527501106262207 + }, + { + "auxiliary_loss_clip": 0.00989865, + "auxiliary_loss_mlp": 0.01002867, + "balance_loss_clip": 1.00560665, + "balance_loss_mlp": 1.00144291, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7570456499038023, + "language_loss": 0.59872246, + "learning_rate": 2.356199538526593e-06, + "loss": 0.61864984, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 3.1120269298553467 + }, + { + "auxiliary_loss_clip": 0.01093883, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.03641438, + "balance_loss_mlp": 1.01951611, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.5989392270593912, + "language_loss": 0.72583878, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74711668, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.6346561908721924 + }, + { + "auxiliary_loss_clip": 0.01080928, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.03564465, + "balance_loss_mlp": 1.02621722, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.8383061252844342, + "language_loss": 0.66773367, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.68895483, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 2.688366174697876 + }, + { + "auxiliary_loss_clip": 0.01099679, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.03544664, + "balance_loss_mlp": 1.01735926, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.4915942300756377, + "language_loss": 0.78613174, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80744219, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 2.6047494411468506 + }, + { + "auxiliary_loss_clip": 0.01051385, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.03425801, + "balance_loss_mlp": 1.02652156, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 2.203607164848142, + "language_loss": 0.68970251, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71061856, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.7655489444732666 + }, + { + "auxiliary_loss_clip": 0.01106639, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.03780317, + "balance_loss_mlp": 1.02396226, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 2.136033119122938, + "language_loss": 0.8436023, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86507136, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 2.596987724304199 + }, + { + "auxiliary_loss_clip": 0.01088821, + "auxiliary_loss_mlp": 0.00750032, + "balance_loss_clip": 1.03785956, + "balance_loss_mlp": 1.00056148, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.4317994511148013, + "language_loss": 0.75571764, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77410614, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.6174681186676025 + }, + { + "auxiliary_loss_clip": 0.01065371, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.0333035, + "balance_loss_mlp": 1.01913595, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.6674038920573604, + "language_loss": 0.75742936, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.77841657, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.756972312927246 + }, + { + "auxiliary_loss_clip": 0.01070462, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.03781343, + "balance_loss_mlp": 1.02250171, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 1.9746956505418802, + "language_loss": 0.66323876, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68432891, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.6559860706329346 + }, + { + "auxiliary_loss_clip": 0.01084366, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.03314734, + "balance_loss_mlp": 1.02532697, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.6164771451460622, + "language_loss": 0.78965068, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81089264, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.6203207969665527 + }, + { + "auxiliary_loss_clip": 0.01071518, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.03313911, + "balance_loss_mlp": 1.019256, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 1.6752974362041586, + "language_loss": 0.67782348, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69886839, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.01087103, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.03419983, + "balance_loss_mlp": 1.0205822, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.7334851881508904, + "language_loss": 0.81035525, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83156443, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 2.6865150928497314 + }, + { + "auxiliary_loss_clip": 0.01113805, + "auxiliary_loss_mlp": 0.00749933, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.00054383, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 1.8858483326711366, + "language_loss": 0.70754647, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72618383, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.658379077911377 + }, + { + "auxiliary_loss_clip": 0.01024067, + "auxiliary_loss_mlp": 0.01001042, + "balance_loss_clip": 1.01064086, + "balance_loss_mlp": 0.99928969, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9462390753813259, + "language_loss": 0.62088162, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64113271, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 3.225620746612549 + }, + { + "auxiliary_loss_clip": 0.01057566, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_clip": 1.03377438, + "balance_loss_mlp": 1.02767587, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 1.6281325615401328, + "language_loss": 0.68659425, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70760536, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.7406058311462402 + }, + { + "auxiliary_loss_clip": 0.01095872, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.03408527, + "balance_loss_mlp": 1.02610481, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.7044096552015389, + "language_loss": 0.77304578, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79440129, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 2.531163454055786 + }, + { + "auxiliary_loss_clip": 0.01093338, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_clip": 1.03771305, + "balance_loss_mlp": 1.02946997, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.7497478797376662, + "language_loss": 0.74992537, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77130616, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.6056692600250244 + }, + { + "auxiliary_loss_clip": 0.01085459, + "auxiliary_loss_mlp": 0.01041026, + "balance_loss_clip": 1.03527749, + "balance_loss_mlp": 1.0249685, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 3.887827256089278, + "language_loss": 0.80156863, + "learning_rate": 2.349682601310998e-06, + "loss": 0.82283354, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.552809715270996 + }, + { + "auxiliary_loss_clip": 0.01097761, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.03696918, + "balance_loss_mlp": 1.01962066, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 2.6480887843313465, + "language_loss": 0.7331813, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75449097, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 3.979891538619995 + }, + { + "auxiliary_loss_clip": 0.0107926, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.0366838, + "balance_loss_mlp": 1.02153182, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.5494806180561402, + "language_loss": 0.72378206, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74492186, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.605377674102783 + }, + { + "auxiliary_loss_clip": 0.01082792, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.03709996, + "balance_loss_mlp": 1.02444053, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 7.759753344323293, + "language_loss": 0.77932787, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80053627, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.650676727294922 + }, + { + "auxiliary_loss_clip": 0.01066936, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.03472114, + "balance_loss_mlp": 1.02201235, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 2.260983550435389, + "language_loss": 0.73975682, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76079679, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.906229019165039 + }, + { + "auxiliary_loss_clip": 0.01067376, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.03765666, + "balance_loss_mlp": 1.02061129, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.4987635441372515, + "language_loss": 0.76254773, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78355753, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.773266315460205 + }, + { + "auxiliary_loss_clip": 0.01058888, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.03765011, + "balance_loss_mlp": 1.02232242, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 1.671342232197918, + "language_loss": 0.78065783, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80159855, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 2.709946870803833 + }, + { + "auxiliary_loss_clip": 0.01092577, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.03440332, + "balance_loss_mlp": 1.02233958, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 3.380180882174435, + "language_loss": 0.82726395, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84856355, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.714639186859131 + }, + { + "auxiliary_loss_clip": 0.01097835, + "auxiliary_loss_mlp": 0.01039074, + "balance_loss_clip": 1.03465176, + "balance_loss_mlp": 1.02559757, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6960382035608974, + "language_loss": 0.63627636, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.65764546, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 2.574626922607422 + }, + { + "auxiliary_loss_clip": 0.01018756, + "auxiliary_loss_mlp": 0.01006693, + "balance_loss_clip": 1.01559043, + "balance_loss_mlp": 1.0050118, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6890077385564648, + "language_loss": 0.55845845, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57871294, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 3.2666380405426025 + }, + { + "auxiliary_loss_clip": 0.01105169, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.03969026, + "balance_loss_mlp": 1.02460599, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 3.2429093237865994, + "language_loss": 0.71238244, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73381335, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.5349223613739014 + }, + { + "auxiliary_loss_clip": 0.01083299, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.03517962, + "balance_loss_mlp": 1.01973343, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 2.206826236697112, + "language_loss": 0.70981055, + "learning_rate": 2.345463713066195e-06, + "loss": 0.7309742, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.8174614906311035 + }, + { + "auxiliary_loss_clip": 0.01087342, + "auxiliary_loss_mlp": 0.01037671, + "balance_loss_clip": 1.03408861, + "balance_loss_mlp": 1.02398586, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.4499284972260795, + "language_loss": 0.65624642, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67749655, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 4.247010946273804 + }, + { + "auxiliary_loss_clip": 0.01031637, + "auxiliary_loss_mlp": 0.01000752, + "balance_loss_clip": 1.00851107, + "balance_loss_mlp": 0.99930364, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7003452502517515, + "language_loss": 0.58679825, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60712212, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 3.0986146926879883 + }, + { + "auxiliary_loss_clip": 0.01001403, + "auxiliary_loss_mlp": 0.01003293, + "balance_loss_clip": 1.00781322, + "balance_loss_mlp": 1.00188673, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7931949469250746, + "language_loss": 0.62689662, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64694369, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 4.482827663421631 + }, + { + "auxiliary_loss_clip": 0.01088485, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.03726649, + "balance_loss_mlp": 1.0213387, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 16.10395435558574, + "language_loss": 0.75743449, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.77866161, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 2.6230714321136475 + }, + { + "auxiliary_loss_clip": 0.01114296, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.03996193, + "balance_loss_mlp": 1.02384186, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.920514007787551, + "language_loss": 0.66532367, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68684429, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 2.5483930110931396 + }, + { + "auxiliary_loss_clip": 0.01078028, + "auxiliary_loss_mlp": 0.01038414, + "balance_loss_clip": 1.03607106, + "balance_loss_mlp": 1.02533102, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 1.8659100301638656, + "language_loss": 0.697891, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71905547, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 2.739156723022461 + }, + { + "auxiliary_loss_clip": 0.01120512, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_clip": 1.04325092, + "balance_loss_mlp": 1.02976859, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 2.1592215877707823, + "language_loss": 0.63655937, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65820086, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 2.583489179611206 + }, + { + "auxiliary_loss_clip": 0.01098326, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.03694046, + "balance_loss_mlp": 1.01798916, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.6900349125984289, + "language_loss": 0.67048991, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69178081, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 2.5922040939331055 + }, + { + "auxiliary_loss_clip": 0.01068859, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.03422523, + "balance_loss_mlp": 1.02761722, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.4663616663165118, + "language_loss": 0.74150407, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76260591, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 4.3074235916137695 + }, + { + "auxiliary_loss_clip": 0.01113023, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.03943968, + "balance_loss_mlp": 1.0218271, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.8968951578282827, + "language_loss": 0.76435649, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78583533, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 2.5965523719787598 + }, + { + "auxiliary_loss_clip": 0.01119551, + "auxiliary_loss_mlp": 0.01040506, + "balance_loss_clip": 1.04117775, + "balance_loss_mlp": 1.02664161, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 1.694916612445548, + "language_loss": 0.79426181, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.81586242, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 2.518115520477295 + }, + { + "auxiliary_loss_clip": 0.01068498, + "auxiliary_loss_mlp": 0.01045269, + "balance_loss_clip": 1.03846085, + "balance_loss_mlp": 1.03067183, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 1.6236954893598206, + "language_loss": 0.66371024, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68484795, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 2.730637550354004 + }, + { + "auxiliary_loss_clip": 0.01091676, + "auxiliary_loss_mlp": 0.00749989, + "balance_loss_clip": 1.03729129, + "balance_loss_mlp": 1.00060415, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.009918439901715, + "language_loss": 0.74194717, + "learning_rate": 2.340475712142296e-06, + "loss": 0.76036376, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 2.601987600326538 + }, + { + "auxiliary_loss_clip": 0.01048653, + "auxiliary_loss_mlp": 0.01034333, + "balance_loss_clip": 1.03835797, + "balance_loss_mlp": 1.01971185, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.1330252201793187, + "language_loss": 0.74530286, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76613271, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 2.7718310356140137 + }, + { + "auxiliary_loss_clip": 0.01059363, + "auxiliary_loss_mlp": 0.0074988, + "balance_loss_clip": 1.03338194, + "balance_loss_mlp": 1.00058925, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.6003852531737233, + "language_loss": 0.7874068, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80549926, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 2.6648664474487305 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.0103862, + "balance_loss_clip": 1.03589821, + "balance_loss_mlp": 1.02353954, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 2.639203012066602, + "language_loss": 0.56716973, + "learning_rate": 2.339324323980964e-06, + "loss": 0.58855247, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.5942471027374268 + }, + { + "auxiliary_loss_clip": 0.01102446, + "auxiliary_loss_mlp": 0.0103662, + "balance_loss_clip": 1.03766787, + "balance_loss_mlp": 1.02254725, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.272367196052685, + "language_loss": 0.83191228, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85330302, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.6142289638519287 + }, + { + "auxiliary_loss_clip": 0.01092834, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.03956652, + "balance_loss_mlp": 1.01648569, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.6433279752076912, + "language_loss": 0.75287449, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77410007, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.6143064498901367 + }, + { + "auxiliary_loss_clip": 0.01078975, + "auxiliary_loss_mlp": 0.01042233, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.02786827, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 1.6051239426838138, + "language_loss": 0.7417711, + "learning_rate": 2.338172820014723e-06, + "loss": 0.7629832, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.7425475120544434 + }, + { + "auxiliary_loss_clip": 0.01068695, + "auxiliary_loss_mlp": 0.01043699, + "balance_loss_clip": 1.03573728, + "balance_loss_mlp": 1.02832675, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.5156151580643094, + "language_loss": 0.85536909, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87649304, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.635488986968994 + }, + { + "auxiliary_loss_clip": 0.01094328, + "auxiliary_loss_mlp": 0.0103866, + "balance_loss_clip": 1.0394609, + "balance_loss_mlp": 1.02536798, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 2.114566632858523, + "language_loss": 0.79457426, + "learning_rate": 2.337405086561902e-06, + "loss": 0.8159042, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 2.6619796752929688 + }, + { + "auxiliary_loss_clip": 0.01097718, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.03698242, + "balance_loss_mlp": 1.02210379, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.5899136316142746, + "language_loss": 0.72166276, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.742993, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 2.5359179973602295 + }, + { + "auxiliary_loss_clip": 0.0109093, + "auxiliary_loss_mlp": 0.01044596, + "balance_loss_clip": 1.03738701, + "balance_loss_mlp": 1.03035617, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.6245031163463735, + "language_loss": 0.69057977, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71193504, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 2.5546936988830566 + }, + { + "auxiliary_loss_clip": 0.01112833, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.04062653, + "balance_loss_mlp": 1.01782966, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.877594408745218, + "language_loss": 0.84491307, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.86635029, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.5917022228240967 + }, + { + "auxiliary_loss_clip": 0.01111079, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.03773165, + "balance_loss_mlp": 1.02430403, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 2.7028664232575736, + "language_loss": 0.71212643, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73361349, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 2.618149995803833 + }, + { + "auxiliary_loss_clip": 0.01054378, + "auxiliary_loss_mlp": 0.01037434, + "balance_loss_clip": 1.03419757, + "balance_loss_mlp": 1.02308714, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.7872788176407857, + "language_loss": 0.71714395, + "learning_rate": 2.335485529281996e-06, + "loss": 0.73806208, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.7433226108551025 + }, + { + "auxiliary_loss_clip": 0.01111443, + "auxiliary_loss_mlp": 0.00749797, + "balance_loss_clip": 1.0385437, + "balance_loss_mlp": 1.00055039, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 2.0003570673271724, + "language_loss": 0.72276413, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74137652, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 2.5561304092407227 + }, + { + "auxiliary_loss_clip": 0.01061997, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.03363574, + "balance_loss_mlp": 1.02543283, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 2.0747097102187633, + "language_loss": 0.64997268, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67098606, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 2.7880606651306152 + }, + { + "auxiliary_loss_clip": 0.01087548, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.03687871, + "balance_loss_mlp": 1.01744723, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 3.0501473220439737, + "language_loss": 0.72759104, + "learning_rate": 2.33433364213785e-06, + "loss": 0.74876773, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.624040365219116 + }, + { + "auxiliary_loss_clip": 0.0108947, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.03684688, + "balance_loss_mlp": 1.02098954, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.6537589933419232, + "language_loss": 0.68812817, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70937514, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 4.265545845031738 + }, + { + "auxiliary_loss_clip": 0.0110223, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.0385921, + "balance_loss_mlp": 1.01632071, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 1.8600998231906556, + "language_loss": 0.80535012, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.82667226, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.6072635650634766 + }, + { + "auxiliary_loss_clip": 0.01098119, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.03709674, + "balance_loss_mlp": 1.02137351, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.8112495711466168, + "language_loss": 0.7734037, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79472923, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.56516695022583 + }, + { + "auxiliary_loss_clip": 0.01083447, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.03742528, + "balance_loss_mlp": 1.01785684, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.8052817715254694, + "language_loss": 0.701765, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.7229085, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.6645162105560303 + }, + { + "auxiliary_loss_clip": 0.01093214, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.03674459, + "balance_loss_mlp": 1.02084291, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 2.260661095833777, + "language_loss": 0.61146855, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63276142, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 2.8139402866363525 + }, + { + "auxiliary_loss_clip": 0.01076131, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.03702521, + "balance_loss_mlp": 1.02018476, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 3.0516210065429696, + "language_loss": 0.7814405, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.80254412, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 2.747515916824341 + }, + { + "auxiliary_loss_clip": 0.01115997, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.04061329, + "balance_loss_mlp": 1.02668428, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.7057279995061772, + "language_loss": 0.76964343, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.7912119, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.543773889541626 + }, + { + "auxiliary_loss_clip": 0.01102625, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.03720951, + "balance_loss_mlp": 1.0223093, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 1.9821215889798596, + "language_loss": 0.73490512, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75630736, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 2.667043447494507 + }, + { + "auxiliary_loss_clip": 0.01085993, + "auxiliary_loss_mlp": 0.01044465, + "balance_loss_clip": 1.0381999, + "balance_loss_mlp": 1.02965903, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.3292305499190982, + "language_loss": 0.71497691, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73628139, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 2.6011459827423096 + }, + { + "auxiliary_loss_clip": 0.01088956, + "auxiliary_loss_mlp": 0.0104343, + "balance_loss_clip": 1.03714204, + "balance_loss_mlp": 1.02733111, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 1.5732263167576035, + "language_loss": 0.73100591, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75232983, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.7176530361175537 + }, + { + "auxiliary_loss_clip": 0.01076867, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.03610468, + "balance_loss_mlp": 1.02417517, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.7326927567957662, + "language_loss": 0.58711004, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60827172, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 2.7694380283355713 + }, + { + "auxiliary_loss_clip": 0.01100857, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.0378896, + "balance_loss_mlp": 1.02142715, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 2.192446390115716, + "language_loss": 0.70132858, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72268742, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 4.112663269042969 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.04059911, + "balance_loss_mlp": 1.02504444, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 1.8038371889144362, + "language_loss": 0.69021964, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.71180665, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 2.5343337059020996 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.03883958, + "balance_loss_mlp": 1.01665008, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.6288913032978691, + "language_loss": 0.80799508, + "learning_rate": 2.328956666474691e-06, + "loss": 0.82947105, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 4.017656326293945 + }, + { + "auxiliary_loss_clip": 0.01112439, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.03781247, + "balance_loss_mlp": 1.02319479, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.6536473067455117, + "language_loss": 0.73388386, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75538141, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 2.525141954421997 + }, + { + "auxiliary_loss_clip": 0.01111135, + "auxiliary_loss_mlp": 0.00749889, + "balance_loss_clip": 1.03752697, + "balance_loss_mlp": 1.00048256, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 2.5032605539839032, + "language_loss": 0.70561576, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.724226, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.678189277648926 + }, + { + "auxiliary_loss_clip": 0.01090911, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.041502, + "balance_loss_mlp": 1.02339816, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 3.231310935056062, + "language_loss": 0.86769307, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88898551, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 2.625434637069702 + }, + { + "auxiliary_loss_clip": 0.01011878, + "auxiliary_loss_mlp": 0.01007489, + "balance_loss_clip": 1.00852132, + "balance_loss_mlp": 1.00580847, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7205321800732922, + "language_loss": 0.55076379, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57095742, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 3.193997859954834 + }, + { + "auxiliary_loss_clip": 0.01088928, + "auxiliary_loss_mlp": 0.01043109, + "balance_loss_clip": 1.03678966, + "balance_loss_mlp": 1.0285418, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.237264770655671, + "language_loss": 0.79651392, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81783426, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 2.5951361656188965 + }, + { + "auxiliary_loss_clip": 0.01117344, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.03999841, + "balance_loss_mlp": 1.02347815, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.5974226444847883, + "language_loss": 0.78196615, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80351603, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 4.123611927032471 + }, + { + "auxiliary_loss_clip": 0.01013412, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.0301708, + "balance_loss_mlp": 1.01965749, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.6095685616407645, + "language_loss": 0.68282217, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70329338, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 3.0587239265441895 + }, + { + "auxiliary_loss_clip": 0.01095076, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.03659415, + "balance_loss_mlp": 1.02146637, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.4597391662729295, + "language_loss": 0.67126226, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69256872, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.860468626022339 + }, + { + "auxiliary_loss_clip": 0.01096541, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.03747272, + "balance_loss_mlp": 1.02434254, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.869807415178978, + "language_loss": 0.64885587, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67018902, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 2.6971333026885986 + }, + { + "auxiliary_loss_clip": 0.01085171, + "auxiliary_loss_mlp": 0.00750037, + "balance_loss_clip": 1.03708446, + "balance_loss_mlp": 1.00065994, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 2.547343121991516, + "language_loss": 0.74680543, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.76515758, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.684720039367676 + }, + { + "auxiliary_loss_clip": 0.0109002, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.03730989, + "balance_loss_mlp": 1.02780461, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 1.929091085203001, + "language_loss": 0.78128505, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80260801, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 2.7084741592407227 + }, + { + "auxiliary_loss_clip": 0.01072513, + "auxiliary_loss_mlp": 0.01038037, + "balance_loss_clip": 1.03354883, + "balance_loss_mlp": 1.02392256, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 1.8059384647108334, + "language_loss": 0.76439971, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78550518, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.711954355239868 + }, + { + "auxiliary_loss_clip": 0.01086419, + "auxiliary_loss_mlp": 0.01046627, + "balance_loss_clip": 1.03850508, + "balance_loss_mlp": 1.03096259, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 2.7733583842719485, + "language_loss": 0.80049354, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82182395, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.631937026977539 + }, + { + "auxiliary_loss_clip": 0.01110949, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.03762627, + "balance_loss_mlp": 1.02798045, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.4989058850849009, + "language_loss": 0.76923311, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79075545, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.5211000442504883 + }, + { + "auxiliary_loss_clip": 0.01069692, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.03390622, + "balance_loss_mlp": 1.01817179, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 2.8309327571897214, + "language_loss": 0.65824085, + "learning_rate": 2.323192909069061e-06, + "loss": 0.67925423, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 2.742055654525757 + }, + { + "auxiliary_loss_clip": 0.01088521, + "auxiliary_loss_mlp": 0.01041445, + "balance_loss_clip": 1.03544223, + "balance_loss_mlp": 1.02541113, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.051282322206819, + "language_loss": 0.72795898, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74925864, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 2.6422746181488037 + }, + { + "auxiliary_loss_clip": 0.0103062, + "auxiliary_loss_mlp": 0.01011043, + "balance_loss_clip": 1.00770617, + "balance_loss_mlp": 1.0093981, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2645699057205015, + "language_loss": 0.51950759, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.5399242, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 3.057856798171997 + }, + { + "auxiliary_loss_clip": 0.01092475, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.04065609, + "balance_loss_mlp": 1.01898003, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.2431933995059667, + "language_loss": 0.75515759, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77641356, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 2.648533344268799 + }, + { + "auxiliary_loss_clip": 0.01064669, + "auxiliary_loss_mlp": 0.01041246, + "balance_loss_clip": 1.03499258, + "balance_loss_mlp": 1.02700663, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 1.914477131076628, + "language_loss": 0.69933665, + "learning_rate": 2.321655439354519e-06, + "loss": 0.7203958, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 2.6273410320281982 + }, + { + "auxiliary_loss_clip": 0.0110981, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.03893232, + "balance_loss_mlp": 1.0209471, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.5846592390826781, + "language_loss": 0.72138429, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74282026, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.5848162174224854 + }, + { + "auxiliary_loss_clip": 0.01088865, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.04058039, + "balance_loss_mlp": 1.03030252, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 2.24532555240661, + "language_loss": 0.83321035, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85454917, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.609133720397949 + }, + { + "auxiliary_loss_clip": 0.01020079, + "auxiliary_loss_mlp": 0.01009989, + "balance_loss_clip": 1.00710964, + "balance_loss_mlp": 1.00845087, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7647185508774791, + "language_loss": 0.57843828, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59873897, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 3.1983299255371094 + }, + { + "auxiliary_loss_clip": 0.01092952, + "auxiliary_loss_mlp": 0.01040072, + "balance_loss_clip": 1.0390141, + "balance_loss_mlp": 1.02679229, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.7783844381021592, + "language_loss": 0.8427639, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.8640942, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 2.5786359310150146 + }, + { + "auxiliary_loss_clip": 0.01087388, + "auxiliary_loss_mlp": 0.01043244, + "balance_loss_clip": 1.0372206, + "balance_loss_mlp": 1.02904654, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 3.1148687871772163, + "language_loss": 0.7620337, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78334004, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.6860249042510986 + }, + { + "auxiliary_loss_clip": 0.01084603, + "auxiliary_loss_mlp": 0.01037822, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.02428532, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 5.183739293185186, + "language_loss": 0.81049824, + "learning_rate": 2.319348869158064e-06, + "loss": 0.8317225, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 2.6456377506256104 + }, + { + "auxiliary_loss_clip": 0.01091337, + "auxiliary_loss_mlp": 0.01045011, + "balance_loss_clip": 1.03619003, + "balance_loss_mlp": 1.03030074, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.6392214139665566, + "language_loss": 0.72811812, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74948156, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.6583220958709717 + }, + { + "auxiliary_loss_clip": 0.01075378, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.03592944, + "balance_loss_mlp": 1.0203352, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.3687634669302513, + "language_loss": 0.71352512, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73462701, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 4.102771759033203 + }, + { + "auxiliary_loss_clip": 0.01064165, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.0394733, + "balance_loss_mlp": 1.01765859, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5802438403526085, + "language_loss": 0.85171628, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87265897, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 2.7505197525024414 + }, + { + "auxiliary_loss_clip": 0.01100622, + "auxiliary_loss_mlp": 0.01042062, + "balance_loss_clip": 1.039428, + "balance_loss_mlp": 1.02799559, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.3024060291371848, + "language_loss": 0.73216963, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75359643, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 2.6169989109039307 + }, + { + "auxiliary_loss_clip": 0.01099221, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.039711, + "balance_loss_mlp": 1.0267843, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.793480419345336, + "language_loss": 0.69777262, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.71916378, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 2.8917198181152344 + }, + { + "auxiliary_loss_clip": 0.01069795, + "auxiliary_loss_mlp": 0.01042244, + "balance_loss_clip": 1.03372896, + "balance_loss_mlp": 1.02705717, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 3.4532058845702442, + "language_loss": 0.67131156, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69243193, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.7733848094940186 + }, + { + "auxiliary_loss_clip": 0.01078581, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.03845525, + "balance_loss_mlp": 1.02316618, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.0231445673704913, + "language_loss": 0.63356942, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.654742, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 2.5759429931640625 + }, + { + "auxiliary_loss_clip": 0.01105422, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.03963816, + "balance_loss_mlp": 1.02189839, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 3.184174579854594, + "language_loss": 0.74543715, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.7668609, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 2.5884366035461426 + }, + { + "auxiliary_loss_clip": 0.0109271, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.0403161, + "balance_loss_mlp": 1.01645064, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.2375483644051255, + "language_loss": 0.74197143, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76320672, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.7065482139587402 + }, + { + "auxiliary_loss_clip": 0.01086889, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.04186702, + "balance_loss_mlp": 1.02038574, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 2.253037048738577, + "language_loss": 0.73813426, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.75935221, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 2.701998233795166 + }, + { + "auxiliary_loss_clip": 0.01087212, + "auxiliary_loss_mlp": 0.01039851, + "balance_loss_clip": 1.03756976, + "balance_loss_mlp": 1.02570128, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 13.07931197944574, + "language_loss": 0.69108129, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71235192, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 2.6630141735076904 + }, + { + "auxiliary_loss_clip": 0.01087012, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03904319, + "balance_loss_mlp": 1.02069974, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 2.6138148447957574, + "language_loss": 0.72745371, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.74866271, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 2.6993613243103027 + }, + { + "auxiliary_loss_clip": 0.0109587, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.03979743, + "balance_loss_mlp": 1.01712394, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.5639197289915951, + "language_loss": 0.79088193, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81215727, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 4.722888231277466 + }, + { + "auxiliary_loss_clip": 0.01098472, + "auxiliary_loss_mlp": 0.01033975, + "balance_loss_clip": 1.03828514, + "balance_loss_mlp": 1.02086771, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.736901481332817, + "language_loss": 0.72137755, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74270207, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 2.71867036819458 + }, + { + "auxiliary_loss_clip": 0.01098799, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.03701866, + "balance_loss_mlp": 1.01880765, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.9056382056727914, + "language_loss": 0.78180563, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80311763, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 4.145415544509888 + }, + { + "auxiliary_loss_clip": 0.01070068, + "auxiliary_loss_mlp": 0.01032736, + "balance_loss_clip": 1.03557098, + "balance_loss_mlp": 1.01971221, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 1.894572073171282, + "language_loss": 0.66603726, + "learning_rate": 2.313195892540705e-06, + "loss": 0.6870653, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.7628371715545654 + }, + { + "auxiliary_loss_clip": 0.01079729, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.03569508, + "balance_loss_mlp": 1.03062415, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.6963529721352377, + "language_loss": 0.74987209, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.77111351, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.680809736251831 + }, + { + "auxiliary_loss_clip": 0.01094463, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.04102325, + "balance_loss_mlp": 1.02358246, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.4811484086261884, + "language_loss": 0.7796424, + "learning_rate": 2.312426555462893e-06, + "loss": 0.80095798, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 2.657686471939087 + }, + { + "auxiliary_loss_clip": 0.01088991, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.03894567, + "balance_loss_mlp": 1.01777446, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.6407341588528277, + "language_loss": 0.74235225, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76355374, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 2.664292335510254 + }, + { + "auxiliary_loss_clip": 0.01101424, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.03906727, + "balance_loss_mlp": 1.02270257, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 2.296690962709683, + "language_loss": 0.78813875, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.8095355, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.632171869277954 + }, + { + "auxiliary_loss_clip": 0.01021029, + "auxiliary_loss_mlp": 0.01006428, + "balance_loss_clip": 1.00848556, + "balance_loss_mlp": 1.00499749, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.800719790678713, + "language_loss": 0.59823275, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61850739, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 4.73381781578064 + }, + { + "auxiliary_loss_clip": 0.01072275, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.03415847, + "balance_loss_mlp": 1.02192676, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 3.60271641472953, + "language_loss": 0.78602564, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80712169, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 2.655710458755493 + }, + { + "auxiliary_loss_clip": 0.01084743, + "auxiliary_loss_mlp": 0.01035462, + "balance_loss_clip": 1.04254222, + "balance_loss_mlp": 1.02340364, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.881886728828504, + "language_loss": 0.71919191, + "learning_rate": 2.310503005696839e-06, + "loss": 0.740394, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.649310827255249 + }, + { + "auxiliary_loss_clip": 0.01071062, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.03652143, + "balance_loss_mlp": 1.02140617, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 1.9011639938562448, + "language_loss": 0.78059351, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.80166, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 2.602376699447632 + }, + { + "auxiliary_loss_clip": 0.0109088, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.03385603, + "balance_loss_mlp": 1.02237058, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 2.486852309760725, + "language_loss": 0.64668918, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.66795611, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 2.551696300506592 + }, + { + "auxiliary_loss_clip": 0.01101883, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.03878558, + "balance_loss_mlp": 1.02001405, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.9790202983573077, + "language_loss": 0.7456513, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76700348, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 2.6852314472198486 + }, + { + "auxiliary_loss_clip": 0.01083761, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.03753257, + "balance_loss_mlp": 1.01956677, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.6010533740779336, + "language_loss": 0.70973301, + "learning_rate": 2.308963953858982e-06, + "loss": 0.73090035, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.6566295623779297 + }, + { + "auxiliary_loss_clip": 0.01109987, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.03656769, + "balance_loss_mlp": 1.02332544, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.7172346474981142, + "language_loss": 0.80846286, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.82992554, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.5794732570648193 + }, + { + "auxiliary_loss_clip": 0.01028131, + "auxiliary_loss_mlp": 0.01001237, + "balance_loss_clip": 1.00531077, + "balance_loss_mlp": 0.99980658, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7959788008219199, + "language_loss": 0.55682611, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57711983, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.0732510089874268 + }, + { + "auxiliary_loss_clip": 0.01093749, + "auxiliary_loss_mlp": 0.00749861, + "balance_loss_clip": 1.03469479, + "balance_loss_mlp": 1.00077581, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.1091700925742596, + "language_loss": 0.65543127, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.6738674, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.6236865520477295 + }, + { + "auxiliary_loss_clip": 0.01095562, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.037884, + "balance_loss_mlp": 1.02022839, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 2.040583092765821, + "language_loss": 0.63508141, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65636742, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.616072416305542 + }, + { + "auxiliary_loss_clip": 0.01092139, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.0379051, + "balance_loss_mlp": 1.02250719, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 2.089841978245636, + "language_loss": 0.80144548, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82273751, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.63105845451355 + }, + { + "auxiliary_loss_clip": 0.01077202, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.03865433, + "balance_loss_mlp": 1.01655412, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.5322794089092846, + "language_loss": 0.77479243, + "learning_rate": 2.306655024915726e-06, + "loss": 0.7958678, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 2.6254303455352783 + }, + { + "auxiliary_loss_clip": 0.01075247, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.03529692, + "balance_loss_mlp": 1.02275181, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.7030187909030494, + "language_loss": 0.69641465, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71753019, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 2.6302363872528076 + }, + { + "auxiliary_loss_clip": 0.01101362, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.03918982, + "balance_loss_mlp": 1.02096844, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.4143516657691655, + "language_loss": 0.73753035, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75887573, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.5936763286590576 + }, + { + "auxiliary_loss_clip": 0.01098823, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.03683186, + "balance_loss_mlp": 1.01973176, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.6358288749326206, + "language_loss": 0.69514644, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71646309, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 2.5876781940460205 + }, + { + "auxiliary_loss_clip": 0.01096465, + "auxiliary_loss_mlp": 0.01040727, + "balance_loss_clip": 1.03537476, + "balance_loss_mlp": 1.0271436, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 2.229076809072742, + "language_loss": 0.7340914, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75546336, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.590182304382324 + }, + { + "auxiliary_loss_clip": 0.01063825, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.03489721, + "balance_loss_mlp": 1.02291822, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5489359269635945, + "language_loss": 0.72456658, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74556041, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 2.6685128211975098 + }, + { + "auxiliary_loss_clip": 0.01075235, + "auxiliary_loss_mlp": 0.01054353, + "balance_loss_clip": 1.0321101, + "balance_loss_mlp": 1.03859353, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.552831502562233, + "language_loss": 0.74330592, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76460177, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 2.6986701488494873 + }, + { + "auxiliary_loss_clip": 0.01099703, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.03551316, + "balance_loss_mlp": 1.02251744, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.8803904815650516, + "language_loss": 0.62518388, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.6465413, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.699897527694702 + }, + { + "auxiliary_loss_clip": 0.01090786, + "auxiliary_loss_mlp": 0.01042722, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.02990651, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.8752602455770853, + "language_loss": 0.6321547, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65348983, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 2.693887710571289 + }, + { + "auxiliary_loss_clip": 0.01104401, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.04101479, + "balance_loss_mlp": 1.02200532, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.7185250009092163, + "language_loss": 0.67699355, + "learning_rate": 2.303190847569801e-06, + "loss": 0.69839901, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 4.159752607345581 + }, + { + "auxiliary_loss_clip": 0.01078821, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.03586423, + "balance_loss_mlp": 1.0190804, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 2.2753711473145217, + "language_loss": 0.84156686, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.86266398, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 2.628777265548706 + }, + { + "auxiliary_loss_clip": 0.0107514, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.03620076, + "balance_loss_mlp": 1.02048254, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 2.075102773651516, + "language_loss": 0.77241337, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79350752, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.72710919380188 + }, + { + "auxiliary_loss_clip": 0.01094271, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03587723, + "balance_loss_mlp": 1.0204016, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 2.2944858673495996, + "language_loss": 0.74070078, + "learning_rate": 2.302035914315856e-06, + "loss": 0.76196927, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.530482530593872 + }, + { + "auxiliary_loss_clip": 0.01077809, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.03544474, + "balance_loss_mlp": 1.02322853, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.7043723948938072, + "language_loss": 0.65237153, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67351454, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 2.6886441707611084 + }, + { + "auxiliary_loss_clip": 0.01095974, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.03565371, + "balance_loss_mlp": 1.02155161, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.6170341225227547, + "language_loss": 0.64119995, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66248828, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.6148416996002197 + }, + { + "auxiliary_loss_clip": 0.01019806, + "auxiliary_loss_mlp": 0.01004927, + "balance_loss_clip": 1.00746632, + "balance_loss_mlp": 1.00347269, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.755524141428325, + "language_loss": 0.61932743, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63957477, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 3.223787307739258 + }, + { + "auxiliary_loss_clip": 0.01065834, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.03700614, + "balance_loss_mlp": 1.02366459, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.6763793397614462, + "language_loss": 0.78934228, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81037629, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 2.6149096488952637 + }, + { + "auxiliary_loss_clip": 0.01098828, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.03704226, + "balance_loss_mlp": 1.02539527, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.5540042792055635, + "language_loss": 0.7554059, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.7767756, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 2.6839804649353027 + }, + { + "auxiliary_loss_clip": 0.01068662, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.02561951, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.470752968243087, + "language_loss": 0.68243277, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70350957, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 2.6354618072509766 + }, + { + "auxiliary_loss_clip": 0.01097779, + "auxiliary_loss_mlp": 0.00749654, + "balance_loss_clip": 1.03786659, + "balance_loss_mlp": 1.00081074, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.532654969540171, + "language_loss": 0.74274039, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76121467, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 2.6026692390441895 + }, + { + "auxiliary_loss_clip": 0.01079899, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.02353632, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.477923132830119, + "language_loss": 0.63059205, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65176153, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 4.260208368301392 + }, + { + "auxiliary_loss_clip": 0.01066329, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.03476286, + "balance_loss_mlp": 1.01446104, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 2.1788587296926476, + "language_loss": 0.68345159, + "learning_rate": 2.298570497656304e-06, + "loss": 0.7043941, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.817125082015991 + }, + { + "auxiliary_loss_clip": 0.0110905, + "auxiliary_loss_mlp": 0.00749829, + "balance_loss_clip": 1.03715587, + "balance_loss_mlp": 1.00078738, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.6727577402115796, + "language_loss": 0.69851184, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.71710062, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 4.204024791717529 + }, + { + "auxiliary_loss_clip": 0.01085916, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.03735971, + "balance_loss_mlp": 1.0231595, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 3.528787385608522, + "language_loss": 0.66960609, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69083774, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 2.6792426109313965 + }, + { + "auxiliary_loss_clip": 0.01018164, + "auxiliary_loss_mlp": 0.01001379, + "balance_loss_clip": 1.00560713, + "balance_loss_mlp": 1.00003147, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9300091118386639, + "language_loss": 0.64490974, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.6651051, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 3.2954797744750977 + }, + { + "auxiliary_loss_clip": 0.01079956, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.03811765, + "balance_loss_mlp": 1.0155164, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.4513181799472616, + "language_loss": 0.72307289, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74415559, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 2.6600229740142822 + }, + { + "auxiliary_loss_clip": 0.01105707, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.03694344, + "balance_loss_mlp": 1.02327669, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 1.6417282507498727, + "language_loss": 0.71674263, + "learning_rate": 2.296644869233568e-06, + "loss": 0.73814476, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.5381438732147217 + }, + { + "auxiliary_loss_clip": 0.010736, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.03398693, + "balance_loss_mlp": 1.0280118, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.4140589037941775, + "language_loss": 0.62593144, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64709306, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.6222777366638184 + }, + { + "auxiliary_loss_clip": 0.01108965, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.03633964, + "balance_loss_mlp": 1.02760184, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 4.593015760945052, + "language_loss": 0.73723829, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75873381, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 4.0987255573272705 + }, + { + "auxiliary_loss_clip": 0.01082665, + "auxiliary_loss_mlp": 0.00749778, + "balance_loss_clip": 1.03497314, + "balance_loss_mlp": 1.00069845, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 1.6253081737409922, + "language_loss": 0.77345443, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.7917788, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.6124465465545654 + }, + { + "auxiliary_loss_clip": 0.01085431, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.03807664, + "balance_loss_mlp": 1.01620412, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.7136926280131515, + "language_loss": 0.77439749, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79554117, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 2.59067440032959 + }, + { + "auxiliary_loss_clip": 0.0111716, + "auxiliary_loss_mlp": 0.01042371, + "balance_loss_clip": 1.04022312, + "balance_loss_mlp": 1.02839398, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.5407063942140309, + "language_loss": 0.82856011, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85015541, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.595874071121216 + }, + { + "auxiliary_loss_clip": 0.01089021, + "auxiliary_loss_mlp": 0.01037757, + "balance_loss_clip": 1.0374006, + "balance_loss_mlp": 1.02445292, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 3.9822596776406343, + "language_loss": 0.7707988, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79206657, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.6869101524353027 + }, + { + "auxiliary_loss_clip": 0.0108806, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.03865385, + "balance_loss_mlp": 1.01967049, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.7269437728587842, + "language_loss": 0.51193494, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53315252, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.5001111030578613 + }, + { + "auxiliary_loss_clip": 0.00989699, + "auxiliary_loss_mlp": 0.01002963, + "balance_loss_clip": 1.00817764, + "balance_loss_mlp": 1.00160396, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.7848608351122994, + "language_loss": 0.57790571, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59783232, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 3.131243944168091 + }, + { + "auxiliary_loss_clip": 0.01066337, + "auxiliary_loss_mlp": 0.01038518, + "balance_loss_clip": 1.03795874, + "balance_loss_mlp": 1.02539921, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.460248483276356, + "language_loss": 0.71531546, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.73636401, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.807734966278076 + }, + { + "auxiliary_loss_clip": 0.01100515, + "auxiliary_loss_mlp": 0.01036335, + "balance_loss_clip": 1.03746796, + "balance_loss_mlp": 1.02318656, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 1.9836575506697973, + "language_loss": 0.81186074, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83322924, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.643195867538452 + }, + { + "auxiliary_loss_clip": 0.01093817, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.03678548, + "balance_loss_mlp": 1.02034366, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.583902572214081, + "language_loss": 0.80615324, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82743132, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.5751171112060547 + }, + { + "auxiliary_loss_clip": 0.01049162, + "auxiliary_loss_mlp": 0.01037497, + "balance_loss_clip": 1.03218246, + "balance_loss_mlp": 1.024652, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.7514864836422959, + "language_loss": 0.74208033, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76294684, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.7198543548583984 + }, + { + "auxiliary_loss_clip": 0.01084635, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.01822579, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.1512352287109504, + "language_loss": 0.84454793, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86571908, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.6330251693725586 + }, + { + "auxiliary_loss_clip": 0.01085441, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.03511763, + "balance_loss_mlp": 1.02318311, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.9386927696160772, + "language_loss": 0.81729943, + "learning_rate": 2.291251619387217e-06, + "loss": 0.838512, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 2.570758819580078 + }, + { + "auxiliary_loss_clip": 0.01070307, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.03909516, + "balance_loss_mlp": 1.02104771, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 1.9016222825006155, + "language_loss": 0.77469987, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79576027, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.687955379486084 + }, + { + "auxiliary_loss_clip": 0.01028295, + "auxiliary_loss_mlp": 0.01003323, + "balance_loss_clip": 1.00614214, + "balance_loss_mlp": 1.00174999, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.838369709524598, + "language_loss": 0.59069437, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61101055, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.0414791107177734 + }, + { + "auxiliary_loss_clip": 0.01085083, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.03564477, + "balance_loss_mlp": 1.02026582, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 2.033513684165638, + "language_loss": 0.7932806, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81446314, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 2.6662681102752686 + }, + { + "auxiliary_loss_clip": 0.01109435, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.03621531, + "balance_loss_mlp": 1.0227468, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.6999734279004672, + "language_loss": 0.84109998, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86254829, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.535640001296997 + }, + { + "auxiliary_loss_clip": 0.01078966, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.03617835, + "balance_loss_mlp": 1.02105796, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.28527494404898, + "language_loss": 0.76137221, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78251487, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 2.586883783340454 + }, + { + "auxiliary_loss_clip": 0.01096492, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_clip": 1.04014719, + "balance_loss_mlp": 1.02819717, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 2.393393505756177, + "language_loss": 0.74578315, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76716208, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 2.526357889175415 + }, + { + "auxiliary_loss_clip": 0.01108378, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.03748667, + "balance_loss_mlp": 1.03044438, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 2.224578054091616, + "language_loss": 0.88964868, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91116357, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 2.560398578643799 + }, + { + "auxiliary_loss_clip": 0.01095087, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.03837466, + "balance_loss_mlp": 1.02383816, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.6147260570874562, + "language_loss": 0.79500592, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.8163197, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 4.203354597091675 + }, + { + "auxiliary_loss_clip": 0.01010022, + "auxiliary_loss_mlp": 0.01013848, + "balance_loss_clip": 1.00998306, + "balance_loss_mlp": 1.01215518, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6958842811138609, + "language_loss": 0.5667302, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.5869689, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.2720048427581787 + }, + { + "auxiliary_loss_clip": 0.01083452, + "auxiliary_loss_mlp": 0.01040997, + "balance_loss_clip": 1.03388286, + "balance_loss_mlp": 1.02659011, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.8173819417591415, + "language_loss": 0.81031346, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83155793, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 2.621483087539673 + }, + { + "auxiliary_loss_clip": 0.01090014, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.03794932, + "balance_loss_mlp": 1.01968515, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.5268086263952712, + "language_loss": 0.66822791, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68946183, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.6669516563415527 + }, + { + "auxiliary_loss_clip": 0.01081897, + "auxiliary_loss_mlp": 0.01039434, + "balance_loss_clip": 1.03373051, + "balance_loss_mlp": 1.02580249, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 1.7089242831211706, + "language_loss": 0.84100986, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86222315, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 2.6396396160125732 + }, + { + "auxiliary_loss_clip": 0.01008627, + "auxiliary_loss_mlp": 0.00997258, + "balance_loss_clip": 1.00644994, + "balance_loss_mlp": 0.99588126, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.8015844883092857, + "language_loss": 0.55688471, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57694352, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 3.1556520462036133 + }, + { + "auxiliary_loss_clip": 0.01108974, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.03834939, + "balance_loss_mlp": 1.02082109, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.80358690070101, + "language_loss": 0.80645674, + "learning_rate": 2.285856204861245e-06, + "loss": 0.82788038, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 2.5566978454589844 + }, + { + "auxiliary_loss_clip": 0.01110511, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.03967786, + "balance_loss_mlp": 1.0204488, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.352255858421086, + "language_loss": 0.76069266, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78212148, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 2.606126070022583 + }, + { + "auxiliary_loss_clip": 0.01073076, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.03666234, + "balance_loss_mlp": 1.0197593, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 2.191772610037183, + "language_loss": 0.79016352, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81123173, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 2.594025135040283 + }, + { + "auxiliary_loss_clip": 0.01059219, + "auxiliary_loss_mlp": 0.01048977, + "balance_loss_clip": 1.03116727, + "balance_loss_mlp": 1.03233492, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 2.851056578481606, + "language_loss": 0.75572437, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.77680635, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 2.674851179122925 + }, + { + "auxiliary_loss_clip": 0.01085758, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.03912365, + "balance_loss_mlp": 1.017524, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.4756871436224426, + "language_loss": 0.74921602, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.77036786, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.58482027053833 + }, + { + "auxiliary_loss_clip": 0.01097134, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.03725386, + "balance_loss_mlp": 1.02499223, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.6612327632608697, + "language_loss": 0.75539577, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77674663, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 4.004923343658447 + }, + { + "auxiliary_loss_clip": 0.010645, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.03824198, + "balance_loss_mlp": 1.02593112, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 8.087496830604898, + "language_loss": 0.66592562, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68695772, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 2.916659116744995 + }, + { + "auxiliary_loss_clip": 0.01017706, + "auxiliary_loss_mlp": 0.00747017, + "balance_loss_clip": 1.00554729, + "balance_loss_mlp": 1.00025773, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8717154862087846, + "language_loss": 0.62133706, + "learning_rate": 2.283157698374194e-06, + "loss": 0.63898432, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 3.075794219970703 + }, + { + "auxiliary_loss_clip": 0.01074298, + "auxiliary_loss_mlp": 0.00749928, + "balance_loss_clip": 1.03647578, + "balance_loss_mlp": 1.0005672, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.531781937598061, + "language_loss": 0.69425535, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71249759, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 2.6602025032043457 + }, + { + "auxiliary_loss_clip": 0.01095657, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.03734255, + "balance_loss_mlp": 1.02971745, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.8639219025572078, + "language_loss": 0.6670031, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68839335, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 4.025687217712402 + }, + { + "auxiliary_loss_clip": 0.01080832, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.0340941, + "balance_loss_mlp": 1.02347708, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.6621267753876974, + "language_loss": 0.77292567, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79412013, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 2.706364393234253 + }, + { + "auxiliary_loss_clip": 0.01072897, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.03602231, + "balance_loss_mlp": 1.0248518, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 2.314891493983352, + "language_loss": 0.72804892, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.74914932, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.9493870735168457 + }, + { + "auxiliary_loss_clip": 0.0108226, + "auxiliary_loss_mlp": 0.01033166, + "balance_loss_clip": 1.03510261, + "balance_loss_mlp": 1.02036846, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.701389701668403, + "language_loss": 0.75007623, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77123046, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 2.6723625659942627 + }, + { + "auxiliary_loss_clip": 0.01080485, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.03668404, + "balance_loss_mlp": 1.02658761, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.7063035693260125, + "language_loss": 0.70624334, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72744226, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.660444736480713 + }, + { + "auxiliary_loss_clip": 0.01104462, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.04251051, + "balance_loss_mlp": 1.02229118, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.053707830959883, + "language_loss": 0.7847687, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80617225, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 4.089683294296265 + }, + { + "auxiliary_loss_clip": 0.01095315, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.03841507, + "balance_loss_mlp": 1.01870561, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.624878254104115, + "language_loss": 0.74291086, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76417321, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.60219407081604 + }, + { + "auxiliary_loss_clip": 0.01085312, + "auxiliary_loss_mlp": 0.01045068, + "balance_loss_clip": 1.03766918, + "balance_loss_mlp": 1.0314064, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.5660301461308872, + "language_loss": 0.78632569, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80762947, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 2.6005797386169434 + }, + { + "auxiliary_loss_clip": 0.01096398, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.0365411, + "balance_loss_mlp": 1.0259769, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.5811816762356041, + "language_loss": 0.73016274, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75151366, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.6778335571289062 + }, + { + "auxiliary_loss_clip": 0.01093515, + "auxiliary_loss_mlp": 0.01033055, + "balance_loss_clip": 1.03525484, + "balance_loss_mlp": 1.02084792, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.3123469182680751, + "language_loss": 0.7442801, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76554585, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.595001220703125 + }, + { + "auxiliary_loss_clip": 0.01059536, + "auxiliary_loss_mlp": 0.01041208, + "balance_loss_clip": 1.0354445, + "balance_loss_mlp": 1.02854764, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 2.0263197421594588, + "language_loss": 0.8071596, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82816702, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.6681528091430664 + }, + { + "auxiliary_loss_clip": 0.01095706, + "auxiliary_loss_mlp": 0.01039914, + "balance_loss_clip": 1.03752446, + "balance_loss_mlp": 1.02734387, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 2.040592173796173, + "language_loss": 0.70591056, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72726679, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.532829999923706 + }, + { + "auxiliary_loss_clip": 0.01082085, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.03759038, + "balance_loss_mlp": 1.02784812, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.12423051401392, + "language_loss": 0.69090277, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71214432, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.6471340656280518 + }, + { + "auxiliary_loss_clip": 0.01054757, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.03677821, + "balance_loss_mlp": 1.01924253, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 2.2633263151924945, + "language_loss": 0.74709207, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.76796591, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.848294258117676 + }, + { + "auxiliary_loss_clip": 0.01036897, + "auxiliary_loss_mlp": 0.0105333, + "balance_loss_clip": 1.02972269, + "balance_loss_mlp": 1.03646207, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.6763353258732352, + "language_loss": 0.76025391, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78115618, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.68456768989563 + }, + { + "auxiliary_loss_clip": 0.01072247, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.03568053, + "balance_loss_mlp": 1.01949143, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.614142094424449, + "language_loss": 0.69193071, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71298164, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.6455023288726807 + }, + { + "auxiliary_loss_clip": 0.00999096, + "auxiliary_loss_mlp": 0.01002937, + "balance_loss_clip": 1.01521897, + "balance_loss_mlp": 1.00126827, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.7110102633167483, + "language_loss": 0.50141126, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52143157, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 3.5003719329833984 + }, + { + "auxiliary_loss_clip": 0.01102103, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.03892732, + "balance_loss_mlp": 1.02171671, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.8094121680844537, + "language_loss": 0.64088243, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66225678, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 2.8548226356506348 + }, + { + "auxiliary_loss_clip": 0.01100317, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.03986239, + "balance_loss_mlp": 1.02101207, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.7389954048362979, + "language_loss": 0.75710309, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.77845019, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.7133400440216064 + }, + { + "auxiliary_loss_clip": 0.01084563, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.0366528, + "balance_loss_mlp": 1.0234735, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.9413972162155493, + "language_loss": 0.74653327, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76773322, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 2.8402581214904785 + }, + { + "auxiliary_loss_clip": 0.01088361, + "auxiliary_loss_mlp": 0.01040961, + "balance_loss_clip": 1.03831792, + "balance_loss_mlp": 1.02907586, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.745241314044112, + "language_loss": 0.64605546, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66734874, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.733699321746826 + }, + { + "auxiliary_loss_clip": 0.0109719, + "auxiliary_loss_mlp": 0.00749775, + "balance_loss_clip": 1.03629017, + "balance_loss_mlp": 1.00066531, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.5515029152914963, + "language_loss": 0.70373964, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72220933, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 2.540540933609009 + }, + { + "auxiliary_loss_clip": 0.01112076, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.03812361, + "balance_loss_mlp": 1.02087188, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 2.922203461322549, + "language_loss": 0.6198442, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64130372, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.562896251678467 + }, + { + "auxiliary_loss_clip": 0.0108791, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.03719485, + "balance_loss_mlp": 1.02814579, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 1.8857630352356956, + "language_loss": 0.72247493, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.74376655, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 2.71722149848938 + }, + { + "auxiliary_loss_clip": 0.01080231, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.03541422, + "balance_loss_mlp": 1.02245665, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 2.1059172636105874, + "language_loss": 0.85129559, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87245274, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.6618993282318115 + }, + { + "auxiliary_loss_clip": 0.0110898, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.03549635, + "balance_loss_mlp": 1.01965857, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.8747903176998315, + "language_loss": 0.84258384, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86399448, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 4.018896579742432 + }, + { + "auxiliary_loss_clip": 0.01087108, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.03727937, + "balance_loss_mlp": 1.02349806, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.9936629246809827, + "language_loss": 0.66535723, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68658441, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.601703643798828 + }, + { + "auxiliary_loss_clip": 0.01108634, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.0366019, + "balance_loss_mlp": 1.01772082, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 1.9282489289899603, + "language_loss": 0.65268403, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67408037, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.493105888366699 + }, + { + "auxiliary_loss_clip": 0.01075703, + "auxiliary_loss_mlp": 0.00749762, + "balance_loss_clip": 1.03415275, + "balance_loss_mlp": 1.00065362, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 1.6015578338368404, + "language_loss": 0.74527788, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76353258, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 2.594717264175415 + }, + { + "auxiliary_loss_clip": 0.01108753, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.03623223, + "balance_loss_mlp": 1.02149677, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 4.187121276852839, + "language_loss": 0.83305669, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85448837, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 2.508596181869507 + }, + { + "auxiliary_loss_clip": 0.01094529, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.03550303, + "balance_loss_mlp": 1.0196259, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.6049918737247975, + "language_loss": 0.79623604, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81750047, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 2.6327478885650635 + }, + { + "auxiliary_loss_clip": 0.01028465, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.03370214, + "balance_loss_mlp": 1.02121735, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 1.8663884834715356, + "language_loss": 0.74439365, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76502955, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 2.8089542388916016 + }, + { + "auxiliary_loss_clip": 0.01089119, + "auxiliary_loss_mlp": 0.01042506, + "balance_loss_clip": 1.03795218, + "balance_loss_mlp": 1.0283556, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 3.7083693544799345, + "language_loss": 0.73881066, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76012695, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 2.6886255741119385 + }, + { + "auxiliary_loss_clip": 0.01115523, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.03980637, + "balance_loss_mlp": 1.02353477, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 1.8650627249230927, + "language_loss": 0.81220853, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83373898, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 2.642622709274292 + }, + { + "auxiliary_loss_clip": 0.01093182, + "auxiliary_loss_mlp": 0.0103692, + "balance_loss_clip": 1.03581393, + "balance_loss_mlp": 1.02349687, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.5849900074456709, + "language_loss": 0.75689405, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77819514, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 2.670419692993164 + }, + { + "auxiliary_loss_clip": 0.01073041, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.03369272, + "balance_loss_mlp": 1.02067125, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 2.080530295145451, + "language_loss": 0.67966056, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70072943, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.868448257446289 + }, + { + "auxiliary_loss_clip": 0.01097968, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.03722739, + "balance_loss_mlp": 1.02290821, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.6620870881883258, + "language_loss": 0.72108865, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74243087, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 4.101497173309326 + }, + { + "auxiliary_loss_clip": 0.01093001, + "auxiliary_loss_mlp": 0.01041751, + "balance_loss_clip": 1.03895557, + "balance_loss_mlp": 1.02876961, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.2111507520439764, + "language_loss": 0.65308714, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67443466, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.5851142406463623 + }, + { + "auxiliary_loss_clip": 0.01060637, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.03855681, + "balance_loss_mlp": 1.0190587, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.4414385217870898, + "language_loss": 0.81224483, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83317983, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 2.7867379188537598 + }, + { + "auxiliary_loss_clip": 0.0107502, + "auxiliary_loss_mlp": 0.01040727, + "balance_loss_clip": 1.03281689, + "balance_loss_mlp": 1.02661252, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.7315617437887778, + "language_loss": 0.78965342, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81081086, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 4.192947149276733 + }, + { + "auxiliary_loss_clip": 0.01097695, + "auxiliary_loss_mlp": 0.00749816, + "balance_loss_clip": 1.03679907, + "balance_loss_mlp": 1.00066495, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 2.4075632432404577, + "language_loss": 0.70510244, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.7235775, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.5814106464385986 + }, + { + "auxiliary_loss_clip": 0.01070155, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.03764272, + "balance_loss_mlp": 1.02170086, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.8649493690800447, + "language_loss": 0.75300777, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77404845, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.7904677391052246 + }, + { + "auxiliary_loss_clip": 0.01016325, + "auxiliary_loss_mlp": 0.0100567, + "balance_loss_clip": 1.01384962, + "balance_loss_mlp": 1.00418568, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7295579002578323, + "language_loss": 0.61357343, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63379335, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.1865482330322266 + }, + { + "auxiliary_loss_clip": 0.01085131, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.03506851, + "balance_loss_mlp": 1.02352595, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.4166379741822828, + "language_loss": 0.67790401, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.69913012, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 2.6442761421203613 + }, + { + "auxiliary_loss_clip": 0.01049191, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.03942204, + "balance_loss_mlp": 1.0180124, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 3.2108687788151125, + "language_loss": 0.77197325, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79277039, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 2.844789981842041 + }, + { + "auxiliary_loss_clip": 0.01100928, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.03895235, + "balance_loss_mlp": 1.01925504, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.51541032675016, + "language_loss": 0.76199687, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78332996, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 4.163726806640625 + }, + { + "auxiliary_loss_clip": 0.01087765, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.0371747, + "balance_loss_mlp": 1.02107525, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7644687947871744, + "language_loss": 0.71753967, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.73874491, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 2.6921937465667725 + }, + { + "auxiliary_loss_clip": 0.01101905, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.03805196, + "balance_loss_mlp": 1.02587545, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 1.9520222982094344, + "language_loss": 0.8250615, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84647459, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 2.5462915897369385 + }, + { + "auxiliary_loss_clip": 0.01071398, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_clip": 1.03472102, + "balance_loss_mlp": 1.02879047, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.6875582574819794, + "language_loss": 0.73106074, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75221062, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 2.615349531173706 + }, + { + "auxiliary_loss_clip": 0.01090887, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.03700829, + "balance_loss_mlp": 1.02071714, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 1.9183827636527448, + "language_loss": 0.73646152, + "learning_rate": 2.263481587786849e-06, + "loss": 0.75772041, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.5920262336730957 + }, + { + "auxiliary_loss_clip": 0.01097413, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.03767097, + "balance_loss_mlp": 1.01705503, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.7915783283503532, + "language_loss": 0.76967907, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79094243, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.582364559173584 + }, + { + "auxiliary_loss_clip": 0.01098254, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.03623044, + "balance_loss_mlp": 1.02053046, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.671349159534736, + "language_loss": 0.72752714, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.748842, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.5917439460754395 + }, + { + "auxiliary_loss_clip": 0.01030368, + "auxiliary_loss_mlp": 0.01005849, + "balance_loss_clip": 1.00841594, + "balance_loss_mlp": 1.00446057, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7414379229731709, + "language_loss": 0.56107068, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58143288, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.2221195697784424 + }, + { + "auxiliary_loss_clip": 0.01099996, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.03913569, + "balance_loss_mlp": 1.02277434, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 1.8358262261369371, + "language_loss": 0.65520883, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67657614, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.582148313522339 + }, + { + "auxiliary_loss_clip": 0.01115629, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.03978515, + "balance_loss_mlp": 1.02190351, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.670942298744137, + "language_loss": 0.70219576, + "learning_rate": 2.26155112714642e-06, + "loss": 0.7237125, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.485112190246582 + }, + { + "auxiliary_loss_clip": 0.01013837, + "auxiliary_loss_mlp": 0.01005915, + "balance_loss_clip": 1.0147146, + "balance_loss_mlp": 1.00447869, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8096665883342967, + "language_loss": 0.58594739, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60614496, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.245601177215576 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.03911829, + "balance_loss_mlp": 1.0213728, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 2.263069166648671, + "language_loss": 0.77257288, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79392058, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 2.574907064437866 + }, + { + "auxiliary_loss_clip": 0.01098749, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.03705335, + "balance_loss_mlp": 1.02152276, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 2.0804853111951513, + "language_loss": 0.7476747, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76900685, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 2.5392134189605713 + }, + { + "auxiliary_loss_clip": 0.01094709, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.0353725, + "balance_loss_mlp": 1.01511967, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.8139538531856039, + "language_loss": 0.82690001, + "learning_rate": 2.260006580021429e-06, + "loss": 0.8481282, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.5892860889434814 + }, + { + "auxiliary_loss_clip": 0.01097802, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.03752923, + "balance_loss_mlp": 1.01816106, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 2.0446225349856717, + "language_loss": 0.75494909, + "learning_rate": 2.259620418554886e-06, + "loss": 0.7762388, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 2.6229584217071533 + }, + { + "auxiliary_loss_clip": 0.01089452, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.03798473, + "balance_loss_mlp": 1.02611494, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.4626423020358805, + "language_loss": 0.63972187, + "learning_rate": 2.25923424724351e-06, + "loss": 0.66100764, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 2.601508140563965 + }, + { + "auxiliary_loss_clip": 0.01070044, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_clip": 1.03442085, + "balance_loss_mlp": 1.02997386, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.002177126881358, + "language_loss": 0.70232087, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72347414, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 2.7243552207946777 + }, + { + "auxiliary_loss_clip": 0.01099265, + "auxiliary_loss_mlp": 0.01036845, + "balance_loss_clip": 1.03708947, + "balance_loss_mlp": 1.02361274, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 2.2706969227838676, + "language_loss": 0.68272519, + "learning_rate": 2.258461875144837e-06, + "loss": 0.7040863, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.6984198093414307 + }, + { + "auxiliary_loss_clip": 0.0107458, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_clip": 1.03711712, + "balance_loss_mlp": 1.02675223, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.912377029991135, + "language_loss": 0.70835787, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72950041, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 2.713623285293579 + }, + { + "auxiliary_loss_clip": 0.01081895, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.03706217, + "balance_loss_mlp": 1.0340097, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.6593888764047187, + "language_loss": 0.73670483, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75800353, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.7006828784942627 + }, + { + "auxiliary_loss_clip": 0.01070059, + "auxiliary_loss_mlp": 0.01033145, + "balance_loss_clip": 1.03562164, + "balance_loss_mlp": 1.02116489, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.66402395725351, + "language_loss": 0.68623227, + "learning_rate": 2.257303243526688e-06, + "loss": 0.7072643, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 4.235163450241089 + }, + { + "auxiliary_loss_clip": 0.0108193, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.03525996, + "balance_loss_mlp": 1.02166843, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.4424131603527468, + "language_loss": 0.71920979, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74036396, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 2.609677314758301 + }, + { + "auxiliary_loss_clip": 0.01039199, + "auxiliary_loss_mlp": 0.01044675, + "balance_loss_clip": 1.02953172, + "balance_loss_mlp": 1.03070974, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.7542162955530824, + "language_loss": 0.86268926, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88352799, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 2.720567226409912 + }, + { + "auxiliary_loss_clip": 0.01085747, + "auxiliary_loss_mlp": 0.01036543, + "balance_loss_clip": 1.03312135, + "balance_loss_mlp": 1.02489614, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.6629692104067384, + "language_loss": 0.82149434, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84271729, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 2.5910518169403076 + }, + { + "auxiliary_loss_clip": 0.01011506, + "auxiliary_loss_mlp": 0.01000001, + "balance_loss_clip": 1.01903713, + "balance_loss_mlp": 0.99869519, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6768977557087775, + "language_loss": 0.58950019, + "learning_rate": 2.255758264840002e-06, + "loss": 0.60961521, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 3.3026692867279053 + }, + { + "auxiliary_loss_clip": 0.01091261, + "auxiliary_loss_mlp": 0.01034105, + "balance_loss_clip": 1.03502727, + "balance_loss_mlp": 1.02112889, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.8588543032518885, + "language_loss": 0.8129735, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83422709, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.5322682857513428 + }, + { + "auxiliary_loss_clip": 0.01099001, + "auxiliary_loss_mlp": 0.01041766, + "balance_loss_clip": 1.03850114, + "balance_loss_mlp": 1.02874184, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 2.3402886827463947, + "language_loss": 0.73638403, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75779164, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 2.5835208892822266 + }, + { + "auxiliary_loss_clip": 0.01079335, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.03567004, + "balance_loss_mlp": 1.02410626, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.4992766515510738, + "language_loss": 0.75314337, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77430111, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 2.5977227687835693 + }, + { + "auxiliary_loss_clip": 0.01095566, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.03680944, + "balance_loss_mlp": 1.01676536, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.923016091753292, + "language_loss": 0.7877785, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.80901676, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 2.6287639141082764 + }, + { + "auxiliary_loss_clip": 0.01076627, + "auxiliary_loss_mlp": 0.00749564, + "balance_loss_clip": 1.03246796, + "balance_loss_mlp": 1.00056159, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.7715471551782278, + "language_loss": 0.75300646, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77126837, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.6414012908935547 + }, + { + "auxiliary_loss_clip": 0.01105877, + "auxiliary_loss_mlp": 0.01042118, + "balance_loss_clip": 1.03586173, + "balance_loss_mlp": 1.03012562, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.5891580439780646, + "language_loss": 0.73872161, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76020157, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.66232967376709 + }, + { + "auxiliary_loss_clip": 0.01092064, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.04022622, + "balance_loss_mlp": 1.01916146, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 2.4427544350944386, + "language_loss": 0.72508991, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74633181, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 4.1215500831604 + }, + { + "auxiliary_loss_clip": 0.01086545, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.03893924, + "balance_loss_mlp": 1.02637553, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 2.0519863873359405, + "language_loss": 0.64609182, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66734266, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 2.5884199142456055 + }, + { + "auxiliary_loss_clip": 0.01105983, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.03768921, + "balance_loss_mlp": 1.0248847, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.665659291946875, + "language_loss": 0.76565069, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.78708208, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 2.535433769226074 + }, + { + "auxiliary_loss_clip": 0.01106995, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.0365777, + "balance_loss_mlp": 1.02317905, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.7556625038999671, + "language_loss": 0.641316, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66273808, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 4.021465063095093 + }, + { + "auxiliary_loss_clip": 0.01001402, + "auxiliary_loss_mlp": 0.01006348, + "balance_loss_clip": 1.00877905, + "balance_loss_mlp": 1.00501847, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8358813100688585, + "language_loss": 0.65675247, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67682993, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.185849905014038 + }, + { + "auxiliary_loss_clip": 0.01095041, + "auxiliary_loss_mlp": 0.0074967, + "balance_loss_clip": 1.03527117, + "balance_loss_mlp": 1.00055671, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.6558583436204772, + "language_loss": 0.68882966, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70727682, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 2.555877447128296 + }, + { + "auxiliary_loss_clip": 0.01085668, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.03579473, + "balance_loss_mlp": 1.02322447, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 3.7685946398113033, + "language_loss": 0.75017488, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.7713846, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.6538329124450684 + }, + { + "auxiliary_loss_clip": 0.01092795, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.03927076, + "balance_loss_mlp": 1.02273738, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.6763544223078464, + "language_loss": 0.77214825, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79343462, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 2.629703998565674 + }, + { + "auxiliary_loss_clip": 0.01088535, + "auxiliary_loss_mlp": 0.01037811, + "balance_loss_clip": 1.03649068, + "balance_loss_mlp": 1.02386403, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.5694268408796215, + "language_loss": 0.78016973, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80143321, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 2.625063419342041 + }, + { + "auxiliary_loss_clip": 0.01078077, + "auxiliary_loss_mlp": 0.01041472, + "balance_loss_clip": 1.03836226, + "balance_loss_mlp": 1.02789354, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.8447753616276557, + "language_loss": 0.72079206, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74198759, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 4.214834451675415 + }, + { + "auxiliary_loss_clip": 0.01078607, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.03642976, + "balance_loss_mlp": 1.02215004, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 1.737864856051268, + "language_loss": 0.81865579, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.83979076, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.700403928756714 + }, + { + "auxiliary_loss_clip": 0.01107674, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.04124129, + "balance_loss_mlp": 1.02341163, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.8243360578127659, + "language_loss": 0.80295718, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82440352, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.572474241256714 + }, + { + "auxiliary_loss_clip": 0.01082406, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.03368282, + "balance_loss_mlp": 1.02458215, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 1.6887197212452822, + "language_loss": 0.72334808, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74454248, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.6610641479492188 + }, + { + "auxiliary_loss_clip": 0.01101629, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.03825819, + "balance_loss_mlp": 1.01858902, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 1.857343238824698, + "language_loss": 0.67827094, + "learning_rate": 2.248031062546432e-06, + "loss": 0.69960833, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.622344493865967 + }, + { + "auxiliary_loss_clip": 0.01072627, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.03618836, + "balance_loss_mlp": 1.01850033, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.7035932078525275, + "language_loss": 0.6812734, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70230567, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.712747097015381 + }, + { + "auxiliary_loss_clip": 0.01108617, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03707242, + "balance_loss_mlp": 1.01895714, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.24534830407256, + "language_loss": 0.78648907, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80789423, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.525707960128784 + }, + { + "auxiliary_loss_clip": 0.01080394, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.03423166, + "balance_loss_mlp": 1.02655268, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 2.4504306704757624, + "language_loss": 0.66300988, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68419707, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.738405227661133 + }, + { + "auxiliary_loss_clip": 0.01092565, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.03656948, + "balance_loss_mlp": 1.02039623, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.7377219225905987, + "language_loss": 0.79693842, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.81819141, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.5777692794799805 + }, + { + "auxiliary_loss_clip": 0.01077369, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.03309834, + "balance_loss_mlp": 1.01994717, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.9506537734775513, + "language_loss": 0.76271021, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78381586, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 2.6192407608032227 + }, + { + "auxiliary_loss_clip": 0.01085916, + "auxiliary_loss_mlp": 0.00749368, + "balance_loss_clip": 1.03692102, + "balance_loss_mlp": 1.00054502, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 1.981993394864009, + "language_loss": 0.79768908, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81604183, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 2.5645852088928223 + }, + { + "auxiliary_loss_clip": 0.01103948, + "auxiliary_loss_mlp": 0.01041384, + "balance_loss_clip": 1.03780961, + "balance_loss_mlp": 1.02628016, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.9323671786276697, + "language_loss": 0.73953176, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76098514, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 2.5731825828552246 + }, + { + "auxiliary_loss_clip": 0.0110195, + "auxiliary_loss_mlp": 0.01035913, + "balance_loss_clip": 1.03767276, + "balance_loss_mlp": 1.02279949, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 3.081734734279974, + "language_loss": 0.80033314, + "learning_rate": 2.244939121664211e-06, + "loss": 0.82171178, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 2.6032044887542725 + }, + { + "auxiliary_loss_clip": 0.01084769, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.03765845, + "balance_loss_mlp": 1.02380586, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 2.0776053637452763, + "language_loss": 0.7102083, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73143303, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 2.8157687187194824 + }, + { + "auxiliary_loss_clip": 0.01112609, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.03774357, + "balance_loss_mlp": 1.01800156, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 3.221841918354808, + "language_loss": 0.68050611, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.70194131, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 2.5513741970062256 + }, + { + "auxiliary_loss_clip": 0.01018927, + "auxiliary_loss_mlp": 0.01001831, + "balance_loss_clip": 1.0069567, + "balance_loss_mlp": 1.00050735, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.718804438847254, + "language_loss": 0.56395149, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58415908, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 3.2749085426330566 + }, + { + "auxiliary_loss_clip": 0.01082035, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.03629959, + "balance_loss_mlp": 1.02451766, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.7467091573363667, + "language_loss": 0.88860822, + "learning_rate": 2.243392927839317e-06, + "loss": 0.90982497, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.656094551086426 + }, + { + "auxiliary_loss_clip": 0.0109847, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.03549957, + "balance_loss_mlp": 1.02461207, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 1.8380115440491982, + "language_loss": 0.77052438, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79187936, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.5214953422546387 + }, + { + "auxiliary_loss_clip": 0.01084757, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.03650415, + "balance_loss_mlp": 1.01835275, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.6760601126919397, + "language_loss": 0.84996367, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87111151, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 2.622333526611328 + }, + { + "auxiliary_loss_clip": 0.01090691, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.03766322, + "balance_loss_mlp": 1.02361584, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 1.8406991803349941, + "language_loss": 0.75717169, + "learning_rate": 2.24223318550976e-06, + "loss": 0.77845716, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 4.194727182388306 + }, + { + "auxiliary_loss_clip": 0.01098929, + "auxiliary_loss_mlp": 0.01038155, + "balance_loss_clip": 1.03869963, + "balance_loss_mlp": 1.02479148, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.7189697799587937, + "language_loss": 0.64860392, + "learning_rate": 2.241846586342682e-06, + "loss": 0.6699748, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 2.701856851577759 + }, + { + "auxiliary_loss_clip": 0.01069592, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.03525782, + "balance_loss_mlp": 1.02114511, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.9674518802226595, + "language_loss": 0.73683202, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.7578823, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 2.6628339290618896 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.0396347, + "balance_loss_mlp": 1.02111244, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 3.29687067446199, + "language_loss": 0.67781746, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.69914532, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 2.520383596420288 + }, + { + "auxiliary_loss_clip": 0.01066932, + "auxiliary_loss_mlp": 0.00750043, + "balance_loss_clip": 1.03103292, + "balance_loss_mlp": 1.000458, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 2.143307837063337, + "language_loss": 0.75595319, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77412283, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 2.6913232803344727 + }, + { + "auxiliary_loss_clip": 0.01084081, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.03732753, + "balance_loss_mlp": 1.0256654, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.996414742345386, + "language_loss": 0.79186624, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81310833, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 2.7232789993286133 + }, + { + "auxiliary_loss_clip": 0.01076356, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.03557348, + "balance_loss_mlp": 1.01852727, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.8339039294664268, + "language_loss": 0.73866951, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75974894, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 2.613262414932251 + }, + { + "auxiliary_loss_clip": 0.01086402, + "auxiliary_loss_mlp": 0.01036277, + "balance_loss_clip": 1.03513515, + "balance_loss_mlp": 1.02180552, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.4307423335220741, + "language_loss": 0.77949107, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80071783, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 2.568763494491577 + }, + { + "auxiliary_loss_clip": 0.01077498, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.03374219, + "balance_loss_mlp": 1.0192461, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.2387507102150606, + "language_loss": 0.73893249, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.76002777, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 2.620844602584839 + }, + { + "auxiliary_loss_clip": 0.01075881, + "auxiliary_loss_mlp": 0.01042139, + "balance_loss_clip": 1.03513288, + "balance_loss_mlp": 1.02776194, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.8647617527381342, + "language_loss": 0.73781115, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.75899136, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.692265272140503 + }, + { + "auxiliary_loss_clip": 0.01078289, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.03738415, + "balance_loss_mlp": 1.02048802, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 2.2005181969950427, + "language_loss": 0.80251169, + "learning_rate": 2.238366782910174e-06, + "loss": 0.8236376, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.64682674407959 + }, + { + "auxiliary_loss_clip": 0.01087843, + "auxiliary_loss_mlp": 0.01042811, + "balance_loss_clip": 1.03499985, + "balance_loss_mlp": 1.0290072, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.6301723154746726, + "language_loss": 0.78206539, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80337191, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.5629427433013916 + }, + { + "auxiliary_loss_clip": 0.01101918, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.03735042, + "balance_loss_mlp": 1.02135074, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.197571659115103, + "language_loss": 0.83924723, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86061126, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 4.025615692138672 + }, + { + "auxiliary_loss_clip": 0.01075314, + "auxiliary_loss_mlp": 0.01037546, + "balance_loss_clip": 1.0325619, + "balance_loss_mlp": 1.02396786, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.8054279000813673, + "language_loss": 0.70901543, + "learning_rate": 2.237206685204768e-06, + "loss": 0.73014402, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 2.6193013191223145 + }, + { + "auxiliary_loss_clip": 0.01084706, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.03744328, + "balance_loss_mlp": 1.02224231, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 2.1204351376782005, + "language_loss": 0.81733561, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83853722, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.6197986602783203 + }, + { + "auxiliary_loss_clip": 0.01087618, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.03738618, + "balance_loss_mlp": 1.01831126, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 1.8333113803071854, + "language_loss": 0.84777939, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.86897731, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 4.095999717712402 + }, + { + "auxiliary_loss_clip": 0.01097479, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.0362041, + "balance_loss_mlp": 1.02415764, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.7284247015536849, + "language_loss": 0.79525578, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81659973, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 2.5886034965515137 + }, + { + "auxiliary_loss_clip": 0.01062771, + "auxiliary_loss_mlp": 0.00750084, + "balance_loss_clip": 1.03019023, + "balance_loss_mlp": 1.00039911, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 1.8813451309352587, + "language_loss": 0.82723385, + "learning_rate": 2.235659762404047e-06, + "loss": 0.84536242, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.662487745285034 + }, + { + "auxiliary_loss_clip": 0.01076224, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.03829765, + "balance_loss_mlp": 1.02353716, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.769442011131674, + "language_loss": 0.72747076, + "learning_rate": 2.235273009326599e-06, + "loss": 0.74858648, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 2.6890764236450195 + }, + { + "auxiliary_loss_clip": 0.01073689, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.03529334, + "balance_loss_mlp": 1.02545261, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 12.562283151191554, + "language_loss": 0.77105933, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79217398, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.643951654434204 + }, + { + "auxiliary_loss_clip": 0.01067619, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.03564107, + "balance_loss_mlp": 1.01787758, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.6611179567800052, + "language_loss": 0.77743745, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.79842043, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 2.621662139892578 + }, + { + "auxiliary_loss_clip": 0.01083236, + "auxiliary_loss_mlp": 0.01037578, + "balance_loss_clip": 1.03727734, + "balance_loss_mlp": 1.02446508, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 2.3247207021647456, + "language_loss": 0.6551379, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67634606, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 4.206283807754517 + }, + { + "auxiliary_loss_clip": 0.01098356, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.03629065, + "balance_loss_mlp": 1.02232718, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.9035711745018682, + "language_loss": 0.77883303, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.80017006, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.781757354736328 + }, + { + "auxiliary_loss_clip": 0.01102226, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.03708422, + "balance_loss_mlp": 1.02049422, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.6242963899407636, + "language_loss": 0.76483941, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78621924, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.5718467235565186 + }, + { + "auxiliary_loss_clip": 0.01044314, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.0319078, + "balance_loss_mlp": 1.03001571, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 1.620220093379254, + "language_loss": 0.74889076, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76977873, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.6470370292663574 + }, + { + "auxiliary_loss_clip": 0.0108158, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.0356636, + "balance_loss_mlp": 1.02312589, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.5848129921701322, + "language_loss": 0.72976577, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75094771, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.6070356369018555 + }, + { + "auxiliary_loss_clip": 0.01083093, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.0348177, + "balance_loss_mlp": 1.02053308, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 2.2079266130256245, + "language_loss": 0.79371506, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81488216, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 2.635519504547119 + }, + { + "auxiliary_loss_clip": 0.01001003, + "auxiliary_loss_mlp": 0.01007543, + "balance_loss_clip": 1.00840282, + "balance_loss_mlp": 1.00610018, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7593339698779239, + "language_loss": 0.62285054, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64293599, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 3.4072253704071045 + }, + { + "auxiliary_loss_clip": 0.01071926, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.0369556, + "balance_loss_mlp": 1.01819229, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.3846456812517915, + "language_loss": 0.77560896, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79663086, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.6715211868286133 + }, + { + "auxiliary_loss_clip": 0.01097542, + "auxiliary_loss_mlp": 0.01036148, + "balance_loss_clip": 1.0356319, + "balance_loss_mlp": 1.02264166, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.5578592761118057, + "language_loss": 0.70273823, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72407514, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.6047821044921875 + }, + { + "auxiliary_loss_clip": 0.01051987, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.0318954, + "balance_loss_mlp": 1.01727605, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.3074395412551916, + "language_loss": 0.79877055, + "learning_rate": 2.230631280709021e-06, + "loss": 0.81960398, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.7044997215270996 + }, + { + "auxiliary_loss_clip": 0.01098873, + "auxiliary_loss_mlp": 0.01026278, + "balance_loss_clip": 1.03642213, + "balance_loss_mlp": 1.01315331, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 4.405565162429178, + "language_loss": 0.70338988, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.72464132, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 2.5506856441497803 + }, + { + "auxiliary_loss_clip": 0.01100742, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.04131353, + "balance_loss_mlp": 1.0240314, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 2.24916628739563, + "language_loss": 0.78684562, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80821151, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 2.574411153793335 + }, + { + "auxiliary_loss_clip": 0.01009105, + "auxiliary_loss_mlp": 0.01016241, + "balance_loss_clip": 1.00687933, + "balance_loss_mlp": 1.01469135, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7701364703313678, + "language_loss": 0.53995281, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56020623, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 3.176553726196289 + }, + { + "auxiliary_loss_clip": 0.01094967, + "auxiliary_loss_mlp": 0.0103875, + "balance_loss_clip": 1.03782034, + "balance_loss_mlp": 1.0242722, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.4142767082044037, + "language_loss": 0.90134054, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.9226777, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 2.5937492847442627 + }, + { + "auxiliary_loss_clip": 0.01114649, + "auxiliary_loss_mlp": 0.01040967, + "balance_loss_clip": 1.03959322, + "balance_loss_mlp": 1.02654278, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.946502296546855, + "language_loss": 0.73291939, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75447559, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 2.5359437465667725 + }, + { + "auxiliary_loss_clip": 0.01090162, + "auxiliary_loss_mlp": 0.00749741, + "balance_loss_clip": 1.03495383, + "balance_loss_mlp": 1.00038755, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.6339794391078473, + "language_loss": 0.78355533, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80195439, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.5342512130737305 + }, + { + "auxiliary_loss_clip": 0.01086326, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.03602362, + "balance_loss_mlp": 1.02215362, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.904006701763973, + "language_loss": 0.89522922, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91644073, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 2.575082778930664 + }, + { + "auxiliary_loss_clip": 0.01103117, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.03934097, + "balance_loss_mlp": 1.0245657, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.4192695131677853, + "language_loss": 0.76983321, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79125392, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 2.5865566730499268 + }, + { + "auxiliary_loss_clip": 0.01086213, + "auxiliary_loss_mlp": 0.01037617, + "balance_loss_clip": 1.03918362, + "balance_loss_mlp": 1.02306199, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 2.0337343312680436, + "language_loss": 0.71827608, + "learning_rate": 2.227149156404295e-06, + "loss": 0.73951441, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 2.7729218006134033 + }, + { + "auxiliary_loss_clip": 0.01106639, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.03785586, + "balance_loss_mlp": 1.01857352, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.8926008225610031, + "language_loss": 0.70077372, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72214818, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 4.01335883140564 + }, + { + "auxiliary_loss_clip": 0.01082434, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.03636885, + "balance_loss_mlp": 1.01691747, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 1.6898670167187857, + "language_loss": 0.71114588, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73225182, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 2.59328556060791 + }, + { + "auxiliary_loss_clip": 0.01023767, + "auxiliary_loss_mlp": 0.00746874, + "balance_loss_clip": 1.01115346, + "balance_loss_mlp": 0.99993056, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.7975103350023357, + "language_loss": 0.5941807, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61188716, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.1126205921173096 + }, + { + "auxiliary_loss_clip": 0.01044293, + "auxiliary_loss_mlp": 0.01051472, + "balance_loss_clip": 1.02913952, + "balance_loss_mlp": 1.03556943, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.5816175316811307, + "language_loss": 0.67088211, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.69183975, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 2.6454083919525146 + }, + { + "auxiliary_loss_clip": 0.01085853, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.0345726, + "balance_loss_mlp": 1.02363348, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.6846105468226593, + "language_loss": 0.70188987, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72312307, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 2.5687413215637207 + }, + { + "auxiliary_loss_clip": 0.01077512, + "auxiliary_loss_mlp": 0.01038064, + "balance_loss_clip": 1.03865218, + "balance_loss_mlp": 1.02427769, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 1.825039267503123, + "language_loss": 0.79585862, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.8170144, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 2.6060402393341064 + }, + { + "auxiliary_loss_clip": 0.01054577, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.03300464, + "balance_loss_mlp": 1.02708125, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 2.001627471645601, + "language_loss": 0.74832678, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.76927221, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 2.6353507041931152 + }, + { + "auxiliary_loss_clip": 0.01077977, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.0385071, + "balance_loss_mlp": 1.01565301, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 1.8393234840945785, + "language_loss": 0.78940117, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81046045, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.645141363143921 + }, + { + "auxiliary_loss_clip": 0.01089692, + "auxiliary_loss_mlp": 0.01039706, + "balance_loss_clip": 1.03604734, + "balance_loss_mlp": 1.02632523, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.6320909850568661, + "language_loss": 0.73819679, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75949073, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.8119444847106934 + }, + { + "auxiliary_loss_clip": 0.01020559, + "auxiliary_loss_mlp": 0.0074692, + "balance_loss_clip": 1.00810313, + "balance_loss_mlp": 1.00007331, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.768610950895978, + "language_loss": 0.59050286, + "learning_rate": 2.223279311579633e-06, + "loss": 0.60817766, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.269486904144287 + }, + { + "auxiliary_loss_clip": 0.01098398, + "auxiliary_loss_mlp": 0.0074963, + "balance_loss_clip": 1.03739798, + "balance_loss_mlp": 1.00035667, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 2.505211476293315, + "language_loss": 0.67154574, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69002604, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 2.6850059032440186 + }, + { + "auxiliary_loss_clip": 0.01079958, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.03188276, + "balance_loss_mlp": 1.02231169, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.6565693016210352, + "language_loss": 0.76281512, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78396678, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.6549465656280518 + }, + { + "auxiliary_loss_clip": 0.01059478, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.03493714, + "balance_loss_mlp": 1.02647972, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.6198862278243924, + "language_loss": 0.78600168, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80698317, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 4.257676124572754 + }, + { + "auxiliary_loss_clip": 0.01090092, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.03785872, + "balance_loss_mlp": 1.01593196, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 3.032117150908392, + "language_loss": 0.79171801, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81290364, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.607391119003296 + }, + { + "auxiliary_loss_clip": 0.01052418, + "auxiliary_loss_mlp": 0.01036824, + "balance_loss_clip": 1.03219247, + "balance_loss_mlp": 1.02393782, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.4594031629923472, + "language_loss": 0.82894301, + "learning_rate": 2.2213440707461e-06, + "loss": 0.8498354, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 2.755061626434326 + }, + { + "auxiliary_loss_clip": 0.01032403, + "auxiliary_loss_mlp": 0.01034876, + "balance_loss_clip": 1.02923477, + "balance_loss_mlp": 1.02268076, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.6565280395061226, + "language_loss": 0.80586541, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82653821, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 4.250489711761475 + }, + { + "auxiliary_loss_clip": 0.0105754, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0315032, + "balance_loss_mlp": 1.02150106, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 2.3763321515114186, + "language_loss": 0.72788274, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74879962, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.706693172454834 + }, + { + "auxiliary_loss_clip": 0.01107597, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.03653526, + "balance_loss_mlp": 1.01792169, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.6993249330903808, + "language_loss": 0.7081731, + "learning_rate": 2.220182825407892e-06, + "loss": 0.72955519, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 2.635357141494751 + }, + { + "auxiliary_loss_clip": 0.01097253, + "auxiliary_loss_mlp": 0.01040656, + "balance_loss_clip": 1.03524601, + "balance_loss_mlp": 1.02873552, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 1.5222172262517488, + "language_loss": 0.71526372, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73664284, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.621251106262207 + }, + { + "auxiliary_loss_clip": 0.01099855, + "auxiliary_loss_mlp": 0.01038981, + "balance_loss_clip": 1.03900874, + "balance_loss_mlp": 1.02602887, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.5442125695894227, + "language_loss": 0.74801528, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.76940364, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 2.703446626663208 + }, + { + "auxiliary_loss_clip": 0.01097168, + "auxiliary_loss_mlp": 0.01039416, + "balance_loss_clip": 1.03622317, + "balance_loss_mlp": 1.02685153, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.7190309259803278, + "language_loss": 0.81694436, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83831012, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.607903003692627 + }, + { + "auxiliary_loss_clip": 0.01103548, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.03988183, + "balance_loss_mlp": 1.02198696, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.7463249897496451, + "language_loss": 0.72025454, + "learning_rate": 2.218634381467819e-06, + "loss": 0.74164432, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 4.178043365478516 + }, + { + "auxiliary_loss_clip": 0.01089653, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.0364759, + "balance_loss_mlp": 1.02271199, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.709871790601476, + "language_loss": 0.82342029, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84466279, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.5286436080932617 + }, + { + "auxiliary_loss_clip": 0.0108781, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_clip": 1.03716552, + "balance_loss_mlp": 1.02983332, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.2591898445744643, + "language_loss": 0.77373898, + "learning_rate": 2.217860109695239e-06, + "loss": 0.79506677, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.534921646118164 + }, + { + "auxiliary_loss_clip": 0.01091004, + "auxiliary_loss_mlp": 0.01035489, + "balance_loss_clip": 1.03526413, + "balance_loss_mlp": 1.02285314, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 1.6053571867347896, + "language_loss": 0.70729494, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72855985, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.536370038986206 + }, + { + "auxiliary_loss_clip": 0.01075376, + "auxiliary_loss_mlp": 0.01037123, + "balance_loss_clip": 1.03275228, + "balance_loss_mlp": 1.02423072, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.9524984477282032, + "language_loss": 0.70852554, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72965056, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.5996017456054688 + }, + { + "auxiliary_loss_clip": 0.01109183, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.03698444, + "balance_loss_mlp": 1.01902747, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.3764149505743135, + "language_loss": 0.71463513, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.73603928, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.4895026683807373 + }, + { + "auxiliary_loss_clip": 0.01073535, + "auxiliary_loss_mlp": 0.0104257, + "balance_loss_clip": 1.03633511, + "balance_loss_mlp": 1.0290339, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.7347011760560935, + "language_loss": 0.60385704, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62501812, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.592756986618042 + }, + { + "auxiliary_loss_clip": 0.01000158, + "auxiliary_loss_mlp": 0.01002742, + "balance_loss_clip": 1.00725329, + "balance_loss_mlp": 1.00152647, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8613925872415454, + "language_loss": 0.61312389, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63315284, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.1160449981689453 + }, + { + "auxiliary_loss_clip": 0.01099246, + "auxiliary_loss_mlp": 0.01041216, + "balance_loss_clip": 1.03785896, + "balance_loss_mlp": 1.02757215, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.852745700568185, + "language_loss": 0.73111534, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75251997, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.5454118251800537 + }, + { + "auxiliary_loss_clip": 0.01078738, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.0326314, + "balance_loss_mlp": 1.02291894, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.8377194756973048, + "language_loss": 0.79528534, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81642008, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 2.6170191764831543 + }, + { + "auxiliary_loss_clip": 0.01072077, + "auxiliary_loss_mlp": 0.01038889, + "balance_loss_clip": 1.03638697, + "balance_loss_mlp": 1.02581191, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 2.1432429785895613, + "language_loss": 0.73755205, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75866175, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 2.65557599067688 + }, + { + "auxiliary_loss_clip": 0.01081299, + "auxiliary_loss_mlp": 0.01026925, + "balance_loss_clip": 1.03850758, + "balance_loss_mlp": 1.01517057, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 2.271465181329216, + "language_loss": 0.90861124, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92969346, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 2.6843175888061523 + }, + { + "auxiliary_loss_clip": 0.01111707, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.03773487, + "balance_loss_mlp": 1.02183318, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 2.2265401984785425, + "language_loss": 0.74063921, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76210427, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 2.4991180896759033 + }, + { + "auxiliary_loss_clip": 0.01091663, + "auxiliary_loss_mlp": 0.01037522, + "balance_loss_clip": 1.03679705, + "balance_loss_mlp": 1.02422404, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 1.980797671309803, + "language_loss": 0.80655378, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82784563, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 2.6301708221435547 + }, + { + "auxiliary_loss_clip": 0.01093305, + "auxiliary_loss_mlp": 0.01026055, + "balance_loss_clip": 1.03867769, + "balance_loss_mlp": 1.01481962, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 1.9114447232786922, + "language_loss": 0.77685356, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79804718, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 2.536593198776245 + }, + { + "auxiliary_loss_clip": 0.01091755, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.03606665, + "balance_loss_mlp": 1.01843405, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 2.1634857490205084, + "language_loss": 0.79658997, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.81781888, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 2.5934009552001953 + }, + { + "auxiliary_loss_clip": 0.01088145, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.04587972, + "balance_loss_mlp": 1.01624084, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.8217120069788368, + "language_loss": 0.76055938, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78172171, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 2.6883726119995117 + }, + { + "auxiliary_loss_clip": 0.01072121, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.03432, + "balance_loss_mlp": 1.02327108, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.8247985134257938, + "language_loss": 0.79059994, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81167948, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 2.64202880859375 + }, + { + "auxiliary_loss_clip": 0.01103613, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.03511024, + "balance_loss_mlp": 1.01718032, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 1.826590320239249, + "language_loss": 0.69826919, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71959919, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 2.6548540592193604 + }, + { + "auxiliary_loss_clip": 0.01076978, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.03257108, + "balance_loss_mlp": 1.01731324, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.921593595264797, + "language_loss": 0.63051498, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.65158784, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 4.085464000701904 + }, + { + "auxiliary_loss_clip": 0.01082906, + "auxiliary_loss_mlp": 0.0074928, + "balance_loss_clip": 1.0355444, + "balance_loss_mlp": 1.00033021, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.3769245201080103, + "language_loss": 0.66170967, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68003148, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 2.5564239025115967 + }, + { + "auxiliary_loss_clip": 0.01026916, + "auxiliary_loss_mlp": 0.01042944, + "balance_loss_clip": 1.02850866, + "balance_loss_mlp": 1.02910364, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 1.727712211444173, + "language_loss": 0.76545137, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78614992, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 2.79197359085083 + }, + { + "auxiliary_loss_clip": 0.01086319, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.03608894, + "balance_loss_mlp": 1.02341986, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.516780159837747, + "language_loss": 0.75292027, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.774149, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 2.9001550674438477 + }, + { + "auxiliary_loss_clip": 0.01104855, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.03518772, + "balance_loss_mlp": 1.01746154, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.9241712178181065, + "language_loss": 0.70715499, + "learning_rate": 2.209728283441112e-06, + "loss": 0.72849715, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.526296377182007 + }, + { + "auxiliary_loss_clip": 0.01091628, + "auxiliary_loss_mlp": 0.01039025, + "balance_loss_clip": 1.03420472, + "balance_loss_mlp": 1.02484536, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 1.8339376662681997, + "language_loss": 0.7505123, + "learning_rate": 2.209340965060465e-06, + "loss": 0.77181882, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 2.605363368988037 + }, + { + "auxiliary_loss_clip": 0.01086462, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.03738117, + "balance_loss_mlp": 1.02219808, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.6647814856984193, + "language_loss": 0.67408729, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69529557, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.6822457313537598 + }, + { + "auxiliary_loss_clip": 0.01086687, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.03684878, + "balance_loss_mlp": 1.02155924, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 2.1000056497098796, + "language_loss": 0.73258841, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75378895, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.5705044269561768 + }, + { + "auxiliary_loss_clip": 0.01081587, + "auxiliary_loss_mlp": 0.0102919, + "balance_loss_clip": 1.03620863, + "balance_loss_mlp": 1.01664364, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 1.953068845495246, + "language_loss": 0.84592503, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.86703277, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.6299009323120117 + }, + { + "auxiliary_loss_clip": 0.010727, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.03307271, + "balance_loss_mlp": 1.02021408, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 1.990825691453966, + "language_loss": 0.73891079, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.75995392, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 2.5770232677459717 + }, + { + "auxiliary_loss_clip": 0.01077847, + "auxiliary_loss_mlp": 0.01040994, + "balance_loss_clip": 1.03487468, + "balance_loss_mlp": 1.02746344, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 2.07184090191897, + "language_loss": 0.71761882, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.7388072, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 2.6661367416381836 + }, + { + "auxiliary_loss_clip": 0.01085673, + "auxiliary_loss_mlp": 0.01040306, + "balance_loss_clip": 1.0320127, + "balance_loss_mlp": 1.0276463, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.655778983443967, + "language_loss": 0.74116886, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76242864, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.579737663269043 + }, + { + "auxiliary_loss_clip": 0.01055515, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.0382266, + "balance_loss_mlp": 1.01909232, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 5.351258529042612, + "language_loss": 0.83253837, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85340619, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 4.160994052886963 + }, + { + "auxiliary_loss_clip": 0.01071071, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.03480077, + "balance_loss_mlp": 1.01857138, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 1.6166137665624278, + "language_loss": 0.78967667, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81068534, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 2.6341588497161865 + }, + { + "auxiliary_loss_clip": 0.01084041, + "auxiliary_loss_mlp": 0.00749628, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.00029743, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 2.5527888720432417, + "language_loss": 0.69810104, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71643776, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 2.799006700515747 + }, + { + "auxiliary_loss_clip": 0.0109274, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.03447056, + "balance_loss_mlp": 1.0174576, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 1.8086277543148621, + "language_loss": 0.72736871, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74858892, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 4.110067129135132 + }, + { + "auxiliary_loss_clip": 0.01055356, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.03311229, + "balance_loss_mlp": 1.02825773, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.7724344082320913, + "language_loss": 0.6905266, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71150768, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 2.6711065769195557 + }, + { + "auxiliary_loss_clip": 0.01074019, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.03559542, + "balance_loss_mlp": 1.02187014, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.508143406829665, + "language_loss": 0.79449975, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81558371, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 2.7105371952056885 + }, + { + "auxiliary_loss_clip": 0.010943, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.03613865, + "balance_loss_mlp": 1.01768088, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.702401579427894, + "language_loss": 0.77549863, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79673493, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 2.562102794647217 + }, + { + "auxiliary_loss_clip": 0.01099271, + "auxiliary_loss_mlp": 0.01037583, + "balance_loss_clip": 1.03784239, + "balance_loss_mlp": 1.02479768, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.5639584418068173, + "language_loss": 0.75735116, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77871966, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.709266185760498 + }, + { + "auxiliary_loss_clip": 0.01069723, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.03511214, + "balance_loss_mlp": 1.01583862, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.6468464555052047, + "language_loss": 0.66834652, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68932378, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.6928725242614746 + }, + { + "auxiliary_loss_clip": 0.01011047, + "auxiliary_loss_mlp": 0.01002267, + "balance_loss_clip": 1.00920606, + "balance_loss_mlp": 1.00092554, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.685268210085898, + "language_loss": 0.58551192, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.605645, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 4.71445107460022 + }, + { + "auxiliary_loss_clip": 0.0108054, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.03435373, + "balance_loss_mlp": 1.02101994, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 2.0969195723413065, + "language_loss": 0.71500808, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.73616922, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.56852388381958 + }, + { + "auxiliary_loss_clip": 0.01049915, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.03214633, + "balance_loss_mlp": 1.01875424, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.2971844872096874, + "language_loss": 0.75969493, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78051817, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.635324001312256 + }, + { + "auxiliary_loss_clip": 0.01064541, + "auxiliary_loss_mlp": 0.01038366, + "balance_loss_clip": 1.03761923, + "balance_loss_mlp": 1.02580118, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 2.2409725732627717, + "language_loss": 0.69000959, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71103871, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.666843891143799 + }, + { + "auxiliary_loss_clip": 0.01106751, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.03640699, + "balance_loss_mlp": 1.01916528, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 1.9588303125749895, + "language_loss": 0.82462287, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84600973, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.628070116043091 + }, + { + "auxiliary_loss_clip": 0.01072709, + "auxiliary_loss_mlp": 0.01034547, + "balance_loss_clip": 1.03251553, + "balance_loss_mlp": 1.02231586, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.8408399247174845, + "language_loss": 0.80314481, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82421738, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.5787386894226074 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.03647923, + "balance_loss_mlp": 1.02005219, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 2.7986985797934665, + "language_loss": 0.81246924, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83377886, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.657426118850708 + }, + { + "auxiliary_loss_clip": 0.01079493, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.03656328, + "balance_loss_mlp": 1.01980805, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.6686394239885929, + "language_loss": 0.72575939, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74685657, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 2.5579888820648193 + }, + { + "auxiliary_loss_clip": 0.01020556, + "auxiliary_loss_mlp": 0.00746654, + "balance_loss_clip": 1.00845397, + "balance_loss_mlp": 0.9998728, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7021893292956354, + "language_loss": 0.56356615, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58123827, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 3.195065975189209 + }, + { + "auxiliary_loss_clip": 0.01069786, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.03729367, + "balance_loss_mlp": 1.01893282, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.267781345710179, + "language_loss": 0.75346124, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77446902, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.632631540298462 + }, + { + "auxiliary_loss_clip": 0.01092829, + "auxiliary_loss_mlp": 0.0102688, + "balance_loss_clip": 1.03654766, + "balance_loss_mlp": 1.01578736, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 2.5461795759770154, + "language_loss": 0.66379565, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68499273, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.5201151371002197 + }, + { + "auxiliary_loss_clip": 0.01093393, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.03545427, + "balance_loss_mlp": 1.01916611, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 4.566566676936638, + "language_loss": 0.69400227, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71523738, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 2.597273111343384 + }, + { + "auxiliary_loss_clip": 0.010344, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.03058863, + "balance_loss_mlp": 1.01657271, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.537562703389419, + "language_loss": 0.69606042, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71669185, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 2.7815520763397217 + }, + { + "auxiliary_loss_clip": 0.01096086, + "auxiliary_loss_mlp": 0.01029261, + "balance_loss_clip": 1.03688002, + "balance_loss_mlp": 1.01703024, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.698992898300449, + "language_loss": 0.63110435, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65235782, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 3.1895692348480225 + }, + { + "auxiliary_loss_clip": 0.01093095, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.03427839, + "balance_loss_mlp": 1.01556361, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.7456999164385638, + "language_loss": 0.67554867, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69675767, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 2.6212146282196045 + }, + { + "auxiliary_loss_clip": 0.01063855, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.03207183, + "balance_loss_mlp": 1.02360082, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.9541980774081609, + "language_loss": 0.81571776, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83672678, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 2.703404188156128 + }, + { + "auxiliary_loss_clip": 0.01091247, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.03494966, + "balance_loss_mlp": 1.018749, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 2.003443947628944, + "language_loss": 0.79824382, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.81946695, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.5631651878356934 + }, + { + "auxiliary_loss_clip": 0.01111993, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.03913689, + "balance_loss_mlp": 1.02360618, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 3.41831059771006, + "language_loss": 0.66673374, + "learning_rate": 2.196555093055352e-06, + "loss": 0.68822217, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 2.6183300018310547 + }, + { + "auxiliary_loss_clip": 0.01096595, + "auxiliary_loss_mlp": 0.01037001, + "balance_loss_clip": 1.03886509, + "balance_loss_mlp": 1.02457333, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.8082950441349943, + "language_loss": 0.67105836, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69239438, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 4.190078020095825 + }, + { + "auxiliary_loss_clip": 0.01088821, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.03801465, + "balance_loss_mlp": 1.02950382, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 1.8564632708702256, + "language_loss": 0.82179272, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84310949, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.588270425796509 + }, + { + "auxiliary_loss_clip": 0.01039679, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.03424537, + "balance_loss_mlp": 1.01821589, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.7287790544659876, + "language_loss": 0.74563771, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76633501, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 2.7781097888946533 + }, + { + "auxiliary_loss_clip": 0.0108119, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.03496313, + "balance_loss_mlp": 1.01814723, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.760365575210405, + "language_loss": 0.78730035, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.80841839, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 2.638099193572998 + }, + { + "auxiliary_loss_clip": 0.01104933, + "auxiliary_loss_mlp": 0.0074909, + "balance_loss_clip": 1.03874302, + "balance_loss_mlp": 1.00026941, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.9869458540980038, + "language_loss": 0.7876417, + "learning_rate": 2.194617118620173e-06, + "loss": 0.80618191, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.505782127380371 + }, + { + "auxiliary_loss_clip": 0.01082808, + "auxiliary_loss_mlp": 0.00749081, + "balance_loss_clip": 1.03000939, + "balance_loss_mlp": 1.00016999, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.5495689315775292, + "language_loss": 0.76504719, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78336608, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 2.54284930229187 + }, + { + "auxiliary_loss_clip": 0.01104848, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.03747058, + "balance_loss_mlp": 1.01798368, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.4905301492921441, + "language_loss": 0.72026485, + "learning_rate": 2.193841877083912e-06, + "loss": 0.7416054, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.5845110416412354 + }, + { + "auxiliary_loss_clip": 0.01042521, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.03676343, + "balance_loss_mlp": 1.02121949, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.3025241250551685, + "language_loss": 0.79556584, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81632859, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.6593806743621826 + }, + { + "auxiliary_loss_clip": 0.0107317, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.03156745, + "balance_loss_mlp": 1.0188154, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4470591728166147, + "language_loss": 0.84704447, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86807334, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 2.564192295074463 + }, + { + "auxiliary_loss_clip": 0.01072137, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.03645957, + "balance_loss_mlp": 1.01931214, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.7327301127081294, + "language_loss": 0.78002107, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80104649, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.6588714122772217 + }, + { + "auxiliary_loss_clip": 0.01045032, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.03393173, + "balance_loss_mlp": 1.01861739, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 3.245345458011851, + "language_loss": 0.77648467, + "learning_rate": 2.192291305922943e-06, + "loss": 0.79724061, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 2.6680428981781006 + }, + { + "auxiliary_loss_clip": 0.01044154, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.03048372, + "balance_loss_mlp": 1.01702988, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 2.5186286903669286, + "language_loss": 0.72173727, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74247503, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.7438390254974365 + }, + { + "auxiliary_loss_clip": 0.01057227, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.03586078, + "balance_loss_mlp": 1.02794218, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.05700043021331, + "language_loss": 0.87671542, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89768904, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.616889715194702 + }, + { + "auxiliary_loss_clip": 0.01056453, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.02933609, + "balance_loss_mlp": 1.01998043, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 2.8542790418920503, + "language_loss": 0.61004561, + "learning_rate": 2.19112830093786e-06, + "loss": 0.63093686, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 4.154614448547363 + }, + { + "auxiliary_loss_clip": 0.01068299, + "auxiliary_loss_mlp": 0.00749554, + "balance_loss_clip": 1.0339855, + "balance_loss_mlp": 1.00030684, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.7889986460160434, + "language_loss": 0.7290715, + "learning_rate": 2.19074061809469e-06, + "loss": 0.74724996, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.6538519859313965 + }, + { + "auxiliary_loss_clip": 0.01102392, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.03663039, + "balance_loss_mlp": 1.02269483, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.4903536844196623, + "language_loss": 0.81476766, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.83612901, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 2.8892319202423096 + }, + { + "auxiliary_loss_clip": 0.01084892, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.03626847, + "balance_loss_mlp": 1.02024078, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.840655413496972, + "language_loss": 0.86368293, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88486922, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 4.199828863143921 + }, + { + "auxiliary_loss_clip": 0.00993961, + "auxiliary_loss_mlp": 0.01010522, + "balance_loss_clip": 1.01108479, + "balance_loss_mlp": 1.00918043, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.8991086692242581, + "language_loss": 0.58399904, + "learning_rate": 2.189577526226564e-06, + "loss": 0.6040439, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.142393112182617 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.03918076, + "balance_loss_mlp": 1.01856291, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.6011687702033977, + "language_loss": 0.72529817, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74670762, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.608398199081421 + }, + { + "auxiliary_loss_clip": 0.01071662, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.03714776, + "balance_loss_mlp": 1.018013, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.030504952332668, + "language_loss": 0.79411077, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81512541, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 2.654794454574585 + }, + { + "auxiliary_loss_clip": 0.01076465, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.03241754, + "balance_loss_mlp": 1.017308, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 5.179795835027922, + "language_loss": 0.83710569, + "learning_rate": 2.188414369659251e-06, + "loss": 0.85816711, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.5685575008392334 + }, + { + "auxiliary_loss_clip": 0.01086986, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03163195, + "balance_loss_mlp": 1.01706088, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.637498112319505, + "language_loss": 0.83393359, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85510612, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.637455940246582 + }, + { + "auxiliary_loss_clip": 0.01079771, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.03501177, + "balance_loss_mlp": 1.01967549, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 1.9945395735161875, + "language_loss": 0.86971432, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89082098, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 4.124284267425537 + }, + { + "auxiliary_loss_clip": 0.01060504, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.0369544, + "balance_loss_mlp": 1.02530003, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.688170449531163, + "language_loss": 0.80808717, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.82905352, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.6516382694244385 + }, + { + "auxiliary_loss_clip": 0.01094717, + "auxiliary_loss_mlp": 0.0103011, + "balance_loss_clip": 1.03578699, + "balance_loss_mlp": 1.01824856, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 1.8457708910632638, + "language_loss": 0.68095803, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70220637, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.571842908859253 + }, + { + "auxiliary_loss_clip": 0.01096401, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.03770697, + "balance_loss_mlp": 1.02124381, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.5299631877484394, + "language_loss": 0.77478081, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.7960788, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.6400699615478516 + }, + { + "auxiliary_loss_clip": 0.01103953, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.03483057, + "balance_loss_mlp": 1.01643717, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 1.8698988946122932, + "language_loss": 0.69942957, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.72075123, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.6013436317443848 + }, + { + "auxiliary_loss_clip": 0.01099491, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.03693318, + "balance_loss_mlp": 1.0259316, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.7989291956016438, + "language_loss": 0.72660649, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74799073, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 2.649407386779785 + }, + { + "auxiliary_loss_clip": 0.01082053, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.0344429, + "balance_loss_mlp": 1.01870108, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.4429251980579745, + "language_loss": 0.75185311, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77297974, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.6457653045654297 + }, + { + "auxiliary_loss_clip": 0.01073951, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.03512359, + "balance_loss_mlp": 1.01876664, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.654425010790703, + "language_loss": 0.83938456, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86043525, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 2.5900566577911377 + }, + { + "auxiliary_loss_clip": 0.01102261, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.03586161, + "balance_loss_mlp": 1.01731253, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.5412194296157609, + "language_loss": 0.7609784, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78228962, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.4920318126678467 + }, + { + "auxiliary_loss_clip": 0.01095751, + "auxiliary_loss_mlp": 0.01025993, + "balance_loss_clip": 1.03527284, + "balance_loss_mlp": 1.01429892, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.463689688210443, + "language_loss": 0.80327678, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82449418, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.6705074310302734 + }, + { + "auxiliary_loss_clip": 0.01078041, + "auxiliary_loss_mlp": 0.00749733, + "balance_loss_clip": 1.03410292, + "balance_loss_mlp": 1.00031114, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 2.7490530869207883, + "language_loss": 0.7173475, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73562527, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 2.6063220500946045 + }, + { + "auxiliary_loss_clip": 0.01104947, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.03635967, + "balance_loss_mlp": 1.01795888, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 2.0819604067810156, + "language_loss": 0.67557108, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69691843, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.496072292327881 + }, + { + "auxiliary_loss_clip": 0.01089922, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.03885627, + "balance_loss_mlp": 1.02456355, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 2.1670419289185747, + "language_loss": 0.66305101, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68432558, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 2.572676658630371 + }, + { + "auxiliary_loss_clip": 0.01088108, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.03531241, + "balance_loss_mlp": 1.01879454, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 1.812015071241857, + "language_loss": 0.78716964, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80836993, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 2.517054796218872 + }, + { + "auxiliary_loss_clip": 0.0107042, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.03015411, + "balance_loss_mlp": 1.02253187, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.8109829075209074, + "language_loss": 0.67587829, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69693154, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 2.6066017150878906 + }, + { + "auxiliary_loss_clip": 0.01073795, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.03254008, + "balance_loss_mlp": 1.0226934, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.60111390711683, + "language_loss": 0.71715069, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73824131, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.585059404373169 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.03703535, + "balance_loss_mlp": 1.02189136, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.7229591185147324, + "language_loss": 0.6632126, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68456948, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 2.7364275455474854 + }, + { + "auxiliary_loss_clip": 0.01049517, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.03085124, + "balance_loss_mlp": 1.02275193, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 2.0379915768702337, + "language_loss": 0.66678548, + "learning_rate": 2.181046234549138e-06, + "loss": 0.68763113, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 2.6753289699554443 + }, + { + "auxiliary_loss_clip": 0.01069863, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.03355026, + "balance_loss_mlp": 1.02081478, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.8842322182464626, + "language_loss": 0.76924062, + "learning_rate": 2.180658368429088e-06, + "loss": 0.79026479, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 4.238575458526611 + }, + { + "auxiliary_loss_clip": 0.01030396, + "auxiliary_loss_mlp": 0.01001587, + "balance_loss_clip": 1.00763106, + "balance_loss_mlp": 1.00028181, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6888314404873118, + "language_loss": 0.52256143, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54288125, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 3.1828393936157227 + }, + { + "auxiliary_loss_clip": 0.01077628, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.03749728, + "balance_loss_mlp": 1.02033317, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 1.8467542510261639, + "language_loss": 0.73610741, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75720561, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.5702567100524902 + }, + { + "auxiliary_loss_clip": 0.01098832, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.03816068, + "balance_loss_mlp": 1.02805614, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.5931809084475632, + "language_loss": 0.62848532, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.64988583, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 2.591404914855957 + }, + { + "auxiliary_loss_clip": 0.01105713, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.03711545, + "balance_loss_mlp": 1.01498294, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 1.8034406684212467, + "language_loss": 0.68954968, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71087521, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 2.5937209129333496 + }, + { + "auxiliary_loss_clip": 0.01067234, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.03479183, + "balance_loss_mlp": 1.01895809, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.8179456863603263, + "language_loss": 0.73535621, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75633115, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.6319849491119385 + }, + { + "auxiliary_loss_clip": 0.010912, + "auxiliary_loss_mlp": 0.00749582, + "balance_loss_clip": 1.0395813, + "balance_loss_mlp": 1.00029063, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 1.6881664573334554, + "language_loss": 0.76914269, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78755057, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 2.620539665222168 + }, + { + "auxiliary_loss_clip": 0.0105135, + "auxiliary_loss_mlp": 0.01029302, + "balance_loss_clip": 1.035743, + "balance_loss_mlp": 1.01835847, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.5312933143769363, + "language_loss": 0.75197685, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77278328, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.662285327911377 + }, + { + "auxiliary_loss_clip": 0.0109307, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.03679132, + "balance_loss_mlp": 1.02074325, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.9400487983792896, + "language_loss": 0.73989165, + "learning_rate": 2.177555194083212e-06, + "loss": 0.7611326, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 2.541404962539673 + }, + { + "auxiliary_loss_clip": 0.01092701, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.03521872, + "balance_loss_mlp": 1.01695228, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.7252306932916885, + "language_loss": 0.78078425, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80199945, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 2.564347505569458 + }, + { + "auxiliary_loss_clip": 0.01096478, + "auxiliary_loss_mlp": 0.01039018, + "balance_loss_clip": 1.03943181, + "balance_loss_mlp": 1.02681696, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 2.4945267779981584, + "language_loss": 0.72147644, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74283147, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 2.5102224349975586 + }, + { + "auxiliary_loss_clip": 0.01093647, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.03750384, + "balance_loss_mlp": 1.02140105, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.6486037677685228, + "language_loss": 0.76233453, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78360617, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 2.5519797801971436 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.01037951, + "balance_loss_clip": 1.03789985, + "balance_loss_mlp": 1.02567291, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.5821327156397589, + "language_loss": 0.75109643, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77247226, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.6083428859710693 + }, + { + "auxiliary_loss_clip": 0.01010993, + "auxiliary_loss_mlp": 0.00746885, + "balance_loss_clip": 1.00816512, + "balance_loss_mlp": 1.00006342, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.7832164633139138, + "language_loss": 0.48891267, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50649142, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 4.665960788726807 + }, + { + "auxiliary_loss_clip": 0.01078837, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.03870368, + "balance_loss_mlp": 1.02662528, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.3688005515615722, + "language_loss": 0.76806027, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78924793, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 2.7283520698547363 + }, + { + "auxiliary_loss_clip": 0.01089469, + "auxiliary_loss_mlp": 0.01037077, + "balance_loss_clip": 1.03963923, + "balance_loss_mlp": 1.02420866, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.421744833220671, + "language_loss": 0.7237643, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74502981, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.612793207168579 + }, + { + "auxiliary_loss_clip": 0.01068998, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.03381467, + "balance_loss_mlp": 1.02199614, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.5816826147129353, + "language_loss": 0.63228512, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65331876, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.628687858581543 + }, + { + "auxiliary_loss_clip": 0.01073096, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.03270972, + "balance_loss_mlp": 1.02339673, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.91935437430808, + "language_loss": 0.79293859, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81403977, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 4.243807077407837 + }, + { + "auxiliary_loss_clip": 0.0108484, + "auxiliary_loss_mlp": 0.01034124, + "balance_loss_clip": 1.03672838, + "balance_loss_mlp": 1.02178621, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.7070527928931598, + "language_loss": 0.63192308, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65311265, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 2.6515538692474365 + }, + { + "auxiliary_loss_clip": 0.01051227, + "auxiliary_loss_mlp": 0.00749377, + "balance_loss_clip": 1.03978848, + "balance_loss_mlp": 1.00025249, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.6479185475482945, + "language_loss": 0.72211623, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74012232, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.815127372741699 + }, + { + "auxiliary_loss_clip": 0.01098035, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.03706408, + "balance_loss_mlp": 1.01692009, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 2.0294992070927815, + "language_loss": 0.6309129, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65219128, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 2.775230646133423 + }, + { + "auxiliary_loss_clip": 0.01093662, + "auxiliary_loss_mlp": 0.0103749, + "balance_loss_clip": 1.03542888, + "balance_loss_mlp": 1.02441823, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.8883724053852649, + "language_loss": 0.82793623, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84924775, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 2.551384925842285 + }, + { + "auxiliary_loss_clip": 0.01090967, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.03481162, + "balance_loss_mlp": 1.0224582, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 2.053824343949246, + "language_loss": 0.85353106, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87480545, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 4.020538806915283 + }, + { + "auxiliary_loss_clip": 0.01072339, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.03728604, + "balance_loss_mlp": 1.01872039, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.6989466460394766, + "language_loss": 0.85593975, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.8769666, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.6422061920166016 + }, + { + "auxiliary_loss_clip": 0.01084643, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.0368855, + "balance_loss_mlp": 1.01852143, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 1.9990061416771332, + "language_loss": 0.79512656, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81628209, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.6644043922424316 + }, + { + "auxiliary_loss_clip": 0.01057966, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.03560328, + "balance_loss_mlp": 1.02172554, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.059793482317561, + "language_loss": 0.73144627, + "learning_rate": 2.170959527233356e-06, + "loss": 0.75237346, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 2.649463653564453 + }, + { + "auxiliary_loss_clip": 0.01090745, + "auxiliary_loss_mlp": 0.01033473, + "balance_loss_clip": 1.03340411, + "balance_loss_mlp": 1.02084827, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 2.31699459852521, + "language_loss": 0.68459928, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70584142, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 2.6529347896575928 + }, + { + "auxiliary_loss_clip": 0.01106749, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.03428531, + "balance_loss_mlp": 1.02126265, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 1.634411888555473, + "language_loss": 0.76243234, + "learning_rate": 2.170183441856481e-06, + "loss": 0.7838347, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.502450466156006 + }, + { + "auxiliary_loss_clip": 0.01106515, + "auxiliary_loss_mlp": 0.01030941, + "balance_loss_clip": 1.03660166, + "balance_loss_mlp": 1.01947343, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.7048457072373275, + "language_loss": 0.76009023, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78146482, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 2.6113288402557373 + }, + { + "auxiliary_loss_clip": 0.01095972, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.03579974, + "balance_loss_mlp": 1.01500809, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.440003125946324, + "language_loss": 0.65119362, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67242992, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 2.4713449478149414 + }, + { + "auxiliary_loss_clip": 0.01065943, + "auxiliary_loss_mlp": 0.01025506, + "balance_loss_clip": 1.0308485, + "balance_loss_mlp": 1.01351309, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 1.8459236769653413, + "language_loss": 0.7206471, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74156153, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 2.597517967224121 + }, + { + "auxiliary_loss_clip": 0.01096426, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.03640425, + "balance_loss_mlp": 1.02256417, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.9789330871286335, + "language_loss": 0.69442135, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71573174, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 2.6697049140930176 + }, + { + "auxiliary_loss_clip": 0.01088464, + "auxiliary_loss_mlp": 0.01026831, + "balance_loss_clip": 1.03571689, + "balance_loss_mlp": 1.01482654, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.4415485986610583, + "language_loss": 0.70487559, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72602856, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 2.5721590518951416 + }, + { + "auxiliary_loss_clip": 0.0103893, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.02768016, + "balance_loss_mlp": 1.01983035, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 2.2083136761438444, + "language_loss": 0.70671111, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72742337, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 2.686474323272705 + }, + { + "auxiliary_loss_clip": 0.0106498, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.03544343, + "balance_loss_mlp": 1.01989853, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 1.8302814637842104, + "language_loss": 0.80409443, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82506669, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 2.727656364440918 + }, + { + "auxiliary_loss_clip": 0.01103445, + "auxiliary_loss_mlp": 0.01030914, + "balance_loss_clip": 1.03481483, + "balance_loss_mlp": 1.01996493, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.636238451791847, + "language_loss": 0.74794078, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76928437, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 2.5511837005615234 + }, + { + "auxiliary_loss_clip": 0.01075226, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.03323627, + "balance_loss_mlp": 1.02393663, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 2.384804313801764, + "language_loss": 0.73365295, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75476897, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.6208949089050293 + }, + { + "auxiliary_loss_clip": 0.01034324, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.03192019, + "balance_loss_mlp": 1.01821303, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 2.1591361352734317, + "language_loss": 0.74869108, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.76933861, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.746182441711426 + }, + { + "auxiliary_loss_clip": 0.01071279, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.03605402, + "balance_loss_mlp": 1.02015662, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.6425689435320385, + "language_loss": 0.74217069, + "learning_rate": 2.165914514023972e-06, + "loss": 0.7631948, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.6177728176116943 + }, + { + "auxiliary_loss_clip": 0.01092333, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.03341413, + "balance_loss_mlp": 1.01906538, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.707460457639098, + "language_loss": 0.62322664, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64445627, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 2.577113389968872 + }, + { + "auxiliary_loss_clip": 0.01076023, + "auxiliary_loss_mlp": 0.01036916, + "balance_loss_clip": 1.0360446, + "balance_loss_mlp": 1.02408886, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 1.659876294325226, + "language_loss": 0.82462621, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84575558, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 4.200445652008057 + }, + { + "auxiliary_loss_clip": 0.01076658, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.04003263, + "balance_loss_mlp": 1.02345538, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.6183579182668304, + "language_loss": 0.72322571, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74435192, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.747623920440674 + }, + { + "auxiliary_loss_clip": 0.0110343, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.03522706, + "balance_loss_mlp": 1.02072716, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.8497562891080075, + "language_loss": 0.67000949, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69136167, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.662008047103882 + }, + { + "auxiliary_loss_clip": 0.01093656, + "auxiliary_loss_mlp": 0.00749252, + "balance_loss_clip": 1.0350554, + "balance_loss_mlp": 1.00021982, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.504667044736859, + "language_loss": 0.75157881, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77000791, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 2.7249257564544678 + }, + { + "auxiliary_loss_clip": 0.01074794, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.03104305, + "balance_loss_mlp": 1.01781905, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.7719944564501986, + "language_loss": 0.75950813, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.78055441, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 2.6273033618927 + }, + { + "auxiliary_loss_clip": 0.01084851, + "auxiliary_loss_mlp": 0.00749589, + "balance_loss_clip": 1.03345215, + "balance_loss_mlp": 1.00025129, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.670379895195161, + "language_loss": 0.79953903, + "learning_rate": 2.163197525984761e-06, + "loss": 0.81788343, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 2.6518642902374268 + }, + { + "auxiliary_loss_clip": 0.01088898, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.03226364, + "balance_loss_mlp": 1.01607847, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.6226160265568363, + "language_loss": 0.74322832, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76439786, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.6408658027648926 + }, + { + "auxiliary_loss_clip": 0.01070052, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.03399265, + "balance_loss_mlp": 1.01446915, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.4299407311235166, + "language_loss": 0.82797408, + "learning_rate": 2.162421187770864e-06, + "loss": 0.84894133, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 2.5892462730407715 + }, + { + "auxiliary_loss_clip": 0.01067401, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.03306484, + "balance_loss_mlp": 1.02048349, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.835380824151987, + "language_loss": 0.74038935, + "learning_rate": 2.162033009418015e-06, + "loss": 0.76137316, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.698159694671631 + }, + { + "auxiliary_loss_clip": 0.01110547, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.0378232, + "balance_loss_mlp": 1.01591825, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 1.951288569714734, + "language_loss": 0.76144284, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78283489, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.542448043823242 + }, + { + "auxiliary_loss_clip": 0.01086039, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.03884506, + "balance_loss_mlp": 1.02092326, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 1.8790389742767735, + "language_loss": 0.72469258, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.74588287, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 2.6168723106384277 + }, + { + "auxiliary_loss_clip": 0.00989198, + "auxiliary_loss_mlp": 0.0100165, + "balance_loss_clip": 1.00722015, + "balance_loss_mlp": 1.00010014, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8341208377228353, + "language_loss": 0.54334819, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56325668, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.197373628616333 + }, + { + "auxiliary_loss_clip": 0.01044607, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.03261161, + "balance_loss_mlp": 1.01920211, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.7739496998623003, + "language_loss": 0.61696225, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.63772357, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 2.7957894802093506 + }, + { + "auxiliary_loss_clip": 0.01070626, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.03382361, + "balance_loss_mlp": 1.02362263, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.523703087713714, + "language_loss": 0.7685324, + "learning_rate": 2.160092025783549e-06, + "loss": 0.7895962, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 4.100632905960083 + }, + { + "auxiliary_loss_clip": 0.01009675, + "auxiliary_loss_mlp": 0.01005208, + "balance_loss_clip": 1.00721824, + "balance_loss_mlp": 1.00384355, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.984274472015863, + "language_loss": 0.67055106, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69069993, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.2637133598327637 + }, + { + "auxiliary_loss_clip": 0.01106552, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.03706002, + "balance_loss_mlp": 1.02135813, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 1.805479530696852, + "language_loss": 0.76656842, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78795898, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.5005087852478027 + }, + { + "auxiliary_loss_clip": 0.01095598, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.03643548, + "balance_loss_mlp": 1.01860058, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.1114446342626945, + "language_loss": 0.8383832, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85964179, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 2.6168622970581055 + }, + { + "auxiliary_loss_clip": 0.01095912, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.03486657, + "balance_loss_mlp": 1.02238417, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.727163882133495, + "language_loss": 0.79710305, + "learning_rate": 2.158539129514956e-06, + "loss": 0.81840736, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 4.01365327835083 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.03929842, + "balance_loss_mlp": 1.02072787, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.4454879894645511, + "language_loss": 0.68922937, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71067452, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.5892372131347656 + }, + { + "auxiliary_loss_clip": 0.01087418, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.03427875, + "balance_loss_mlp": 1.02412486, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 2.0435065103150896, + "language_loss": 0.73126745, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75251532, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.592857599258423 + }, + { + "auxiliary_loss_clip": 0.01096647, + "auxiliary_loss_mlp": 0.0104245, + "balance_loss_clip": 1.03706837, + "balance_loss_mlp": 1.02935517, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 2.794506672540521, + "language_loss": 0.71526635, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73665732, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.542701482772827 + }, + { + "auxiliary_loss_clip": 0.01058858, + "auxiliary_loss_mlp": 0.01036371, + "balance_loss_clip": 1.03363705, + "balance_loss_mlp": 1.02430677, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.7690778954133302, + "language_loss": 0.68657577, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70752811, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.679081916809082 + }, + { + "auxiliary_loss_clip": 0.01096429, + "auxiliary_loss_mlp": 0.01036084, + "balance_loss_clip": 1.03756618, + "balance_loss_mlp": 1.02270865, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.7241878685017158, + "language_loss": 0.63680935, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65813446, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 4.069500684738159 + }, + { + "auxiliary_loss_clip": 0.01071639, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.03561652, + "balance_loss_mlp": 1.01879525, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 1.9468571299557893, + "language_loss": 0.76813596, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.78916234, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 2.61502742767334 + }, + { + "auxiliary_loss_clip": 0.01088956, + "auxiliary_loss_mlp": 0.01036836, + "balance_loss_clip": 1.03377032, + "balance_loss_mlp": 1.0224117, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.8548998183616336, + "language_loss": 0.76459157, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78584957, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.5349862575531006 + }, + { + "auxiliary_loss_clip": 0.01085681, + "auxiliary_loss_mlp": 0.01033246, + "balance_loss_clip": 1.03679156, + "balance_loss_mlp": 1.02137256, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.6995589745912167, + "language_loss": 0.77616256, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79735184, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.6241910457611084 + }, + { + "auxiliary_loss_clip": 0.01020252, + "auxiliary_loss_mlp": 0.01001484, + "balance_loss_clip": 1.00749791, + "balance_loss_mlp": 0.99982738, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.8053727812991431, + "language_loss": 0.54219985, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56241721, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.1174299716949463 + }, + { + "auxiliary_loss_clip": 0.01061459, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.03494811, + "balance_loss_mlp": 1.02367842, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.1692880325921813, + "language_loss": 0.86012661, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.88109696, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 2.6940574645996094 + }, + { + "auxiliary_loss_clip": 0.01085136, + "auxiliary_loss_mlp": 0.01037438, + "balance_loss_clip": 1.03345656, + "balance_loss_mlp": 1.02418232, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6680962064747407, + "language_loss": 0.73473281, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75595856, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 2.6058568954467773 + }, + { + "auxiliary_loss_clip": 0.01091714, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.03350401, + "balance_loss_mlp": 1.01688135, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.7317744427939246, + "language_loss": 0.7782352, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.79943013, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 2.5952067375183105 + }, + { + "auxiliary_loss_clip": 0.01075639, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.03293073, + "balance_loss_mlp": 1.02263665, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.1537831571341797, + "language_loss": 0.76278609, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78388435, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 2.7330634593963623 + }, + { + "auxiliary_loss_clip": 0.01084644, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.03392279, + "balance_loss_mlp": 1.02435064, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 1.9527759021910367, + "language_loss": 0.81491876, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83612752, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 2.646620035171509 + }, + { + "auxiliary_loss_clip": 0.01018416, + "auxiliary_loss_mlp": 0.01001852, + "balance_loss_clip": 1.00596857, + "balance_loss_mlp": 1.0003562, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6869818175523393, + "language_loss": 0.53327543, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55347812, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 3.096649169921875 + }, + { + "auxiliary_loss_clip": 0.01098292, + "auxiliary_loss_mlp": 0.00749398, + "balance_loss_clip": 1.03630948, + "balance_loss_mlp": 1.0001502, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.8611489494243378, + "language_loss": 0.63066554, + "learning_rate": 2.152326591972107e-06, + "loss": 0.6491425, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 2.5665524005889893 + }, + { + "auxiliary_loss_clip": 0.01065469, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.03395176, + "balance_loss_mlp": 1.0222106, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.767853931748332, + "language_loss": 0.69236517, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71337926, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.645700693130493 + }, + { + "auxiliary_loss_clip": 0.01098184, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.03828919, + "balance_loss_mlp": 1.01908863, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.6039108526204233, + "language_loss": 0.74297261, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76425731, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.626603841781616 + }, + { + "auxiliary_loss_clip": 0.01097517, + "auxiliary_loss_mlp": 0.010382, + "balance_loss_clip": 1.03691721, + "balance_loss_mlp": 1.02633226, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.6188189289842174, + "language_loss": 0.69614804, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.71750522, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.5394270420074463 + }, + { + "auxiliary_loss_clip": 0.01007927, + "auxiliary_loss_mlp": 0.00746804, + "balance_loss_clip": 1.00575328, + "balance_loss_mlp": 0.9998464, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.7011925724378781, + "language_loss": 0.46217299, + "learning_rate": 2.150773224180877e-06, + "loss": 0.47972029, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 3.1398582458496094 + }, + { + "auxiliary_loss_clip": 0.01111619, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.03874469, + "balance_loss_mlp": 1.02260518, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.8095885150514968, + "language_loss": 0.65796053, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.67943454, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.526401996612549 + }, + { + "auxiliary_loss_clip": 0.01017118, + "auxiliary_loss_mlp": 0.01053491, + "balance_loss_clip": 1.02884841, + "balance_loss_mlp": 1.03854227, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 2.1199720610902153, + "language_loss": 0.69913286, + "learning_rate": 2.149996505922343e-06, + "loss": 0.71983892, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 2.7617263793945312 + }, + { + "auxiliary_loss_clip": 0.01081746, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.03422236, + "balance_loss_mlp": 1.02421045, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.6492983215560182, + "language_loss": 0.84083784, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86202562, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 4.376409530639648 + }, + { + "auxiliary_loss_clip": 0.01104196, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.03732312, + "balance_loss_mlp": 1.02318251, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 1.877224487234738, + "language_loss": 0.72654152, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74792695, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.66902494430542 + }, + { + "auxiliary_loss_clip": 0.01071229, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.03589869, + "balance_loss_mlp": 1.02067494, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.0432061969026445, + "language_loss": 0.72252721, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74356753, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 2.725156784057617 + }, + { + "auxiliary_loss_clip": 0.01044559, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.02967739, + "balance_loss_mlp": 1.02023983, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 2.2143961065551174, + "language_loss": 0.77472836, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.7955063, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 2.7365708351135254 + }, + { + "auxiliary_loss_clip": 0.01074242, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.03444767, + "balance_loss_mlp": 1.02131665, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 2.513657012834114, + "language_loss": 0.70711601, + "learning_rate": 2.148054610995789e-06, + "loss": 0.72818685, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 2.6620123386383057 + }, + { + "auxiliary_loss_clip": 0.01087278, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.03558135, + "balance_loss_mlp": 1.02161241, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.9551103819209958, + "language_loss": 0.74985236, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77107167, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.6239078044891357 + }, + { + "auxiliary_loss_clip": 0.01097593, + "auxiliary_loss_mlp": 0.0103566, + "balance_loss_clip": 1.03804016, + "balance_loss_mlp": 1.02349472, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.320397592394639, + "language_loss": 0.67784131, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69917381, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 2.5578551292419434 + }, + { + "auxiliary_loss_clip": 0.01057491, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.03167748, + "balance_loss_mlp": 1.02277362, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.5263945395406158, + "language_loss": 0.66956091, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69048375, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.6809399127960205 + }, + { + "auxiliary_loss_clip": 0.01097816, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.03810799, + "balance_loss_mlp": 1.01946425, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.6608618491987428, + "language_loss": 0.74852461, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76980937, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 2.6525588035583496 + }, + { + "auxiliary_loss_clip": 0.01084021, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.03460097, + "balance_loss_mlp": 1.01470947, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.705205836798364, + "language_loss": 0.64581227, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66692173, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.7651195526123047 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.0102997, + "balance_loss_clip": 1.03824198, + "balance_loss_mlp": 1.01823974, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 2.0344193398516697, + "language_loss": 0.71994996, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.74131835, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.5080225467681885 + }, + { + "auxiliary_loss_clip": 0.0110636, + "auxiliary_loss_mlp": 0.00749341, + "balance_loss_clip": 1.036412, + "balance_loss_mlp": 1.00020778, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.7095296201747592, + "language_loss": 0.71869254, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.73724949, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 2.691844940185547 + }, + { + "auxiliary_loss_clip": 0.0100837, + "auxiliary_loss_mlp": 0.01007528, + "balance_loss_clip": 1.00586021, + "balance_loss_mlp": 1.00631797, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7157365539471386, + "language_loss": 0.5209294, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.5410884, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.2180609703063965 + }, + { + "auxiliary_loss_clip": 0.01104719, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.03720963, + "balance_loss_mlp": 1.02693224, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 2.100130302395534, + "language_loss": 0.77241135, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79384494, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 4.151713132858276 + }, + { + "auxiliary_loss_clip": 0.01073608, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.03049755, + "balance_loss_mlp": 1.01848447, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 2.076882105283944, + "language_loss": 0.70587301, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72690654, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.6250762939453125 + }, + { + "auxiliary_loss_clip": 0.01068824, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03873014, + "balance_loss_mlp": 1.01557803, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 1.9161340282609238, + "language_loss": 0.80905807, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83002073, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.736421585083008 + }, + { + "auxiliary_loss_clip": 0.01069253, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.03237951, + "balance_loss_mlp": 1.02321887, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.9456593881616282, + "language_loss": 0.70204598, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72309667, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 2.662522315979004 + }, + { + "auxiliary_loss_clip": 0.01093031, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.03747725, + "balance_loss_mlp": 1.01841342, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 2.45644992680866, + "language_loss": 0.84585124, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86707556, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 4.146308183670044 + }, + { + "auxiliary_loss_clip": 0.01100374, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.03834569, + "balance_loss_mlp": 1.0238322, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 2.5979158423013753, + "language_loss": 0.76275975, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78412741, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.5911386013031006 + }, + { + "auxiliary_loss_clip": 0.0108137, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.03463995, + "balance_loss_mlp": 1.02828264, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.384562057814335, + "language_loss": 0.59967339, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62090379, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.6383731365203857 + }, + { + "auxiliary_loss_clip": 0.01093113, + "auxiliary_loss_mlp": 0.01034944, + "balance_loss_clip": 1.03599596, + "balance_loss_mlp": 1.02340484, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.4858105992623503, + "language_loss": 0.78978467, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81106532, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 2.590097188949585 + }, + { + "auxiliary_loss_clip": 0.01096783, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.03470445, + "balance_loss_mlp": 1.01870441, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 1.8847257005634044, + "language_loss": 0.67479801, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69608659, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 2.573209524154663 + }, + { + "auxiliary_loss_clip": 0.01081096, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.03558731, + "balance_loss_mlp": 1.01885629, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 2.154077065452319, + "language_loss": 0.75131571, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77243245, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.6987857818603516 + }, + { + "auxiliary_loss_clip": 0.01056915, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.03568459, + "balance_loss_mlp": 1.02109396, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.1348073922189528, + "language_loss": 0.80508757, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82598996, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 4.267724990844727 + }, + { + "auxiliary_loss_clip": 0.01092903, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.03598821, + "balance_loss_mlp": 1.02018499, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 1.8440810125227116, + "language_loss": 0.6601274, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68137026, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 2.6361007690429688 + }, + { + "auxiliary_loss_clip": 0.01111633, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.03769624, + "balance_loss_mlp": 1.02087998, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 2.0211533555042807, + "language_loss": 0.66235155, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68381381, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 2.532914400100708 + }, + { + "auxiliary_loss_clip": 0.01054483, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.03204608, + "balance_loss_mlp": 1.02780604, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 2.354996389377837, + "language_loss": 0.76801467, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.7889657, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 2.6802682876586914 + }, + { + "auxiliary_loss_clip": 0.01082805, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.03655887, + "balance_loss_mlp": 1.01889694, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.5159103046592803, + "language_loss": 0.60175395, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62290204, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 2.581921100616455 + }, + { + "auxiliary_loss_clip": 0.01087482, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.03670859, + "balance_loss_mlp": 1.01735663, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 1.8368632495108002, + "language_loss": 0.78437388, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80554664, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 2.5560240745544434 + }, + { + "auxiliary_loss_clip": 0.01071331, + "auxiliary_loss_mlp": 0.00749742, + "balance_loss_clip": 1.0304873, + "balance_loss_mlp": 1.00010717, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.8854359564085115, + "language_loss": 0.78838527, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80659604, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 2.62854266166687 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.03745437, + "balance_loss_mlp": 1.02229822, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 2.3851244302025254, + "language_loss": 0.81155598, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83288729, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 2.6185848712921143 + }, + { + "auxiliary_loss_clip": 0.01057141, + "auxiliary_loss_mlp": 0.01046923, + "balance_loss_clip": 1.03104389, + "balance_loss_mlp": 1.03267109, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.198212346292147, + "language_loss": 0.91199791, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93303847, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 2.6674327850341797 + }, + { + "auxiliary_loss_clip": 0.01056008, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.03066397, + "balance_loss_mlp": 1.02392697, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.9411643565487546, + "language_loss": 0.64739323, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.66832787, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.6619935035705566 + }, + { + "auxiliary_loss_clip": 0.01052735, + "auxiliary_loss_mlp": 0.00749477, + "balance_loss_clip": 1.03063381, + "balance_loss_mlp": 1.0001533, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.8176654735305413, + "language_loss": 0.75726449, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77528667, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.7444961071014404 + }, + { + "auxiliary_loss_clip": 0.01108588, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.03784287, + "balance_loss_mlp": 1.021065, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.6969632555910803, + "language_loss": 0.8442322, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86564982, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.561476469039917 + }, + { + "auxiliary_loss_clip": 0.01088513, + "auxiliary_loss_mlp": 0.01024555, + "balance_loss_clip": 1.03404665, + "balance_loss_mlp": 1.01411796, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.4853529941933312, + "language_loss": 0.82985419, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85098487, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 2.7118897438049316 + }, + { + "auxiliary_loss_clip": 0.01080134, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.03585398, + "balance_loss_mlp": 1.01800573, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.3956042502072492, + "language_loss": 0.7445522, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76564765, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 2.659949541091919 + }, + { + "auxiliary_loss_clip": 0.01103576, + "auxiliary_loss_mlp": 0.00749, + "balance_loss_clip": 1.03756571, + "balance_loss_mlp": 1.00015402, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.6175636203935517, + "language_loss": 0.78512776, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80365348, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.5655391216278076 + }, + { + "auxiliary_loss_clip": 0.01058154, + "auxiliary_loss_mlp": 0.00749281, + "balance_loss_clip": 1.03477871, + "balance_loss_mlp": 1.0001688, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.0570001033948877, + "language_loss": 0.76753712, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78561145, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.636676788330078 + }, + { + "auxiliary_loss_clip": 0.01083534, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.03547668, + "balance_loss_mlp": 1.01765537, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.6311694159183978, + "language_loss": 0.62177801, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64291155, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 4.031908750534058 + }, + { + "auxiliary_loss_clip": 0.01103939, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.03503668, + "balance_loss_mlp": 1.01935196, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 10.190659236799576, + "language_loss": 0.72141969, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74277002, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.4917967319488525 + }, + { + "auxiliary_loss_clip": 0.01064904, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.03446996, + "balance_loss_mlp": 1.01980925, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.5946584888954756, + "language_loss": 0.79391146, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81487048, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.726274013519287 + }, + { + "auxiliary_loss_clip": 0.01093291, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.03634596, + "balance_loss_mlp": 1.02009416, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.853258320333803, + "language_loss": 0.72287798, + "learning_rate": 2.133291755093088e-06, + "loss": 0.7441324, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 2.534119129180908 + }, + { + "auxiliary_loss_clip": 0.0109487, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.03596151, + "balance_loss_mlp": 1.02327466, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.5389667968970542, + "language_loss": 0.74951446, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77081895, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 2.5660979747772217 + }, + { + "auxiliary_loss_clip": 0.01081224, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.03743494, + "balance_loss_mlp": 1.0166769, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.842824828458196, + "language_loss": 0.63763815, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.65874028, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 2.6746175289154053 + }, + { + "auxiliary_loss_clip": 0.01084442, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.03614163, + "balance_loss_mlp": 1.02211249, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 1.9011128056827262, + "language_loss": 0.76543498, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78661585, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.6484360694885254 + }, + { + "auxiliary_loss_clip": 0.01107062, + "auxiliary_loss_mlp": 0.01032294, + "balance_loss_clip": 1.03601956, + "balance_loss_mlp": 1.01950336, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.7871421243719878, + "language_loss": 0.70888257, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73027617, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.582869529724121 + }, + { + "auxiliary_loss_clip": 0.01086156, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.03519964, + "balance_loss_mlp": 1.02068758, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 2.3095012468362497, + "language_loss": 0.71486551, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73605728, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 2.8009276390075684 + }, + { + "auxiliary_loss_clip": 0.01101777, + "auxiliary_loss_mlp": 0.01024944, + "balance_loss_clip": 1.03485298, + "balance_loss_mlp": 1.01349354, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.5289183382024103, + "language_loss": 0.83494806, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.8562153, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.683804512023926 + }, + { + "auxiliary_loss_clip": 0.01093373, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.03316784, + "balance_loss_mlp": 1.02176356, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 1.9278307166042155, + "language_loss": 0.74922442, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.7705065, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.583552837371826 + }, + { + "auxiliary_loss_clip": 0.01089063, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.03625321, + "balance_loss_mlp": 1.01397562, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 1.9420950405280062, + "language_loss": 0.79597354, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.8171196, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 2.6360223293304443 + }, + { + "auxiliary_loss_clip": 0.01020784, + "auxiliary_loss_mlp": 0.01009822, + "balance_loss_clip": 1.00875878, + "balance_loss_mlp": 1.00854683, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7516708886053728, + "language_loss": 0.6010288, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62133491, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.2638931274414062 + }, + { + "auxiliary_loss_clip": 0.01081465, + "auxiliary_loss_mlp": 0.01033473, + "balance_loss_clip": 1.03392363, + "balance_loss_mlp": 1.0202229, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.8256405741558641, + "language_loss": 0.69214571, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71329504, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.6360301971435547 + }, + { + "auxiliary_loss_clip": 0.01046275, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.03137493, + "balance_loss_mlp": 1.01947761, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 1.97035477325437, + "language_loss": 0.67103988, + "learning_rate": 2.129016898898633e-06, + "loss": 0.69182706, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 4.313495635986328 + }, + { + "auxiliary_loss_clip": 0.01013304, + "auxiliary_loss_mlp": 0.01000834, + "balance_loss_clip": 1.01169777, + "balance_loss_mlp": 0.99972564, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.7953563326186577, + "language_loss": 0.58011603, + "learning_rate": 2.128628245959482e-06, + "loss": 0.6002574, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 3.050433874130249 + }, + { + "auxiliary_loss_clip": 0.01071186, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.03279102, + "balance_loss_mlp": 1.02167034, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.7552582276693274, + "language_loss": 0.76774025, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.78879356, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.68717098236084 + }, + { + "auxiliary_loss_clip": 0.01062925, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.0359838, + "balance_loss_mlp": 1.02225125, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.702211536003247, + "language_loss": 0.72661358, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74757755, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 2.63651967048645 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01028591, + "balance_loss_clip": 1.03526592, + "balance_loss_mlp": 1.01674175, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.9603129646897905, + "language_loss": 0.75946939, + "learning_rate": 2.127462257935406e-06, + "loss": 0.78078187, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 4.011401176452637 + }, + { + "auxiliary_loss_clip": 0.01061242, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.03277302, + "balance_loss_mlp": 1.02597332, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.2183285313990235, + "language_loss": 0.74187142, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.76288843, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 2.645472764968872 + }, + { + "auxiliary_loss_clip": 0.01023295, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.03463137, + "balance_loss_mlp": 1.02025414, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.6824983441442387, + "language_loss": 0.78618997, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80676699, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 2.899782419204712 + }, + { + "auxiliary_loss_clip": 0.01090894, + "auxiliary_loss_mlp": 0.01034163, + "balance_loss_clip": 1.03408933, + "balance_loss_mlp": 1.0229336, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 1.8838546466381503, + "language_loss": 0.86014229, + "learning_rate": 2.126296226410898e-06, + "loss": 0.88139284, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.8513705730438232 + }, + { + "auxiliary_loss_clip": 0.01049034, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.03396368, + "balance_loss_mlp": 1.02058935, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.7432340484079238, + "language_loss": 0.77425408, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79507291, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.644024133682251 + }, + { + "auxiliary_loss_clip": 0.01082481, + "auxiliary_loss_mlp": 0.00749093, + "balance_loss_clip": 1.03461027, + "balance_loss_mlp": 1.00012684, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.5727064426465323, + "language_loss": 0.67043895, + "learning_rate": 2.125518848090833e-06, + "loss": 0.68875468, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.617461919784546 + }, + { + "auxiliary_loss_clip": 0.01091872, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.03739524, + "balance_loss_mlp": 1.01701951, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.664755336938688, + "language_loss": 0.68413734, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70533907, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 4.0839197635650635 + }, + { + "auxiliary_loss_clip": 0.01073753, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03401589, + "balance_loss_mlp": 1.01933718, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.9443664106327407, + "language_loss": 0.74765068, + "learning_rate": 2.12474145073202e-06, + "loss": 0.76870662, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 2.598614454269409 + }, + { + "auxiliary_loss_clip": 0.01095136, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.0370965, + "balance_loss_mlp": 1.01575005, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 1.8774039969863952, + "language_loss": 0.81925821, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.84048802, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 2.5256896018981934 + }, + { + "auxiliary_loss_clip": 0.01065705, + "auxiliary_loss_mlp": 0.01036836, + "balance_loss_clip": 1.03545749, + "balance_loss_mlp": 1.02347267, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.7761892334540652, + "language_loss": 0.84105808, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.86208344, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 2.6632237434387207 + }, + { + "auxiliary_loss_clip": 0.01077746, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.03767514, + "balance_loss_mlp": 1.01869357, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 1.979833943225232, + "language_loss": 0.83501232, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85609734, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 2.644127607345581 + }, + { + "auxiliary_loss_clip": 0.01096307, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.03676057, + "balance_loss_mlp": 1.01823783, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 2.1911630005379448, + "language_loss": 0.73695606, + "learning_rate": 2.123186599369812e-06, + "loss": 0.7582233, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 2.602553129196167 + }, + { + "auxiliary_loss_clip": 0.01086002, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.03636456, + "balance_loss_mlp": 1.02345324, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.6729793269325985, + "language_loss": 0.7619065, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78312695, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 2.6698200702667236 + }, + { + "auxiliary_loss_clip": 0.01107328, + "auxiliary_loss_mlp": 0.0103548, + "balance_loss_clip": 1.03717601, + "balance_loss_mlp": 1.0236311, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 2.0760346377353507, + "language_loss": 0.70216626, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72359431, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 2.5623559951782227 + }, + { + "auxiliary_loss_clip": 0.0107169, + "auxiliary_loss_mlp": 0.0074926, + "balance_loss_clip": 1.04021156, + "balance_loss_mlp": 1.00017416, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 2.252100961688436, + "language_loss": 0.79599386, + "learning_rate": 2.122020411748461e-06, + "loss": 0.81420338, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 2.635852813720703 + }, + { + "auxiliary_loss_clip": 0.011062, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.03670537, + "balance_loss_mlp": 1.0167042, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.6537858482776078, + "language_loss": 0.80919594, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.83056295, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.473017930984497 + }, + { + "auxiliary_loss_clip": 0.01069195, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.03265738, + "balance_loss_mlp": 1.01884437, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.3706866072325787, + "language_loss": 0.66815776, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.68915033, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 2.7284090518951416 + }, + { + "auxiliary_loss_clip": 0.01066966, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.03404748, + "balance_loss_mlp": 1.02937603, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.6409921604841529, + "language_loss": 0.74126673, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76236594, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.6556456089019775 + }, + { + "auxiliary_loss_clip": 0.0107451, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.0328455, + "balance_loss_mlp": 1.01814222, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.6728574629425155, + "language_loss": 0.81333691, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83438265, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.5629990100860596 + }, + { + "auxiliary_loss_clip": 0.01080495, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.03612924, + "balance_loss_mlp": 1.01867104, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.3936495266096591, + "language_loss": 0.80867922, + "learning_rate": 2.120076673368901e-06, + "loss": 0.82978606, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.607379674911499 + }, + { + "auxiliary_loss_clip": 0.01109109, + "auxiliary_loss_mlp": 0.01037216, + "balance_loss_clip": 1.03615832, + "balance_loss_mlp": 1.02438879, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 11.517424583421558, + "language_loss": 0.66535646, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68681967, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.4617490768432617 + }, + { + "auxiliary_loss_clip": 0.01090745, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.03482056, + "balance_loss_mlp": 1.01563108, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.5232150301739966, + "language_loss": 0.77351815, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79468691, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.5944113731384277 + }, + { + "auxiliary_loss_clip": 0.0107648, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.03617811, + "balance_loss_mlp": 1.01694357, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.5009955906827874, + "language_loss": 0.78560537, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80665773, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 4.3292341232299805 + }, + { + "auxiliary_loss_clip": 0.01077093, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.03257942, + "balance_loss_mlp": 1.01875949, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 3.2041631471849965, + "language_loss": 0.76564842, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78672814, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.588447332382202 + }, + { + "auxiliary_loss_clip": 0.01052302, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.03032804, + "balance_loss_mlp": 1.01869011, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 2.356806239037615, + "language_loss": 0.89845645, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91927582, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 2.8023934364318848 + }, + { + "auxiliary_loss_clip": 0.0104874, + "auxiliary_loss_mlp": 0.01030158, + "balance_loss_clip": 1.03251243, + "balance_loss_mlp": 1.01845729, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.4959858567641158, + "language_loss": 0.74035788, + "learning_rate": 2.11774403721606e-06, + "loss": 0.7611469, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 2.7210710048675537 + }, + { + "auxiliary_loss_clip": 0.01061923, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.03786087, + "balance_loss_mlp": 1.02002716, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.8625503626083058, + "language_loss": 0.6930716, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71401763, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 2.665853977203369 + }, + { + "auxiliary_loss_clip": 0.0107761, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.03315735, + "balance_loss_mlp": 1.01747, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.9849285567529118, + "language_loss": 0.64922673, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67029762, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.585008144378662 + }, + { + "auxiliary_loss_clip": 0.01007402, + "auxiliary_loss_mlp": 0.01015772, + "balance_loss_clip": 1.00539327, + "balance_loss_mlp": 1.01419878, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.8217894127266323, + "language_loss": 0.53451985, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55475157, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.18255352973938 + }, + { + "auxiliary_loss_clip": 0.01091547, + "auxiliary_loss_mlp": 0.01028488, + "balance_loss_clip": 1.03466499, + "balance_loss_mlp": 1.01707399, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.6085332041745968, + "language_loss": 0.79322147, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81442189, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 2.5737903118133545 + }, + { + "auxiliary_loss_clip": 0.0108378, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.0367806, + "balance_loss_mlp": 1.01833463, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.3333837171044896, + "language_loss": 0.74643743, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76759279, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.6752305030822754 + }, + { + "auxiliary_loss_clip": 0.01093372, + "auxiliary_loss_mlp": 0.00749344, + "balance_loss_clip": 1.03361928, + "balance_loss_mlp": 1.00012851, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.6406527326962483, + "language_loss": 0.67917782, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69760501, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 2.818972110748291 + }, + { + "auxiliary_loss_clip": 0.0107812, + "auxiliary_loss_mlp": 0.01034199, + "balance_loss_clip": 1.03535891, + "balance_loss_mlp": 1.02215266, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.4781412668776492, + "language_loss": 0.85544956, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87657273, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.6008079051971436 + }, + { + "auxiliary_loss_clip": 0.01059414, + "auxiliary_loss_mlp": 0.00749047, + "balance_loss_clip": 1.03496265, + "balance_loss_mlp": 1.00010836, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.5871241066676898, + "language_loss": 0.70769423, + "learning_rate": 2.114633606196899e-06, + "loss": 0.72577894, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.7045042514801025 + }, + { + "auxiliary_loss_clip": 0.01087463, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.03525448, + "balance_loss_mlp": 1.01805067, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.3579162551487292, + "language_loss": 0.78376466, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80494928, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.5955796241760254 + }, + { + "auxiliary_loss_clip": 0.01068844, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.03553057, + "balance_loss_mlp": 1.02096033, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 1.8011704178742987, + "language_loss": 0.66088974, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68190837, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.7935166358947754 + }, + { + "auxiliary_loss_clip": 0.01065318, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.03302622, + "balance_loss_mlp": 1.01904738, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.7248051282662904, + "language_loss": 0.78167635, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80263448, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 4.128297567367554 + }, + { + "auxiliary_loss_clip": 0.01067338, + "auxiliary_loss_mlp": 0.0103094, + "balance_loss_clip": 1.03307867, + "balance_loss_mlp": 1.01841712, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 3.405696315068776, + "language_loss": 0.76137733, + "learning_rate": 2.113078285889493e-06, + "loss": 0.78236008, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.7410826683044434 + }, + { + "auxiliary_loss_clip": 0.01096787, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.03751945, + "balance_loss_mlp": 1.02095413, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 2.2251380742859848, + "language_loss": 0.83754575, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85886496, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.5649123191833496 + }, + { + "auxiliary_loss_clip": 0.01099959, + "auxiliary_loss_mlp": 0.00749189, + "balance_loss_clip": 1.03458405, + "balance_loss_mlp": 1.00014997, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.4797817105751931, + "language_loss": 0.70309806, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72158957, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 4.010979175567627 + }, + { + "auxiliary_loss_clip": 0.01089034, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.03566158, + "balance_loss_mlp": 1.02022481, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.7369719542328723, + "language_loss": 0.82472211, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84593523, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 2.536125898361206 + }, + { + "auxiliary_loss_clip": 0.01094585, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.01940358, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 2.324095376598897, + "language_loss": 0.67711967, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69837773, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.5305349826812744 + }, + { + "auxiliary_loss_clip": 0.01094048, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.03387773, + "balance_loss_mlp": 1.01912904, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 2.606455342780221, + "language_loss": 0.70470595, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72596395, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.538996934890747 + }, + { + "auxiliary_loss_clip": 0.01070432, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.03205204, + "balance_loss_mlp": 1.01932573, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.762626384834883, + "language_loss": 0.64918351, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67020297, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 2.6632773876190186 + }, + { + "auxiliary_loss_clip": 0.01096233, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.03550756, + "balance_loss_mlp": 1.01635051, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 2.452734834596525, + "language_loss": 0.72943497, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.7506848, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 2.5291659832000732 + }, + { + "auxiliary_loss_clip": 0.01076005, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.03508937, + "balance_loss_mlp": 1.01870108, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.7005929746873687, + "language_loss": 0.73352051, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75457108, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.7520792484283447 + }, + { + "auxiliary_loss_clip": 0.01040841, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.02800345, + "balance_loss_mlp": 1.02657914, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.6068693697525156, + "language_loss": 0.78666174, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.80748093, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 4.339629650115967 + }, + { + "auxiliary_loss_clip": 0.01084234, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.03583038, + "balance_loss_mlp": 1.02195013, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.9701435488956127, + "language_loss": 0.73832875, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75951838, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 2.6787166595458984 + }, + { + "auxiliary_loss_clip": 0.01091083, + "auxiliary_loss_mlp": 0.01025558, + "balance_loss_clip": 1.03803754, + "balance_loss_mlp": 1.0128324, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.6202260905188108, + "language_loss": 0.73872066, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.75988704, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 2.581026315689087 + }, + { + "auxiliary_loss_clip": 0.01086347, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.03751969, + "balance_loss_mlp": 1.02745664, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 2.830964718390854, + "language_loss": 0.85163873, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87289649, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 2.6006968021392822 + }, + { + "auxiliary_loss_clip": 0.01052984, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.03181505, + "balance_loss_mlp": 1.01418746, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.5736681404644148, + "language_loss": 0.7255215, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74631774, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 2.7640345096588135 + }, + { + "auxiliary_loss_clip": 0.01085779, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.0346719, + "balance_loss_mlp": 1.020118, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 2.5178689192358563, + "language_loss": 0.80241853, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82361561, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.56126070022583 + }, + { + "auxiliary_loss_clip": 0.01090541, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.03193736, + "balance_loss_mlp": 1.01990139, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 1.9725805439564206, + "language_loss": 0.73165488, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75287563, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 2.5626161098480225 + }, + { + "auxiliary_loss_clip": 0.01096603, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.03672314, + "balance_loss_mlp": 1.02039003, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.552455988843875, + "language_loss": 0.84353507, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86484301, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.577232837677002 + }, + { + "auxiliary_loss_clip": 0.01077963, + "auxiliary_loss_mlp": 0.01040494, + "balance_loss_clip": 1.03344047, + "balance_loss_mlp": 1.02507985, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.572886816801432, + "language_loss": 0.66749543, + "learning_rate": 2.106467420591409e-06, + "loss": 0.68868005, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 2.5880112648010254 + }, + { + "auxiliary_loss_clip": 0.01105256, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.03701556, + "balance_loss_mlp": 1.01931429, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.669607234572897, + "language_loss": 0.67370361, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69506204, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.488400459289551 + }, + { + "auxiliary_loss_clip": 0.01089843, + "auxiliary_loss_mlp": 0.01027186, + "balance_loss_clip": 1.03548276, + "balance_loss_mlp": 1.01571834, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.7862572784909376, + "language_loss": 0.8168931, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.83806336, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.5899593830108643 + }, + { + "auxiliary_loss_clip": 0.01094473, + "auxiliary_loss_mlp": 0.01026084, + "balance_loss_clip": 1.03417993, + "balance_loss_mlp": 1.01326883, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.659437833808194, + "language_loss": 0.7290355, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.75024104, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.5660572052001953 + }, + { + "auxiliary_loss_clip": 0.01042714, + "auxiliary_loss_mlp": 0.01034551, + "balance_loss_clip": 1.03365195, + "balance_loss_mlp": 1.02290392, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.9447157113141473, + "language_loss": 0.67691112, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69768381, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.883392095565796 + }, + { + "auxiliary_loss_clip": 0.0108302, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.03713238, + "balance_loss_mlp": 1.02089167, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 2.2549684033261603, + "language_loss": 0.64947575, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67063963, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.690182685852051 + }, + { + "auxiliary_loss_clip": 0.01051623, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.03453445, + "balance_loss_mlp": 1.02098465, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.9800451844997913, + "language_loss": 0.69753301, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71836555, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 2.718503952026367 + }, + { + "auxiliary_loss_clip": 0.01100782, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.03370106, + "balance_loss_mlp": 1.02156377, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 1.7029312398769676, + "language_loss": 0.84692305, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86826301, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 4.057506084442139 + }, + { + "auxiliary_loss_clip": 0.01074825, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.03487682, + "balance_loss_mlp": 1.02186382, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 2.102307311405125, + "language_loss": 0.68932861, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.7104305, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 2.651427984237671 + }, + { + "auxiliary_loss_clip": 0.01008778, + "auxiliary_loss_mlp": 0.01007499, + "balance_loss_clip": 1.01183307, + "balance_loss_mlp": 1.00624108, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7574176805231956, + "language_loss": 0.5113281, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53149086, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 3.2838146686553955 + }, + { + "auxiliary_loss_clip": 0.01076499, + "auxiliary_loss_mlp": 0.01037073, + "balance_loss_clip": 1.03375411, + "balance_loss_mlp": 1.02484798, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.7300740382770972, + "language_loss": 0.84550488, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86664063, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 2.597419500350952 + }, + { + "auxiliary_loss_clip": 0.01093607, + "auxiliary_loss_mlp": 0.01028483, + "balance_loss_clip": 1.03692174, + "balance_loss_mlp": 1.01740265, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.8096512103609148, + "language_loss": 0.69142866, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71264946, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.510847568511963 + }, + { + "auxiliary_loss_clip": 0.01106852, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.03677225, + "balance_loss_mlp": 1.01947057, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.8883488687946142, + "language_loss": 0.72842103, + "learning_rate": 2.101800220681144e-06, + "loss": 0.74980295, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.5796759128570557 + }, + { + "auxiliary_loss_clip": 0.01094639, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.03752053, + "balance_loss_mlp": 1.02543008, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.9627949131290383, + "language_loss": 0.81212127, + "learning_rate": 2.10141126191199e-06, + "loss": 0.833435, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 2.6298129558563232 + }, + { + "auxiliary_loss_clip": 0.01005005, + "auxiliary_loss_mlp": 0.01009401, + "balance_loss_clip": 1.01168716, + "balance_loss_mlp": 1.00816727, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7194348857613312, + "language_loss": 0.5686776, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58882165, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.282484769821167 + }, + { + "auxiliary_loss_clip": 0.01107111, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.038656, + "balance_loss_mlp": 1.0258683, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.98060551052448, + "language_loss": 0.82430828, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84576821, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.4908764362335205 + }, + { + "auxiliary_loss_clip": 0.01104791, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.03633714, + "balance_loss_mlp": 1.02483785, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.7023164105178819, + "language_loss": 0.60894239, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63036013, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 2.586076021194458 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.03338516, + "balance_loss_mlp": 1.02055359, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.6283858369230533, + "language_loss": 0.74748886, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76880306, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.552722454071045 + }, + { + "auxiliary_loss_clip": 0.01083071, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.03470755, + "balance_loss_mlp": 1.02285266, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.0726502139128273, + "language_loss": 0.79986984, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82104778, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 2.713014602661133 + }, + { + "auxiliary_loss_clip": 0.01089108, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.03494143, + "balance_loss_mlp": 1.02595711, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.6040832532467302, + "language_loss": 0.70972598, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.7309891, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.552485704421997 + }, + { + "auxiliary_loss_clip": 0.0107748, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.03654742, + "balance_loss_mlp": 1.02463174, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.8762394347946116, + "language_loss": 0.77462876, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79576331, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 2.629014015197754 + }, + { + "auxiliary_loss_clip": 0.01076125, + "auxiliary_loss_mlp": 0.01037686, + "balance_loss_clip": 1.03822207, + "balance_loss_mlp": 1.02524686, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.6394448847858032, + "language_loss": 0.84609503, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86723316, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.6952056884765625 + }, + { + "auxiliary_loss_clip": 0.01079749, + "auxiliary_loss_mlp": 0.01031246, + "balance_loss_clip": 1.03548479, + "balance_loss_mlp": 1.01878238, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 1.6542216013915416, + "language_loss": 0.81236368, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83347368, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 2.610682964324951 + }, + { + "auxiliary_loss_clip": 0.01074989, + "auxiliary_loss_mlp": 0.00749506, + "balance_loss_clip": 1.03510141, + "balance_loss_mlp": 1.00011897, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 2.066812740013985, + "language_loss": 0.79255891, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81080389, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 4.176628828048706 + }, + { + "auxiliary_loss_clip": 0.01105834, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.0381875, + "balance_loss_mlp": 1.0163784, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 1.6677781008024604, + "language_loss": 0.74476427, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.7661041, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 2.861605644226074 + }, + { + "auxiliary_loss_clip": 0.01086955, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.03625631, + "balance_loss_mlp": 1.01805544, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.499314049386416, + "language_loss": 0.8138746, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83503574, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 4.051889657974243 + }, + { + "auxiliary_loss_clip": 0.01083574, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.03553462, + "balance_loss_mlp": 1.02150559, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 2.437806125476479, + "language_loss": 0.83413863, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85531998, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 2.604893684387207 + }, + { + "auxiliary_loss_clip": 0.01095162, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.03665972, + "balance_loss_mlp": 1.01772618, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.6916431453408538, + "language_loss": 0.8175655, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83880824, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.5796139240264893 + }, + { + "auxiliary_loss_clip": 0.01057558, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.03069568, + "balance_loss_mlp": 1.01581907, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.7254163783030942, + "language_loss": 0.71840739, + "learning_rate": 2.095576427171635e-06, + "loss": 0.7392571, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 2.7031519412994385 + }, + { + "auxiliary_loss_clip": 0.01072066, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_clip": 1.03646159, + "balance_loss_mlp": 1.02826333, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 3.670531214405415, + "language_loss": 0.75895852, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.78010118, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 2.567003011703491 + }, + { + "auxiliary_loss_clip": 0.01096207, + "auxiliary_loss_mlp": 0.00749326, + "balance_loss_clip": 1.03658664, + "balance_loss_mlp": 1.00009775, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.8055634503487858, + "language_loss": 0.83191478, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85037005, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.6134285926818848 + }, + { + "auxiliary_loss_clip": 0.01097523, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.0355711, + "balance_loss_mlp": 1.0197742, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.437747208245562, + "language_loss": 0.73198414, + "learning_rate": 2.094409360775228e-06, + "loss": 0.7532832, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 2.5133795738220215 + }, + { + "auxiliary_loss_clip": 0.01067603, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.03658772, + "balance_loss_mlp": 1.01961756, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.4878072556909843, + "language_loss": 0.69087052, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71186733, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 2.6217594146728516 + }, + { + "auxiliary_loss_clip": 0.01090946, + "auxiliary_loss_mlp": 0.0074948, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.00019264, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 3.0781746518819784, + "language_loss": 0.7232796, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.74168384, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 4.170056581497192 + }, + { + "auxiliary_loss_clip": 0.01070107, + "auxiliary_loss_mlp": 0.01036171, + "balance_loss_clip": 1.0346204, + "balance_loss_mlp": 1.0223248, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.5776692246244433, + "language_loss": 0.73185891, + "learning_rate": 2.093242262158709e-06, + "loss": 0.7529217, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 2.6778249740600586 + }, + { + "auxiliary_loss_clip": 0.01075359, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.03352308, + "balance_loss_mlp": 1.02197671, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.5965595632970833, + "language_loss": 0.77915925, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80024803, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 2.566119909286499 + }, + { + "auxiliary_loss_clip": 0.01110599, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.03881097, + "balance_loss_mlp": 1.0195775, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.333212814346525, + "language_loss": 0.88043749, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90186352, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.503784418106079 + }, + { + "auxiliary_loss_clip": 0.01076975, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.03712368, + "balance_loss_mlp": 1.02128029, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.02046624398951, + "language_loss": 0.74212861, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76323998, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.5864362716674805 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.03771317, + "balance_loss_mlp": 1.02033973, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 1.660594802412282, + "language_loss": 0.79537046, + "learning_rate": 2.091686081238281e-06, + "loss": 0.8167547, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 2.567985773086548 + }, + { + "auxiliary_loss_clip": 0.01007738, + "auxiliary_loss_mlp": 0.0074668, + "balance_loss_clip": 1.01105785, + "balance_loss_mlp": 0.99974918, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7233368439425124, + "language_loss": 0.55977714, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.57732129, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 2.946532964706421 + }, + { + "auxiliary_loss_clip": 0.01096093, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.0376749, + "balance_loss_mlp": 1.01833272, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 2.489520267924677, + "language_loss": 0.65230894, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67357087, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.6350767612457275 + }, + { + "auxiliary_loss_clip": 0.01103777, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.03634119, + "balance_loss_mlp": 1.02077246, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.497282262812115, + "language_loss": 0.74677348, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.76813114, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.581920862197876 + }, + { + "auxiliary_loss_clip": 0.01107189, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.03614998, + "balance_loss_mlp": 1.01839173, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 3.602932125425957, + "language_loss": 0.80435443, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82573217, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.504298686981201 + }, + { + "auxiliary_loss_clip": 0.01021046, + "auxiliary_loss_mlp": 0.01006849, + "balance_loss_clip": 1.00899899, + "balance_loss_mlp": 1.00543654, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8931541493068408, + "language_loss": 0.62730861, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64758754, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.084838628768921 + }, + { + "auxiliary_loss_clip": 0.01091748, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.03412354, + "balance_loss_mlp": 1.01778841, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.7091358072710559, + "language_loss": 0.79502374, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81623805, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.6410815715789795 + }, + { + "auxiliary_loss_clip": 0.0106278, + "auxiliary_loss_mlp": 0.0103324, + "balance_loss_clip": 1.03135061, + "balance_loss_mlp": 1.0202167, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.8159664371212472, + "language_loss": 0.80301452, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82397473, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.6615853309631348 + }, + { + "auxiliary_loss_clip": 0.01109974, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.03657651, + "balance_loss_mlp": 1.02142882, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 2.4433445040844175, + "language_loss": 0.79377496, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81521392, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.485239028930664 + }, + { + "auxiliary_loss_clip": 0.01082911, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.03623986, + "balance_loss_mlp": 1.01704156, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6957710182241819, + "language_loss": 0.85038519, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87150538, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 4.141860008239746 + }, + { + "auxiliary_loss_clip": 0.01094678, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.03629231, + "balance_loss_mlp": 1.02312195, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.570568214053673, + "language_loss": 0.70442629, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72572255, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 2.655097723007202 + }, + { + "auxiliary_loss_clip": 0.01068445, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.0326401, + "balance_loss_mlp": 1.02888381, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 4.302474077607297, + "language_loss": 0.78027618, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.80139649, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 2.6435132026672363 + }, + { + "auxiliary_loss_clip": 0.01078804, + "auxiliary_loss_mlp": 0.01038821, + "balance_loss_clip": 1.03553188, + "balance_loss_mlp": 1.02532649, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.579250490285609, + "language_loss": 0.89745373, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91863, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.727128744125366 + }, + { + "auxiliary_loss_clip": 0.01085036, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.03551114, + "balance_loss_mlp": 1.02186, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 1.7163074674397494, + "language_loss": 0.76894057, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.79013801, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.6783766746520996 + }, + { + "auxiliary_loss_clip": 0.01094661, + "auxiliary_loss_mlp": 0.0102699, + "balance_loss_clip": 1.03658283, + "balance_loss_mlp": 1.01586771, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 1.8449598279784096, + "language_loss": 0.67438984, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69560635, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 2.6145970821380615 + }, + { + "auxiliary_loss_clip": 0.01085735, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.03480434, + "balance_loss_mlp": 1.02411175, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.8355076368140781, + "language_loss": 0.75231004, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77353001, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 2.6430864334106445 + }, + { + "auxiliary_loss_clip": 0.01090915, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.03732586, + "balance_loss_mlp": 1.01756096, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 2.5887113783102147, + "language_loss": 0.7840085, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80522263, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.5743372440338135 + }, + { + "auxiliary_loss_clip": 0.01083975, + "auxiliary_loss_mlp": 0.00749396, + "balance_loss_clip": 1.03371477, + "balance_loss_mlp": 1.00009251, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.578125450692513, + "language_loss": 0.69356799, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71190166, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 2.5936119556427 + }, + { + "auxiliary_loss_clip": 0.01064046, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.03172815, + "balance_loss_mlp": 1.02342212, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.808003102153322, + "language_loss": 0.70968235, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73068511, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.6353559494018555 + }, + { + "auxiliary_loss_clip": 0.01093272, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.03655958, + "balance_loss_mlp": 1.02125978, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.4726978342695705, + "language_loss": 0.74281365, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76406884, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.557612419128418 + }, + { + "auxiliary_loss_clip": 0.0109551, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.0343926, + "balance_loss_mlp": 1.01863909, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.075482909005553, + "language_loss": 0.64253318, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.66380531, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.5038092136383057 + }, + { + "auxiliary_loss_clip": 0.00999661, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.00785744, + "balance_loss_mlp": 1.03538346, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.7788168019091608, + "language_loss": 0.59746045, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.61782688, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.2948248386383057 + }, + { + "auxiliary_loss_clip": 0.01077599, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.03382134, + "balance_loss_mlp": 1.02101672, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 1.5958574711467064, + "language_loss": 0.7519455, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77306026, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 2.634453773498535 + }, + { + "auxiliary_loss_clip": 0.01076025, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.03533924, + "balance_loss_mlp": 1.01733887, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 2.068124975315553, + "language_loss": 0.72186446, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74292815, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 2.601440906524658 + }, + { + "auxiliary_loss_clip": 0.01100975, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.0398742, + "balance_loss_mlp": 1.02261412, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 1.9567649431563727, + "language_loss": 0.73745674, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.75882387, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.5206573009490967 + }, + { + "auxiliary_loss_clip": 0.01081011, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.03422403, + "balance_loss_mlp": 1.02285814, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.606295596531646, + "language_loss": 0.72238743, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74355388, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 4.142221450805664 + }, + { + "auxiliary_loss_clip": 0.01097849, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03714049, + "balance_loss_mlp": 1.01895952, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.684110222597653, + "language_loss": 0.8149268, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83623087, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 2.6264564990997314 + }, + { + "auxiliary_loss_clip": 0.01099823, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.03601348, + "balance_loss_mlp": 1.02017283, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.267109791650364, + "language_loss": 0.76509672, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78643936, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 4.1212852001190186 + }, + { + "auxiliary_loss_clip": 0.01098725, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.03800297, + "balance_loss_mlp": 1.02224135, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.7179249201601556, + "language_loss": 0.76157725, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78292096, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 2.5615479946136475 + }, + { + "auxiliary_loss_clip": 0.01083493, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.03400493, + "balance_loss_mlp": 1.02191556, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 3.4295338399893938, + "language_loss": 0.72003186, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74121571, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 2.569274663925171 + }, + { + "auxiliary_loss_clip": 0.01075636, + "auxiliary_loss_mlp": 0.01038271, + "balance_loss_clip": 1.03469992, + "balance_loss_mlp": 1.02600455, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.6236522656991665, + "language_loss": 0.77154273, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79268181, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 2.5999255180358887 + }, + { + "auxiliary_loss_clip": 0.01065422, + "auxiliary_loss_mlp": 0.01037235, + "balance_loss_clip": 1.03702331, + "balance_loss_mlp": 1.02506948, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.6266349400607063, + "language_loss": 0.76554251, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78656912, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.6350247859954834 + }, + { + "auxiliary_loss_clip": 0.01076591, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.03429425, + "balance_loss_mlp": 1.01922798, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.7181560794194892, + "language_loss": 0.85026169, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87135398, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 2.7782418727874756 + }, + { + "auxiliary_loss_clip": 0.01086159, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.0355829, + "balance_loss_mlp": 1.01977563, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.6129430421027464, + "language_loss": 0.78521717, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80639666, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.6332600116729736 + }, + { + "auxiliary_loss_clip": 0.01092912, + "auxiliary_loss_mlp": 0.01028003, + "balance_loss_clip": 1.0362438, + "balance_loss_mlp": 1.01593852, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 2.2384011914723287, + "language_loss": 0.75511646, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77632558, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 2.5793139934539795 + }, + { + "auxiliary_loss_clip": 0.01104437, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.03589058, + "balance_loss_mlp": 1.01637137, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5691226744314033, + "language_loss": 0.69375706, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.71508276, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 4.161816596984863 + }, + { + "auxiliary_loss_clip": 0.01086968, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.03633046, + "balance_loss_mlp": 1.01751375, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.5374932991866315, + "language_loss": 0.73231423, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75348878, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 2.6803832054138184 + }, + { + "auxiliary_loss_clip": 0.01097114, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.03872705, + "balance_loss_mlp": 1.02102375, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.445016574768982, + "language_loss": 0.78328502, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80458462, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 2.647582769393921 + }, + { + "auxiliary_loss_clip": 0.01093449, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.0346694, + "balance_loss_mlp": 1.01692772, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.5962746013688767, + "language_loss": 0.70183921, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72305799, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 2.6160812377929688 + }, + { + "auxiliary_loss_clip": 0.01019739, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00765514, + "balance_loss_mlp": 1.00049484, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 4.376019853531922, + "language_loss": 0.63315815, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65337634, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 3.1029794216156006 + }, + { + "auxiliary_loss_clip": 0.01093556, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.03785205, + "balance_loss_mlp": 1.02098715, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.163160040847837, + "language_loss": 0.60628498, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62754309, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 2.696901321411133 + }, + { + "auxiliary_loss_clip": 0.01055578, + "auxiliary_loss_mlp": 0.0103719, + "balance_loss_clip": 1.03236079, + "balance_loss_mlp": 1.02371359, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.717726108233446, + "language_loss": 0.68205035, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70297801, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.831923246383667 + }, + { + "auxiliary_loss_clip": 0.01080514, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.0352273, + "balance_loss_mlp": 1.01535082, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 1.646660758183683, + "language_loss": 0.67225116, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.69334245, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.7175209522247314 + }, + { + "auxiliary_loss_clip": 0.0106483, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.03262472, + "balance_loss_mlp": 1.02596569, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 1.8262392705347632, + "language_loss": 0.66517591, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.6862253, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.7023956775665283 + }, + { + "auxiliary_loss_clip": 0.01082067, + "auxiliary_loss_mlp": 0.01028861, + "balance_loss_clip": 1.03477871, + "balance_loss_mlp": 1.01667774, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.5799159318200193, + "language_loss": 0.74585509, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76696444, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.5738492012023926 + }, + { + "auxiliary_loss_clip": 0.01082472, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.03543842, + "balance_loss_mlp": 1.0241586, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.8448285039867112, + "language_loss": 0.68111795, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70231539, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.6043267250061035 + }, + { + "auxiliary_loss_clip": 0.01055272, + "auxiliary_loss_mlp": 0.01035287, + "balance_loss_clip": 1.03506625, + "balance_loss_mlp": 1.02162611, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 2.206883536767217, + "language_loss": 0.78476834, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.80567396, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.6510305404663086 + }, + { + "auxiliary_loss_clip": 0.01097146, + "auxiliary_loss_mlp": 0.00749525, + "balance_loss_clip": 1.0350728, + "balance_loss_mlp": 1.00009108, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.110833568150679, + "language_loss": 0.59796673, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.6164335, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 2.6261518001556396 + }, + { + "auxiliary_loss_clip": 0.01083016, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.03384805, + "balance_loss_mlp": 1.01821995, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.9852054346758814, + "language_loss": 0.76102221, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78216267, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.603438138961792 + }, + { + "auxiliary_loss_clip": 0.01067086, + "auxiliary_loss_mlp": 0.01034359, + "balance_loss_clip": 1.03590739, + "balance_loss_mlp": 1.02252793, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.7837780927563218, + "language_loss": 0.74822617, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.76924062, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 4.127391576766968 + }, + { + "auxiliary_loss_clip": 0.01095183, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.03920555, + "balance_loss_mlp": 1.0224247, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 2.144958314048867, + "language_loss": 0.66283268, + "learning_rate": 2.072229431544548e-06, + "loss": 0.68413198, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 2.6011769771575928 + }, + { + "auxiliary_loss_clip": 0.01052026, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.03519058, + "balance_loss_mlp": 1.02065563, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.8253750253040424, + "language_loss": 0.62784398, + "learning_rate": 2.071840222561051e-06, + "loss": 0.64868885, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 2.7716236114501953 + }, + { + "auxiliary_loss_clip": 0.01078039, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.03195763, + "balance_loss_mlp": 1.02262855, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.435938266303118, + "language_loss": 0.67521089, + "learning_rate": 2.071451010853365e-06, + "loss": 0.6963383, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.725726366043091 + }, + { + "auxiliary_loss_clip": 0.01094786, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.037642, + "balance_loss_mlp": 1.02155507, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 1.8840247989764636, + "language_loss": 0.62457436, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64587307, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 2.723494529724121 + }, + { + "auxiliary_loss_clip": 0.01064897, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.03364086, + "balance_loss_mlp": 1.02038765, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.8486090514233244, + "language_loss": 0.66878092, + "learning_rate": 2.070672579324465e-06, + "loss": 0.68975067, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.622190475463867 + }, + { + "auxiliary_loss_clip": 0.01090006, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.03585315, + "balance_loss_mlp": 1.02534127, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.7018557927454605, + "language_loss": 0.70798552, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.72925627, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.63870906829834 + }, + { + "auxiliary_loss_clip": 0.01091045, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.03411388, + "balance_loss_mlp": 1.01931274, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 1.8075736262521995, + "language_loss": 0.83051467, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85173452, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 2.5375096797943115 + }, + { + "auxiliary_loss_clip": 0.01095422, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.03806698, + "balance_loss_mlp": 1.02299392, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.7039184842226587, + "language_loss": 0.66618758, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.68750423, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.5710108280181885 + }, + { + "auxiliary_loss_clip": 0.01046175, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.03077888, + "balance_loss_mlp": 1.0205555, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.8030364551872886, + "language_loss": 0.8018747, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82265973, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.697582244873047 + }, + { + "auxiliary_loss_clip": 0.01095327, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.03623438, + "balance_loss_mlp": 1.01823258, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.5886093716653944, + "language_loss": 0.70159835, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72285551, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.6469554901123047 + }, + { + "auxiliary_loss_clip": 0.01073903, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.0342133, + "balance_loss_mlp": 1.02565718, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 2.3507073966446814, + "language_loss": 0.6955266, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71664888, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 2.61283802986145 + }, + { + "auxiliary_loss_clip": 0.01012247, + "auxiliary_loss_mlp": 0.00998987, + "balance_loss_clip": 1.01058602, + "balance_loss_mlp": 0.99776477, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.812296020166657, + "language_loss": 0.52978402, + "learning_rate": 2.067947985330974e-06, + "loss": 0.54989636, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 2.9766786098480225 + }, + { + "auxiliary_loss_clip": 0.0100185, + "auxiliary_loss_mlp": 0.01010671, + "balance_loss_clip": 1.01272988, + "balance_loss_mlp": 1.0094012, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8741035610340039, + "language_loss": 0.60764366, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62776887, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 2.9808707237243652 + }, + { + "auxiliary_loss_clip": 0.01065211, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.03196669, + "balance_loss_mlp": 1.02022338, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.764149162199026, + "language_loss": 0.84652007, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86748993, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 2.664614200592041 + }, + { + "auxiliary_loss_clip": 0.01070692, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.03367925, + "balance_loss_mlp": 1.01977921, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 1.9726481366257274, + "language_loss": 0.50271642, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.52373928, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 2.7452330589294434 + }, + { + "auxiliary_loss_clip": 0.01107221, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.0361048, + "balance_loss_mlp": 1.02355742, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.5016924121601714, + "language_loss": 0.747271, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.76871508, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 4.062511444091797 + }, + { + "auxiliary_loss_clip": 0.01084355, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.03278232, + "balance_loss_mlp": 1.02311254, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.0399925341557976, + "language_loss": 0.68180025, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.70300221, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 4.168480157852173 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.03902197, + "balance_loss_mlp": 1.02450025, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.7220293803888624, + "language_loss": 0.78921711, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81055582, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 2.614405632019043 + }, + { + "auxiliary_loss_clip": 0.01060158, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.03387439, + "balance_loss_mlp": 1.01821113, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 3.2637383088222336, + "language_loss": 0.66359746, + "learning_rate": 2.065223265084376e-06, + "loss": 0.6844998, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 2.6949355602264404 + }, + { + "auxiliary_loss_clip": 0.01096434, + "auxiliary_loss_mlp": 0.00749438, + "balance_loss_clip": 1.03880644, + "balance_loss_mlp": 1.00005198, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.5742361631747057, + "language_loss": 0.71776438, + "learning_rate": 2.064834009323688e-06, + "loss": 0.7362231, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.673673391342163 + }, + { + "auxiliary_loss_clip": 0.01073442, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_clip": 1.03474295, + "balance_loss_mlp": 1.03296876, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 2.2410793521633345, + "language_loss": 0.8161599, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83736408, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 2.7084262371063232 + }, + { + "auxiliary_loss_clip": 0.01069368, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.03735423, + "balance_loss_mlp": 1.02151132, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 2.6297836180611562, + "language_loss": 0.78744197, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.80849338, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 2.6429762840270996 + }, + { + "auxiliary_loss_clip": 0.01108878, + "auxiliary_loss_mlp": 0.00749435, + "balance_loss_clip": 1.03679562, + "balance_loss_mlp": 1.00011778, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.5663696784640884, + "language_loss": 0.69465786, + "learning_rate": 2.063666227349593e-06, + "loss": 0.71324104, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.599874496459961 + }, + { + "auxiliary_loss_clip": 0.01093692, + "auxiliary_loss_mlp": 0.00749528, + "balance_loss_clip": 1.03340483, + "balance_loss_mlp": 1.00012159, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.7651768106362753, + "language_loss": 0.69303286, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71146506, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 2.633701801300049 + }, + { + "auxiliary_loss_clip": 0.0109034, + "auxiliary_loss_mlp": 0.0103867, + "balance_loss_clip": 1.03396749, + "balance_loss_mlp": 1.02717805, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.5320906108534456, + "language_loss": 0.85824597, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87953615, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 4.1319615840911865 + }, + { + "auxiliary_loss_clip": 0.01072479, + "auxiliary_loss_mlp": 0.00749302, + "balance_loss_clip": 1.03508317, + "balance_loss_mlp": 1.00009561, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.7065957971932064, + "language_loss": 0.75675595, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77497375, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 2.616213798522949 + }, + { + "auxiliary_loss_clip": 0.01106518, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.03585041, + "balance_loss_mlp": 1.01475453, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 2.4583398763858453, + "language_loss": 0.73293436, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75427443, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 2.6539885997772217 + }, + { + "auxiliary_loss_clip": 0.01070424, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.03598118, + "balance_loss_mlp": 1.01988029, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 2.987891181854585, + "language_loss": 0.77252769, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79354823, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 2.6465256214141846 + }, + { + "auxiliary_loss_clip": 0.01063449, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.02980566, + "balance_loss_mlp": 1.02171254, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.8241159849700983, + "language_loss": 0.63123, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65220147, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.7558016777038574 + }, + { + "auxiliary_loss_clip": 0.01071238, + "auxiliary_loss_mlp": 0.01036305, + "balance_loss_clip": 1.03382695, + "balance_loss_mlp": 1.02113008, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.6985093667953677, + "language_loss": 0.63625431, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65732974, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 2.587726354598999 + }, + { + "auxiliary_loss_clip": 0.01081686, + "auxiliary_loss_mlp": 0.01026521, + "balance_loss_clip": 1.03630829, + "balance_loss_mlp": 1.01554823, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3813753124855674, + "language_loss": 0.71048611, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73156822, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.6933226585388184 + }, + { + "auxiliary_loss_clip": 0.01078079, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.03426003, + "balance_loss_mlp": 1.02390575, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.7387552618888846, + "language_loss": 0.79170346, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81285274, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.6729259490966797 + }, + { + "auxiliary_loss_clip": 0.01108021, + "auxiliary_loss_mlp": 0.01037241, + "balance_loss_clip": 1.03701425, + "balance_loss_mlp": 1.02355576, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.746372839861276, + "language_loss": 0.82004631, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.84149897, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.525895595550537 + }, + { + "auxiliary_loss_clip": 0.01085001, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.03578258, + "balance_loss_mlp": 1.02432251, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 7.970615858057281, + "language_loss": 0.80361021, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82482719, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.5759060382843018 + }, + { + "auxiliary_loss_clip": 0.01077075, + "auxiliary_loss_mlp": 0.00749478, + "balance_loss_clip": 1.03728437, + "balance_loss_mlp": 1.00012207, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.918341459476698, + "language_loss": 0.80183649, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82010198, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.6517045497894287 + }, + { + "auxiliary_loss_clip": 0.01090324, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.0310688, + "balance_loss_mlp": 1.01905072, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.215775057772063, + "language_loss": 0.61988324, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64110041, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 2.6739253997802734 + }, + { + "auxiliary_loss_clip": 0.01069898, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.0326736, + "balance_loss_mlp": 1.01737785, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 2.2034256458345123, + "language_loss": 0.81810725, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.83910143, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.6015636920928955 + }, + { + "auxiliary_loss_clip": 0.01074715, + "auxiliary_loss_mlp": 0.01036558, + "balance_loss_clip": 1.0379827, + "balance_loss_mlp": 1.02466083, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.6026184781895207, + "language_loss": 0.79008126, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81119394, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.6475067138671875 + }, + { + "auxiliary_loss_clip": 0.01051508, + "auxiliary_loss_mlp": 0.01035692, + "balance_loss_clip": 1.03094339, + "balance_loss_mlp": 1.02344322, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 2.028533393886105, + "language_loss": 0.62673527, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.64760727, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 2.6544084548950195 + }, + { + "auxiliary_loss_clip": 0.01074251, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.03397489, + "balance_loss_mlp": 1.02039361, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 1.7345519030112395, + "language_loss": 0.77325463, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79432356, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 4.072146892547607 + }, + { + "auxiliary_loss_clip": 0.01045349, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.03356349, + "balance_loss_mlp": 1.01864088, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.7700290712870252, + "language_loss": 0.77116418, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79193562, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 2.698107957839966 + }, + { + "auxiliary_loss_clip": 0.01106963, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.03602862, + "balance_loss_mlp": 1.02113056, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 2.011082505945341, + "language_loss": 0.77363229, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79504102, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 2.501289129257202 + }, + { + "auxiliary_loss_clip": 0.01086439, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.03235388, + "balance_loss_mlp": 1.01980543, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.557921586898012, + "language_loss": 0.66799319, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68919122, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.6006128787994385 + }, + { + "auxiliary_loss_clip": 0.01106392, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.03649688, + "balance_loss_mlp": 1.02075529, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.547598447587525, + "language_loss": 0.81552631, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83691871, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.57053804397583 + }, + { + "auxiliary_loss_clip": 0.01108662, + "auxiliary_loss_mlp": 0.00749683, + "balance_loss_clip": 1.03629816, + "balance_loss_mlp": 1.00009573, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.9520043099814652, + "language_loss": 0.74344492, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76202834, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 2.57238507270813 + }, + { + "auxiliary_loss_clip": 0.01104849, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.0357182, + "balance_loss_mlp": 1.0245645, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.4739629093835764, + "language_loss": 0.71511054, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73653167, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.5942940711975098 + }, + { + "auxiliary_loss_clip": 0.01064739, + "auxiliary_loss_mlp": 0.01039421, + "balance_loss_clip": 1.03167653, + "balance_loss_mlp": 1.02534199, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.8073590286599526, + "language_loss": 0.78336048, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80440211, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.653399705886841 + }, + { + "auxiliary_loss_clip": 0.01098265, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.03822505, + "balance_loss_mlp": 1.02241731, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 1.984737532801395, + "language_loss": 0.78318202, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80451292, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.5812950134277344 + }, + { + "auxiliary_loss_clip": 0.01102662, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.03435147, + "balance_loss_mlp": 1.01675034, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.6771150196694595, + "language_loss": 0.71901512, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.74033642, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 2.509328842163086 + }, + { + "auxiliary_loss_clip": 0.01088301, + "auxiliary_loss_mlp": 0.00749381, + "balance_loss_clip": 1.03311777, + "balance_loss_mlp": 1.00012267, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6339438022563406, + "language_loss": 0.82894325, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84732008, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 2.673116445541382 + }, + { + "auxiliary_loss_clip": 0.0107881, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.03694832, + "balance_loss_mlp": 1.02101064, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 4.944210471787853, + "language_loss": 0.73442912, + "learning_rate": 2.052765934536682e-06, + "loss": 0.7555654, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.72153639793396 + }, + { + "auxiliary_loss_clip": 0.01035309, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.02826858, + "balance_loss_mlp": 1.02767658, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.767530526179752, + "language_loss": 0.76627183, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78704536, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.656517505645752 + }, + { + "auxiliary_loss_clip": 0.01087859, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.0335449, + "balance_loss_mlp": 1.01994276, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.6128174934335635, + "language_loss": 0.72158033, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74278247, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 2.5267040729522705 + }, + { + "auxiliary_loss_clip": 0.00989081, + "auxiliary_loss_mlp": 0.01014993, + "balance_loss_clip": 1.00968027, + "balance_loss_mlp": 1.01337171, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 1.8070371550595734, + "language_loss": 0.63690472, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65694547, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 3.2548532485961914 + }, + { + "auxiliary_loss_clip": 0.01072676, + "auxiliary_loss_mlp": 0.01038737, + "balance_loss_clip": 1.03451848, + "balance_loss_mlp": 1.02637494, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 3.110421250514795, + "language_loss": 0.77648813, + "learning_rate": 2.051208614233681e-06, + "loss": 0.7976023, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 2.6655893325805664 + }, + { + "auxiliary_loss_clip": 0.0108386, + "auxiliary_loss_mlp": 0.0103311, + "balance_loss_clip": 1.03259611, + "balance_loss_mlp": 1.02046227, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.6202448536922038, + "language_loss": 0.70954192, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73071158, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 4.065747022628784 + }, + { + "auxiliary_loss_clip": 0.01099652, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.03698981, + "balance_loss_mlp": 1.0203495, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 1.8884363633324759, + "language_loss": 0.72268504, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74402022, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 4.094459295272827 + }, + { + "auxiliary_loss_clip": 0.01108524, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.0376718, + "balance_loss_mlp": 1.01869273, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.5592660096090298, + "language_loss": 0.83569336, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85710168, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.573338031768799 + }, + { + "auxiliary_loss_clip": 0.01094818, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.03587699, + "balance_loss_mlp": 1.01537943, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.7867999537253572, + "language_loss": 0.80560672, + "learning_rate": 2.049651262861309e-06, + "loss": 0.8268311, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 2.6639022827148438 + }, + { + "auxiliary_loss_clip": 0.01067728, + "auxiliary_loss_mlp": 0.01036093, + "balance_loss_clip": 1.03549814, + "balance_loss_mlp": 1.02175212, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.5009713143358647, + "language_loss": 0.79350764, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81454587, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 2.692214012145996 + }, + { + "auxiliary_loss_clip": 0.01069778, + "auxiliary_loss_mlp": 0.00749796, + "balance_loss_clip": 1.03200173, + "balance_loss_mlp": 1.00013554, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5971712199485835, + "language_loss": 0.70714319, + "learning_rate": 2.048872575819383e-06, + "loss": 0.72533894, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 2.649808168411255 + }, + { + "auxiliary_loss_clip": 0.0107867, + "auxiliary_loss_mlp": 0.01034708, + "balance_loss_clip": 1.0343411, + "balance_loss_mlp": 1.0217973, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.6721610568148326, + "language_loss": 0.70977247, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73090625, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.6420891284942627 + }, + { + "auxiliary_loss_clip": 0.01098049, + "auxiliary_loss_mlp": 0.00749636, + "balance_loss_clip": 1.03527725, + "balance_loss_mlp": 1.00012064, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 2.1650500873194236, + "language_loss": 0.63549173, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65396857, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 2.53066086769104 + }, + { + "auxiliary_loss_clip": 0.01052928, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.03253937, + "balance_loss_mlp": 1.0152849, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.6744897838406059, + "language_loss": 0.7130934, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73389047, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.6788582801818848 + }, + { + "auxiliary_loss_clip": 0.01011832, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02603507, + "balance_loss_mlp": 1.02684164, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.2137731250751216, + "language_loss": 0.62245452, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64298671, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 2.859957456588745 + }, + { + "auxiliary_loss_clip": 0.0107091, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.03202152, + "balance_loss_mlp": 1.01842403, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.7206080124077747, + "language_loss": 0.63746381, + "learning_rate": 2.046925826041012e-06, + "loss": 0.658481, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 4.2861762046813965 + }, + { + "auxiliary_loss_clip": 0.00993292, + "auxiliary_loss_mlp": 0.01002465, + "balance_loss_clip": 1.0094924, + "balance_loss_mlp": 1.00111818, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8495253409706544, + "language_loss": 0.6194033, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.6393609, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.1753342151641846 + }, + { + "auxiliary_loss_clip": 0.01067206, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.03264999, + "balance_loss_mlp": 1.01480579, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.6736107771404343, + "language_loss": 0.8069945, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82793427, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.584742307662964 + }, + { + "auxiliary_loss_clip": 0.01096204, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.03703427, + "balance_loss_mlp": 1.01845169, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.3422494070225393, + "language_loss": 0.70571035, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72698057, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 2.5996274948120117 + }, + { + "auxiliary_loss_clip": 0.01103785, + "auxiliary_loss_mlp": 0.00749229, + "balance_loss_clip": 1.03541434, + "balance_loss_mlp": 1.00001788, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.4152948546991062, + "language_loss": 0.71989083, + "learning_rate": 2.045368394099955e-06, + "loss": 0.7384209, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 2.64396071434021 + }, + { + "auxiliary_loss_clip": 0.01079882, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.03227997, + "balance_loss_mlp": 1.01629162, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.7010903837247933, + "language_loss": 0.73030746, + "learning_rate": 2.044979031776844e-06, + "loss": 0.75138736, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.6272594928741455 + }, + { + "auxiliary_loss_clip": 0.01106757, + "auxiliary_loss_mlp": 0.01028952, + "balance_loss_clip": 1.03552508, + "balance_loss_mlp": 1.01617301, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.5710953457860433, + "language_loss": 0.76874316, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79010022, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.57366943359375 + }, + { + "auxiliary_loss_clip": 0.01106014, + "auxiliary_loss_mlp": 0.01036333, + "balance_loss_clip": 1.03536749, + "balance_loss_mlp": 1.02438283, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.5910168956121689, + "language_loss": 0.84553266, + "learning_rate": 2.044200302028559e-06, + "loss": 0.86695623, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 2.5848677158355713 + }, + { + "auxiliary_loss_clip": 0.0111229, + "auxiliary_loss_mlp": 0.01035267, + "balance_loss_clip": 1.03810692, + "balance_loss_mlp": 1.02182007, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.7288147682877057, + "language_loss": 0.77372253, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.79519808, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.483123779296875 + }, + { + "auxiliary_loss_clip": 0.0107032, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.03378344, + "balance_loss_mlp": 1.02285337, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.604037168793703, + "language_loss": 0.76178515, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78283447, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 2.6630663871765137 + }, + { + "auxiliary_loss_clip": 0.01083667, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.0345304, + "balance_loss_mlp": 1.02448654, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 2.203892421928349, + "language_loss": 0.88890117, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91011542, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.7511041164398193 + }, + { + "auxiliary_loss_clip": 0.01091872, + "auxiliary_loss_mlp": 0.00749637, + "balance_loss_clip": 1.0361805, + "balance_loss_mlp": 1.00004077, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 1.5885000966723746, + "language_loss": 0.62218565, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64060068, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.711071252822876 + }, + { + "auxiliary_loss_clip": 0.01018752, + "auxiliary_loss_mlp": 0.01003762, + "balance_loss_clip": 1.00655973, + "balance_loss_mlp": 1.00253987, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.8079075876453174, + "language_loss": 0.62352788, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64375293, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 2.9879400730133057 + }, + { + "auxiliary_loss_clip": 0.01099386, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.0378871, + "balance_loss_mlp": 1.0170238, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.5756863817948146, + "language_loss": 0.67365408, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69494653, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 2.571803331375122 + }, + { + "auxiliary_loss_clip": 0.01094717, + "auxiliary_loss_mlp": 0.01030768, + "balance_loss_clip": 1.03326404, + "balance_loss_mlp": 1.01769626, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 2.1099655890238354, + "language_loss": 0.77792466, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.79917949, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 4.0330421924591064 + }, + { + "auxiliary_loss_clip": 0.01113614, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.03994036, + "balance_loss_mlp": 1.02149487, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 2.0531408103580384, + "language_loss": 0.80678475, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82826519, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 2.4682390689849854 + }, + { + "auxiliary_loss_clip": 0.01085109, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.03584623, + "balance_loss_mlp": 1.02312434, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.600049694212395, + "language_loss": 0.6883477, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70955014, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.616396903991699 + }, + { + "auxiliary_loss_clip": 0.01103865, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.0356195, + "balance_loss_mlp": 1.01697564, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.5977484241978197, + "language_loss": 0.7612412, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.78257036, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.5788261890411377 + }, + { + "auxiliary_loss_clip": 0.01066485, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.03420997, + "balance_loss_mlp": 1.02157605, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 2.2683631987454236, + "language_loss": 0.81372124, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83474094, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.5950002670288086 + }, + { + "auxiliary_loss_clip": 0.01086457, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.03270864, + "balance_loss_mlp": 1.02403808, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.6281549107894142, + "language_loss": 0.75919026, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78041589, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 2.546356678009033 + }, + { + "auxiliary_loss_clip": 0.01016811, + "auxiliary_loss_mlp": 0.01005107, + "balance_loss_clip": 1.00459421, + "balance_loss_mlp": 1.00394475, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6843614040030762, + "language_loss": 0.59402037, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61423951, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.229029417037964 + }, + { + "auxiliary_loss_clip": 0.01104249, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.03516114, + "balance_loss_mlp": 1.02199638, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.736601302079213, + "language_loss": 0.79653001, + "learning_rate": 2.038749012684354e-06, + "loss": 0.81791306, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 2.519144058227539 + }, + { + "auxiliary_loss_clip": 0.01093496, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.03388596, + "balance_loss_mlp": 1.01649451, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.4892818684615279, + "language_loss": 0.78398001, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80520368, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.653722047805786 + }, + { + "auxiliary_loss_clip": 0.01102943, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.03631818, + "balance_loss_mlp": 1.01688433, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.823381938261548, + "language_loss": 0.74747384, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76878512, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.5260798931121826 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.03578067, + "balance_loss_mlp": 1.01734304, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 4.825038578304422, + "language_loss": 0.78148329, + "learning_rate": 2.03758084040404e-06, + "loss": 0.80282462, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.4896926879882812 + }, + { + "auxiliary_loss_clip": 0.01088128, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.03651083, + "balance_loss_mlp": 1.01890898, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.5540468005440595, + "language_loss": 0.69654274, + "learning_rate": 2.037191446774109e-06, + "loss": 0.7177422, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 2.606325387954712 + }, + { + "auxiliary_loss_clip": 0.0108057, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.03462219, + "balance_loss_mlp": 1.02846003, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.5232841696361588, + "language_loss": 0.73627508, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75750691, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 2.55316424369812 + }, + { + "auxiliary_loss_clip": 0.01026836, + "auxiliary_loss_mlp": 0.00999688, + "balance_loss_clip": 1.00470781, + "balance_loss_mlp": 0.99839419, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7514407112412186, + "language_loss": 0.58109796, + "learning_rate": 2.036412655298103e-06, + "loss": 0.6013633, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.0477888584136963 + }, + { + "auxiliary_loss_clip": 0.01051345, + "auxiliary_loss_mlp": 0.01034545, + "balance_loss_clip": 1.0331651, + "balance_loss_mlp": 1.02296364, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 1.871730719598246, + "language_loss": 0.68950105, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71036005, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 2.6510837078094482 + }, + { + "auxiliary_loss_clip": 0.01082854, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.03555214, + "balance_loss_mlp": 1.01962793, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 1.730444799713993, + "language_loss": 0.85350168, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87464231, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 2.610700845718384 + }, + { + "auxiliary_loss_clip": 0.01082251, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.03566408, + "balance_loss_mlp": 1.02064323, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.1301872793807277, + "language_loss": 0.64688092, + "learning_rate": 2.035244457765222e-06, + "loss": 0.66803062, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.5184593200683594 + }, + { + "auxiliary_loss_clip": 0.01086158, + "auxiliary_loss_mlp": 0.01039398, + "balance_loss_clip": 1.03400648, + "balance_loss_mlp": 1.02627313, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 2.139478573844078, + "language_loss": 0.82215697, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84341252, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 4.08136248588562 + }, + { + "auxiliary_loss_clip": 0.01050322, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.0324086, + "balance_loss_mlp": 1.02566957, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 2.372322226623955, + "language_loss": 0.8123225, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83323473, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 4.191581726074219 + }, + { + "auxiliary_loss_clip": 0.01078009, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.03279114, + "balance_loss_mlp": 1.01710439, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.87709295099233, + "language_loss": 0.61702222, + "learning_rate": 2.034076248204082e-06, + "loss": 0.63811308, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.659698486328125 + }, + { + "auxiliary_loss_clip": 0.0109241, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.03696442, + "balance_loss_mlp": 1.02435744, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.6086561452155923, + "language_loss": 0.65732378, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.67860657, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.6204495429992676 + }, + { + "auxiliary_loss_clip": 0.01093438, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.03569233, + "balance_loss_mlp": 1.02338684, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.9181066814113252, + "language_loss": 0.69815528, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71944135, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.5309557914733887 + }, + { + "auxiliary_loss_clip": 0.0110833, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.03628552, + "balance_loss_mlp": 1.01945734, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.824550907238654, + "language_loss": 0.79257953, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81397855, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.540382146835327 + }, + { + "auxiliary_loss_clip": 0.0108974, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.03473353, + "balance_loss_mlp": 1.02029943, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.810151557083377, + "language_loss": 0.83303368, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85425526, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 2.574629306793213 + }, + { + "auxiliary_loss_clip": 0.01099708, + "auxiliary_loss_mlp": 0.00749466, + "balance_loss_clip": 1.03606367, + "balance_loss_mlp": 1.00012016, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.6458971190648182, + "language_loss": 0.85287595, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87136769, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.6486639976501465 + }, + { + "auxiliary_loss_clip": 0.01092679, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.03350854, + "balance_loss_mlp": 1.02186656, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 2.54592768602527, + "language_loss": 0.82915664, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85041904, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 2.5517303943634033 + }, + { + "auxiliary_loss_clip": 0.01077962, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.03310657, + "balance_loss_mlp": 1.01561868, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 1.8521557872482806, + "language_loss": 0.81732321, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83838892, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 4.124483823776245 + }, + { + "auxiliary_loss_clip": 0.0107592, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.03147829, + "balance_loss_mlp": 1.02221823, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 2.2552851914031393, + "language_loss": 0.7406795, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.7617805, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.5492398738861084 + }, + { + "auxiliary_loss_clip": 0.0105852, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.03186727, + "balance_loss_mlp": 1.02122676, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.4953884941056603, + "language_loss": 0.70034266, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72127223, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 2.5899012088775635 + }, + { + "auxiliary_loss_clip": 0.01083009, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.03502512, + "balance_loss_mlp": 1.02226698, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 2.4597564433997237, + "language_loss": 0.72820824, + "learning_rate": 2.030182134581827e-06, + "loss": 0.74939185, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 2.6244568824768066 + }, + { + "auxiliary_loss_clip": 0.01063562, + "auxiliary_loss_mlp": 0.00749537, + "balance_loss_clip": 1.03340042, + "balance_loss_mlp": 1.00010836, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.655035602551517, + "language_loss": 0.693802, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71193302, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.6840732097625732 + }, + { + "auxiliary_loss_clip": 0.01080174, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.03437185, + "balance_loss_mlp": 1.01919389, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.7994883937743062, + "language_loss": 0.73040235, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.75151396, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 2.7184696197509766 + }, + { + "auxiliary_loss_clip": 0.0107821, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.03406835, + "balance_loss_mlp": 1.01675534, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.7319435897890187, + "language_loss": 0.80560911, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82667673, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.604651927947998 + }, + { + "auxiliary_loss_clip": 0.01091259, + "auxiliary_loss_mlp": 0.01027924, + "balance_loss_clip": 1.03385723, + "balance_loss_mlp": 1.01627779, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.559486368543478, + "language_loss": 0.7937566, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81494844, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 2.5587100982666016 + }, + { + "auxiliary_loss_clip": 0.01071982, + "auxiliary_loss_mlp": 0.0104136, + "balance_loss_clip": 1.03414488, + "balance_loss_mlp": 1.02837217, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 1.827757052771791, + "language_loss": 0.77084684, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79198027, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.650409698486328 + }, + { + "auxiliary_loss_clip": 0.01060876, + "auxiliary_loss_mlp": 0.01030809, + "balance_loss_clip": 1.03341937, + "balance_loss_mlp": 1.01750517, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 2.1035646535731285, + "language_loss": 0.83563447, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85655129, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.650794744491577 + }, + { + "auxiliary_loss_clip": 0.01110963, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.0393889, + "balance_loss_mlp": 1.02521455, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 1.8977165634915154, + "language_loss": 0.79104429, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81252015, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 2.6319942474365234 + }, + { + "auxiliary_loss_clip": 0.01071726, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.03357875, + "balance_loss_mlp": 1.02010894, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.5778791661316867, + "language_loss": 0.783494, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80454332, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.7423622608184814 + }, + { + "auxiliary_loss_clip": 0.01093325, + "auxiliary_loss_mlp": 0.01028532, + "balance_loss_clip": 1.03597188, + "balance_loss_mlp": 1.01683176, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 2.075126589853714, + "language_loss": 0.78670347, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.80792207, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.614569902420044 + }, + { + "auxiliary_loss_clip": 0.01105293, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.03626442, + "balance_loss_mlp": 1.01861966, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.7870030684198195, + "language_loss": 0.81827772, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.83964372, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 2.5653038024902344 + }, + { + "auxiliary_loss_clip": 0.01066469, + "auxiliary_loss_mlp": 0.0074942, + "balance_loss_clip": 1.03220081, + "balance_loss_mlp": 1.00008416, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 2.1124582599256474, + "language_loss": 0.70453537, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.72269428, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 4.093623161315918 + }, + { + "auxiliary_loss_clip": 0.01049675, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.03433228, + "balance_loss_mlp": 1.02182603, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.6499150321184521, + "language_loss": 0.71886325, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.7397151, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 2.768874168395996 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01037773, + "balance_loss_clip": 1.03643441, + "balance_loss_mlp": 1.02388489, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.7829268575744583, + "language_loss": 0.62863672, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.65001655, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 2.5705137252807617 + }, + { + "auxiliary_loss_clip": 0.01105974, + "auxiliary_loss_mlp": 0.01034526, + "balance_loss_clip": 1.03409576, + "balance_loss_mlp": 1.02191329, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.988402638746741, + "language_loss": 0.87420607, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89561105, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 2.568476915359497 + }, + { + "auxiliary_loss_clip": 0.01090327, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.03343487, + "balance_loss_mlp": 1.02089739, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.5516608026145506, + "language_loss": 0.82649839, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84772658, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 2.598386526107788 + }, + { + "auxiliary_loss_clip": 0.01010481, + "auxiliary_loss_mlp": 0.01003345, + "balance_loss_clip": 1.00860512, + "balance_loss_mlp": 1.0019381, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8496469590276975, + "language_loss": 0.63877434, + "learning_rate": 2.023951320871339e-06, + "loss": 0.6589126, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 3.1580488681793213 + }, + { + "auxiliary_loss_clip": 0.01070384, + "auxiliary_loss_mlp": 0.00749425, + "balance_loss_clip": 1.03416204, + "balance_loss_mlp": 1.00010538, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 1.89444303475981, + "language_loss": 0.84189326, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86009133, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.625459909439087 + }, + { + "auxiliary_loss_clip": 0.01092558, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03545833, + "balance_loss_mlp": 1.01580405, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 3.195381394367825, + "language_loss": 0.75177091, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77297091, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.5925440788269043 + }, + { + "auxiliary_loss_clip": 0.01106136, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.03563166, + "balance_loss_mlp": 1.01883936, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.800437250491076, + "language_loss": 0.57752544, + "learning_rate": 2.022783015592131e-06, + "loss": 0.59890741, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 2.5246286392211914 + }, + { + "auxiliary_loss_clip": 0.01097374, + "auxiliary_loss_mlp": 0.01040304, + "balance_loss_clip": 1.03742599, + "balance_loss_mlp": 1.02726865, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.8219774908564965, + "language_loss": 0.85508341, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87646019, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 2.5273149013519287 + }, + { + "auxiliary_loss_clip": 0.01073107, + "auxiliary_loss_mlp": 0.00749403, + "balance_loss_clip": 1.0345937, + "balance_loss_mlp": 1.00010562, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 2.035924062035036, + "language_loss": 0.7185272, + "learning_rate": 2.022004141061709e-06, + "loss": 0.73675227, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 2.692275285720825 + }, + { + "auxiliary_loss_clip": 0.01102643, + "auxiliary_loss_mlp": 0.00749399, + "balance_loss_clip": 1.03522587, + "balance_loss_mlp": 1.00010717, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.8220560738525728, + "language_loss": 0.76380295, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78232336, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.5661325454711914 + }, + { + "auxiliary_loss_clip": 0.01105236, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.03746498, + "balance_loss_mlp": 1.02372146, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.5213645361402222, + "language_loss": 0.71516895, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73657274, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 2.6085169315338135 + }, + { + "auxiliary_loss_clip": 0.01077409, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.03516734, + "balance_loss_mlp": 1.01670182, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 1.8059421837692906, + "language_loss": 0.66093779, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68199533, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.5580544471740723 + }, + { + "auxiliary_loss_clip": 0.01039086, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.03130484, + "balance_loss_mlp": 1.02329397, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 3.0156407800281695, + "language_loss": 0.66793466, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.68869555, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 2.8154194355010986 + }, + { + "auxiliary_loss_clip": 0.01066965, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.03662205, + "balance_loss_mlp": 1.02014196, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.047848797669751, + "language_loss": 0.68955958, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71055579, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.641388416290283 + }, + { + "auxiliary_loss_clip": 0.01101631, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.03435516, + "balance_loss_mlp": 1.01781559, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.8034568629308392, + "language_loss": 0.66401434, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68531859, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 3.9956750869750977 + }, + { + "auxiliary_loss_clip": 0.01090841, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.03396702, + "balance_loss_mlp": 1.01802838, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 2.0109263019601324, + "language_loss": 0.74979365, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77099746, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 2.5283236503601074 + }, + { + "auxiliary_loss_clip": 0.0107726, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.03607142, + "balance_loss_mlp": 1.02113187, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 2.077460515562248, + "language_loss": 0.77974153, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80085135, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 4.092970848083496 + }, + { + "auxiliary_loss_clip": 0.0109753, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.03688776, + "balance_loss_mlp": 1.017784, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 5.121817463200922, + "language_loss": 0.73592365, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75719547, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.6694223880767822 + }, + { + "auxiliary_loss_clip": 0.01091542, + "auxiliary_loss_mlp": 0.01038107, + "balance_loss_clip": 1.03555417, + "balance_loss_mlp": 1.02594829, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.8400808906513442, + "language_loss": 0.78465068, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80594712, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.507516384124756 + }, + { + "auxiliary_loss_clip": 0.01106453, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.03716159, + "balance_loss_mlp": 1.02105045, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.6828003147560198, + "language_loss": 0.79391247, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81530797, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.5109710693359375 + }, + { + "auxiliary_loss_clip": 0.01088261, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.03562069, + "balance_loss_mlp": 1.02465689, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.9973616345102128, + "language_loss": 0.81500554, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83627307, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 2.552307367324829 + }, + { + "auxiliary_loss_clip": 0.01091271, + "auxiliary_loss_mlp": 0.01023442, + "balance_loss_clip": 1.03122997, + "balance_loss_mlp": 1.01088345, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.9806916572180622, + "language_loss": 0.68347132, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70461845, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.655073881149292 + }, + { + "auxiliary_loss_clip": 0.01080293, + "auxiliary_loss_mlp": 0.01038666, + "balance_loss_clip": 1.03696895, + "balance_loss_mlp": 1.02297783, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 1.8731000896764716, + "language_loss": 0.61917114, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.64036071, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.6968889236450195 + }, + { + "auxiliary_loss_clip": 0.01065305, + "auxiliary_loss_mlp": 0.0104065, + "balance_loss_clip": 1.03212023, + "balance_loss_mlp": 1.0282104, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 1.9562461129621, + "language_loss": 0.77635026, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.79740989, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 2.692078113555908 + }, + { + "auxiliary_loss_clip": 0.01081077, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.03536475, + "balance_loss_mlp": 1.02212107, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.871455530175229, + "language_loss": 0.7469551, + "learning_rate": 2.015773034588706e-06, + "loss": 0.76810431, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.607106924057007 + }, + { + "auxiliary_loss_clip": 0.01077818, + "auxiliary_loss_mlp": 0.01036563, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.02225208, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.976629016705642, + "language_loss": 0.74227464, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76341844, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 4.293464183807373 + }, + { + "auxiliary_loss_clip": 0.01096562, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.03611255, + "balance_loss_mlp": 1.02331495, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.7056874824682442, + "language_loss": 0.65180004, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67312354, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 2.5613138675689697 + }, + { + "auxiliary_loss_clip": 0.01082414, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.0382762, + "balance_loss_mlp": 1.02756393, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.492864419497771, + "language_loss": 0.74511826, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76632541, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.5613160133361816 + }, + { + "auxiliary_loss_clip": 0.0109204, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.0343101, + "balance_loss_mlp": 1.01882172, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.776070215136385, + "language_loss": 0.82831097, + "learning_rate": 2.014215231682995e-06, + "loss": 0.84953761, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.580012321472168 + }, + { + "auxiliary_loss_clip": 0.01052093, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.03165078, + "balance_loss_mlp": 1.02001512, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.8311296227103433, + "language_loss": 0.73700392, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75784659, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.6401588916778564 + }, + { + "auxiliary_loss_clip": 0.00999626, + "auxiliary_loss_mlp": 0.01004613, + "balance_loss_clip": 1.00883174, + "balance_loss_mlp": 1.00340295, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7462285658092126, + "language_loss": 0.60815775, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62820017, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 3.2268948554992676 + }, + { + "auxiliary_loss_clip": 0.01075104, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.03473461, + "balance_loss_mlp": 1.02276921, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6321419565873294, + "language_loss": 0.77011728, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79123235, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.529599189758301 + }, + { + "auxiliary_loss_clip": 0.01084449, + "auxiliary_loss_mlp": 0.01032437, + "balance_loss_clip": 1.03591359, + "balance_loss_mlp": 1.01988387, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 1.7499253374912507, + "language_loss": 0.67516667, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69633555, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.673793077468872 + }, + { + "auxiliary_loss_clip": 0.01076919, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.03758216, + "balance_loss_mlp": 1.02262294, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 2.2889714564203945, + "language_loss": 0.82117063, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84229577, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 2.645353317260742 + }, + { + "auxiliary_loss_clip": 0.01091934, + "auxiliary_loss_mlp": 0.01036808, + "balance_loss_clip": 1.03444839, + "balance_loss_mlp": 1.02372479, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.4282789449580435, + "language_loss": 0.63431042, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65559787, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.6095621585845947 + }, + { + "auxiliary_loss_clip": 0.01096652, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.0383873, + "balance_loss_mlp": 1.01550913, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.6860939070827603, + "language_loss": 0.69719779, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71844518, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.6296801567077637 + }, + { + "auxiliary_loss_clip": 0.01096527, + "auxiliary_loss_mlp": 0.01032411, + "balance_loss_clip": 1.03737009, + "balance_loss_mlp": 1.01963139, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.887535610764949, + "language_loss": 0.71071434, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73200369, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 2.613917112350464 + }, + { + "auxiliary_loss_clip": 0.01056263, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.03103733, + "balance_loss_mlp": 1.01839089, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 2.3347970001339027, + "language_loss": 0.8018859, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82276011, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 2.6299338340759277 + }, + { + "auxiliary_loss_clip": 0.0109326, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.03440762, + "balance_loss_mlp": 1.01493406, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.8472715405696143, + "language_loss": 0.7815243, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80273008, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.575448751449585 + }, + { + "auxiliary_loss_clip": 0.01078536, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.03216803, + "balance_loss_mlp": 1.02056956, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.6327829252199209, + "language_loss": 0.7587778, + "learning_rate": 2.009931232064105e-06, + "loss": 0.77989721, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 4.285281419754028 + }, + { + "auxiliary_loss_clip": 0.01057282, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.03410137, + "balance_loss_mlp": 1.01933563, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.236006351676418, + "language_loss": 0.7474128, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76831019, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 2.6651668548583984 + }, + { + "auxiliary_loss_clip": 0.01042179, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.03442395, + "balance_loss_mlp": 1.02234781, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.820787615804786, + "language_loss": 0.70624918, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72702247, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.6962647438049316 + }, + { + "auxiliary_loss_clip": 0.01084898, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.03542006, + "balance_loss_mlp": 1.01671803, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 3.549223849309923, + "language_loss": 0.79312044, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81425679, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.6295034885406494 + }, + { + "auxiliary_loss_clip": 0.01081935, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.03612363, + "balance_loss_mlp": 1.02191281, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 2.1521730095808476, + "language_loss": 0.67982483, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70099068, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.6623246669769287 + }, + { + "auxiliary_loss_clip": 0.0106437, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_clip": 1.03088093, + "balance_loss_mlp": 1.03631258, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.1886035358967173, + "language_loss": 0.72539222, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.7465387, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 2.632145643234253 + }, + { + "auxiliary_loss_clip": 0.01097264, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.03608525, + "balance_loss_mlp": 1.02736855, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 1.9797640087648158, + "language_loss": 0.81863856, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84001994, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 2.51288104057312 + }, + { + "auxiliary_loss_clip": 0.01092442, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.03488576, + "balance_loss_mlp": 1.02040958, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 2.69219165007353, + "language_loss": 0.73483825, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75609422, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.670379161834717 + }, + { + "auxiliary_loss_clip": 0.0109425, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.03416538, + "balance_loss_mlp": 1.03027141, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.6421861554669628, + "language_loss": 0.73486048, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75623238, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.5732219219207764 + }, + { + "auxiliary_loss_clip": 0.0106929, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.03396821, + "balance_loss_mlp": 1.02086091, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 4.289673821170031, + "language_loss": 0.81876892, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.83980173, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 2.6373255252838135 + }, + { + "auxiliary_loss_clip": 0.01096139, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.03780174, + "balance_loss_mlp": 1.01925457, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.7826084835544944, + "language_loss": 0.72010225, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74137074, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 2.6406989097595215 + }, + { + "auxiliary_loss_clip": 0.01095729, + "auxiliary_loss_mlp": 0.01035117, + "balance_loss_clip": 1.03684545, + "balance_loss_mlp": 1.02175951, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.513806672956362, + "language_loss": 0.76010776, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.78141618, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.557854652404785 + }, + { + "auxiliary_loss_clip": 0.0107718, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.03772461, + "balance_loss_mlp": 1.01761913, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.7020569933342944, + "language_loss": 0.6895116, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71058577, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 2.5920591354370117 + }, + { + "auxiliary_loss_clip": 0.0109704, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.03561234, + "balance_loss_mlp": 1.02422297, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.8757491396291863, + "language_loss": 0.74868536, + "learning_rate": 2.004868266210965e-06, + "loss": 0.77002549, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 2.615906000137329 + }, + { + "auxiliary_loss_clip": 0.01106475, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.03665113, + "balance_loss_mlp": 1.02682781, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.772917392960271, + "language_loss": 0.68286705, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70432305, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 2.5023863315582275 + }, + { + "auxiliary_loss_clip": 0.01097371, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.03490305, + "balance_loss_mlp": 1.02382803, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 1.9480262349898212, + "language_loss": 0.72939211, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75074708, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 4.140491485595703 + }, + { + "auxiliary_loss_clip": 0.01073688, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.03893757, + "balance_loss_mlp": 1.02248538, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.3127771274016484, + "language_loss": 0.74844599, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76953602, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.683570384979248 + }, + { + "auxiliary_loss_clip": 0.01075993, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.03692484, + "balance_loss_mlp": 1.02018881, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.760125742590333, + "language_loss": 0.8618446, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88292611, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 4.071072816848755 + }, + { + "auxiliary_loss_clip": 0.01083728, + "auxiliary_loss_mlp": 0.01037701, + "balance_loss_clip": 1.03201461, + "balance_loss_mlp": 1.02492738, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.922994648014739, + "language_loss": 0.88958836, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91080266, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.633043050765991 + }, + { + "auxiliary_loss_clip": 0.01103754, + "auxiliary_loss_mlp": 0.00749344, + "balance_loss_clip": 1.036479, + "balance_loss_mlp": 1.00007796, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.856288108139705, + "language_loss": 0.65236002, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67089105, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 2.558277130126953 + }, + { + "auxiliary_loss_clip": 0.01085905, + "auxiliary_loss_mlp": 0.00749353, + "balance_loss_clip": 1.03505576, + "balance_loss_mlp": 1.00011837, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.7480394387170948, + "language_loss": 0.63553584, + "learning_rate": 2.002142038838577e-06, + "loss": 0.6538884, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 2.5811049938201904 + }, + { + "auxiliary_loss_clip": 0.01104064, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.03544331, + "balance_loss_mlp": 1.01658511, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.5466073306119759, + "language_loss": 0.7005918, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72192055, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.500216245651245 + }, + { + "auxiliary_loss_clip": 0.01078813, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.03358316, + "balance_loss_mlp": 1.01907706, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.5357584990386222, + "language_loss": 0.66253471, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68362987, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.5904550552368164 + }, + { + "auxiliary_loss_clip": 0.0109938, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.03735745, + "balance_loss_mlp": 1.01937473, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.9285456748591803, + "language_loss": 0.77371281, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79502618, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 2.564133405685425 + }, + { + "auxiliary_loss_clip": 0.01099507, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.03644943, + "balance_loss_mlp": 1.01766944, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.022674709827724, + "language_loss": 0.82955945, + "learning_rate": 2.0005841925139e-06, + "loss": 0.8508746, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.596862316131592 + }, + { + "auxiliary_loss_clip": 0.01089542, + "auxiliary_loss_mlp": 0.01033299, + "balance_loss_clip": 1.03681302, + "balance_loss_mlp": 1.02054358, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.782327112803366, + "language_loss": 0.72840297, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.74963135, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.589078664779663 + }, + { + "auxiliary_loss_clip": 0.0109705, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.0367614, + "balance_loss_mlp": 1.01912165, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 1.8049123093315258, + "language_loss": 0.68407977, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70538568, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 4.012052059173584 + }, + { + "auxiliary_loss_clip": 0.01109094, + "auxiliary_loss_mlp": 0.00749531, + "balance_loss_clip": 1.03570688, + "balance_loss_mlp": 1.00022781, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 1.8582685017193081, + "language_loss": 0.78046525, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.79905152, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.586693525314331 + }, + { + "auxiliary_loss_clip": 0.01100103, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.03775263, + "balance_loss_mlp": 1.01612711, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.0924873671884505, + "language_loss": 0.78976464, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81106299, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.589179515838623 + }, + { + "auxiliary_loss_clip": 0.01084826, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.0360074, + "balance_loss_mlp": 1.01978016, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 3.0495301730268025, + "language_loss": 0.9092145, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93037868, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.596851110458374 + }, + { + "auxiliary_loss_clip": 0.0111056, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03805828, + "balance_loss_mlp": 1.02018201, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.819274182161722, + "language_loss": 0.76520997, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78664935, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 2.5448060035705566 + }, + { + "auxiliary_loss_clip": 0.01098898, + "auxiliary_loss_mlp": 0.01040572, + "balance_loss_clip": 1.03732347, + "balance_loss_mlp": 1.02642751, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.5520732005828624, + "language_loss": 0.73737025, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75876498, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.718270778656006 + }, + { + "auxiliary_loss_clip": 0.01013402, + "auxiliary_loss_mlp": 0.01003001, + "balance_loss_clip": 1.01113641, + "balance_loss_mlp": 1.00169587, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7793609303415546, + "language_loss": 0.52973706, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54990107, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.239333391189575 + }, + { + "auxiliary_loss_clip": 0.01094025, + "auxiliary_loss_mlp": 0.01037731, + "balance_loss_clip": 1.03765976, + "balance_loss_mlp": 1.0255661, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.6103813607512703, + "language_loss": 0.7675848, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.7889024, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 2.562380075454712 + }, + { + "auxiliary_loss_clip": 0.01096067, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.03742123, + "balance_loss_mlp": 1.01777935, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.8668543560728765, + "language_loss": 0.7698676, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79113138, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.614283323287964 + }, + { + "auxiliary_loss_clip": 0.01079844, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.03527987, + "balance_loss_mlp": 1.01746643, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.9687229224207912, + "language_loss": 0.85379863, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87489265, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 2.611755847930908 + }, + { + "auxiliary_loss_clip": 0.0109873, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.0366447, + "balance_loss_mlp": 1.02079439, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.766404664599203, + "language_loss": 0.76712799, + "learning_rate": 1.995910655193932e-06, + "loss": 0.78845066, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.5910298824310303 + }, + { + "auxiliary_loss_clip": 0.01061092, + "auxiliary_loss_mlp": 0.00749659, + "balance_loss_clip": 1.0346384, + "balance_loss_mlp": 1.00012934, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.2074621318435113, + "language_loss": 0.75788081, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77598834, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 2.658751964569092 + }, + { + "auxiliary_loss_clip": 0.01080906, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.03509521, + "balance_loss_mlp": 1.02815211, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.8012051296894824, + "language_loss": 0.81200117, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83324528, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.6053502559661865 + }, + { + "auxiliary_loss_clip": 0.01103748, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.03543711, + "balance_loss_mlp": 1.02183628, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 2.0337766362862295, + "language_loss": 0.76049435, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78187287, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 2.5747110843658447 + }, + { + "auxiliary_loss_clip": 0.01073979, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.03800917, + "balance_loss_mlp": 1.01994371, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.5907527842904603, + "language_loss": 0.78782988, + "learning_rate": 1.994352813122559e-06, + "loss": 0.8088913, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 2.6469566822052 + }, + { + "auxiliary_loss_clip": 0.01068921, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.03488624, + "balance_loss_mlp": 1.034073, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 3.7745604249966624, + "language_loss": 0.72853315, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74971747, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 4.175795316696167 + }, + { + "auxiliary_loss_clip": 0.01095846, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.03771341, + "balance_loss_mlp": 1.01753235, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.0678459444830164, + "language_loss": 0.73835635, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.75961101, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 2.5616071224212646 + }, + { + "auxiliary_loss_clip": 0.01068933, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.03573203, + "balance_loss_mlp": 1.02133989, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 2.1766954148632958, + "language_loss": 0.65948474, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68050778, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 2.628382921218872 + }, + { + "auxiliary_loss_clip": 0.01093572, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.03469491, + "balance_loss_mlp": 1.02121127, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.5093894235133394, + "language_loss": 0.75798452, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.77926427, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 2.547621011734009 + }, + { + "auxiliary_loss_clip": 0.01076828, + "auxiliary_loss_mlp": 0.01040683, + "balance_loss_clip": 1.03629112, + "balance_loss_mlp": 1.02761126, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 2.327421733935682, + "language_loss": 0.78829825, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.8094734, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 2.634951591491699 + }, + { + "auxiliary_loss_clip": 0.01086068, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.03450894, + "balance_loss_mlp": 1.02282929, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.249725054533682, + "language_loss": 0.80513525, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.8263424, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 2.4666082859039307 + }, + { + "auxiliary_loss_clip": 0.01086398, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.03663492, + "balance_loss_mlp": 1.02008784, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 1.6966198286554666, + "language_loss": 0.71637887, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73756778, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.50223970413208 + }, + { + "auxiliary_loss_clip": 0.01019226, + "auxiliary_loss_mlp": 0.01001079, + "balance_loss_clip": 1.00661135, + "balance_loss_mlp": 0.99963677, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7300289888760629, + "language_loss": 0.57825935, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59846234, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 3.071075677871704 + }, + { + "auxiliary_loss_clip": 0.01081256, + "auxiliary_loss_mlp": 0.01040988, + "balance_loss_clip": 1.03559959, + "balance_loss_mlp": 1.02764869, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.5941517175704947, + "language_loss": 0.75356829, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77479076, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.571779727935791 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.03705812, + "balance_loss_mlp": 1.02203631, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.5085037941447563, + "language_loss": 0.67485821, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69617814, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.519286870956421 + }, + { + "auxiliary_loss_clip": 0.0102231, + "auxiliary_loss_mlp": 0.01000723, + "balance_loss_clip": 1.00980735, + "balance_loss_mlp": 0.99926835, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.773320567938719, + "language_loss": 0.55823731, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57846761, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 3.0188565254211426 + }, + { + "auxiliary_loss_clip": 0.01079934, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_clip": 1.03515577, + "balance_loss_mlp": 1.01436949, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.5590601632221883, + "language_loss": 0.81615955, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83721626, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 2.5689682960510254 + }, + { + "auxiliary_loss_clip": 0.01090744, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.03665996, + "balance_loss_mlp": 1.01721621, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.9371846561140151, + "language_loss": 0.83096945, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85216784, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 2.6048424243927 + }, + { + "auxiliary_loss_clip": 0.01077969, + "auxiliary_loss_mlp": 0.0103729, + "balance_loss_clip": 1.03511047, + "balance_loss_mlp": 1.02361655, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.7443291274662653, + "language_loss": 0.69221359, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71336615, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.6533422470092773 + }, + { + "auxiliary_loss_clip": 0.01062015, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.03288007, + "balance_loss_mlp": 1.01557159, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.4058695781015491, + "language_loss": 0.77581608, + "learning_rate": 1.988510943586582e-06, + "loss": 0.7967236, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.666788339614868 + }, + { + "auxiliary_loss_clip": 0.01106974, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.03846085, + "balance_loss_mlp": 1.02427006, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.5655314551411144, + "language_loss": 0.65326583, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67469752, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 4.009212493896484 + }, + { + "auxiliary_loss_clip": 0.01064884, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.03603506, + "balance_loss_mlp": 1.02015913, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.5187038541280418, + "language_loss": 0.75383854, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.7748298, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 4.265533447265625 + }, + { + "auxiliary_loss_clip": 0.01106054, + "auxiliary_loss_mlp": 0.01022896, + "balance_loss_clip": 1.03649592, + "balance_loss_mlp": 1.01106429, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 2.0273795993177406, + "language_loss": 0.81263703, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83392656, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.548842191696167 + }, + { + "auxiliary_loss_clip": 0.01051024, + "auxiliary_loss_mlp": 0.01043176, + "balance_loss_clip": 1.03077102, + "balance_loss_mlp": 1.03030753, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.5460693830364232, + "language_loss": 0.75423515, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77517712, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.689277172088623 + }, + { + "auxiliary_loss_clip": 0.01087775, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.03815544, + "balance_loss_mlp": 1.01953304, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.554449404304085, + "language_loss": 0.72364438, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74483776, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 2.5986568927764893 + }, + { + "auxiliary_loss_clip": 0.01064469, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.03164172, + "balance_loss_mlp": 1.01907647, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.4511541781474684, + "language_loss": 0.74700141, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.7679677, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.6647579669952393 + }, + { + "auxiliary_loss_clip": 0.01097297, + "auxiliary_loss_mlp": 0.01038406, + "balance_loss_clip": 1.03741145, + "balance_loss_mlp": 1.02505422, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 2.345583849545911, + "language_loss": 0.83568323, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85704029, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 2.5320701599121094 + }, + { + "auxiliary_loss_clip": 0.0110709, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.03787851, + "balance_loss_mlp": 1.01869822, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.8405959481721965, + "language_loss": 0.74392146, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76530528, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.67256236076355 + }, + { + "auxiliary_loss_clip": 0.01086312, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.03675866, + "balance_loss_mlp": 1.02373028, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 8.332943102681606, + "language_loss": 0.72603106, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74725068, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.587613821029663 + }, + { + "auxiliary_loss_clip": 0.01090606, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.03814137, + "balance_loss_mlp": 1.02054489, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.8919295956165951, + "language_loss": 0.85376894, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87501502, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 2.628195285797119 + }, + { + "auxiliary_loss_clip": 0.01092414, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.03607035, + "balance_loss_mlp": 1.01568198, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 2.3490109865698763, + "language_loss": 0.6440835, + "learning_rate": 1.984226965411294e-06, + "loss": 0.66528177, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 4.1611857414245605 + }, + { + "auxiliary_loss_clip": 0.0107949, + "auxiliary_loss_mlp": 0.01030325, + "balance_loss_clip": 1.03479004, + "balance_loss_mlp": 1.01777852, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.4583862407547934, + "language_loss": 0.77999425, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80109251, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.6102476119995117 + }, + { + "auxiliary_loss_clip": 0.01097914, + "auxiliary_loss_mlp": 0.01037378, + "balance_loss_clip": 1.03802872, + "balance_loss_mlp": 1.02413392, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 1.7217125611276654, + "language_loss": 0.72094297, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74229592, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 2.5586965084075928 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.03854001, + "balance_loss_mlp": 1.02075267, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.8280098344459563, + "language_loss": 0.86199486, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88337177, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.5591840744018555 + }, + { + "auxiliary_loss_clip": 0.01091369, + "auxiliary_loss_mlp": 0.0103116, + "balance_loss_clip": 1.03354001, + "balance_loss_mlp": 1.01951337, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.9732739281626062, + "language_loss": 0.73722035, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75844562, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.570807933807373 + }, + { + "auxiliary_loss_clip": 0.01112173, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.03821254, + "balance_loss_mlp": 1.01816916, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 2.3334056436607584, + "language_loss": 0.66752654, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.68897092, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.5159449577331543 + }, + { + "auxiliary_loss_clip": 0.01105666, + "auxiliary_loss_mlp": 0.01036874, + "balance_loss_clip": 1.03598785, + "balance_loss_mlp": 1.02423191, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.449424828261367, + "language_loss": 0.77000809, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79143357, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 2.538830041885376 + }, + { + "auxiliary_loss_clip": 0.01096464, + "auxiliary_loss_mlp": 0.01038135, + "balance_loss_clip": 1.03520966, + "balance_loss_mlp": 1.02592778, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.8995135084862496, + "language_loss": 0.81770885, + "learning_rate": 1.981500833922294e-06, + "loss": 0.83905482, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.5340688228607178 + }, + { + "auxiliary_loss_clip": 0.01110727, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.03982425, + "balance_loss_mlp": 1.02478302, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.9312024812434156, + "language_loss": 0.66044432, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68193138, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 2.5095272064208984 + }, + { + "auxiliary_loss_clip": 0.01082995, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.03674603, + "balance_loss_mlp": 1.02106345, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.9869074798335644, + "language_loss": 0.86542988, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88659978, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.5123372077941895 + }, + { + "auxiliary_loss_clip": 0.01096693, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.03816915, + "balance_loss_mlp": 1.02468145, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 3.9410954652732166, + "language_loss": 0.80933946, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.83067358, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 2.587251663208008 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.00749466, + "balance_loss_clip": 1.04278433, + "balance_loss_mlp": 1.00021648, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 2.1413249484005723, + "language_loss": 0.75586462, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77439481, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.5441997051239014 + }, + { + "auxiliary_loss_clip": 0.01108118, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.03708076, + "balance_loss_mlp": 1.02069759, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 2.5270441251770674, + "language_loss": 0.69926304, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72068232, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 2.498706817626953 + }, + { + "auxiliary_loss_clip": 0.01019429, + "auxiliary_loss_mlp": 0.01002892, + "balance_loss_clip": 1.00745273, + "balance_loss_mlp": 1.00141954, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9627462400870868, + "language_loss": 0.67248333, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69270653, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 3.0632688999176025 + }, + { + "auxiliary_loss_clip": 0.01058037, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.03262162, + "balance_loss_mlp": 1.01765776, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 1.9797627911002993, + "language_loss": 0.79287326, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81375206, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 2.61276912689209 + }, + { + "auxiliary_loss_clip": 0.01040165, + "auxiliary_loss_mlp": 0.00749524, + "balance_loss_clip": 1.03421795, + "balance_loss_mlp": 1.00022244, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.6750433724617917, + "language_loss": 0.82054639, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.83844328, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 2.6427340507507324 + }, + { + "auxiliary_loss_clip": 0.01075206, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.03172898, + "balance_loss_mlp": 1.02331042, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 2.010520173826281, + "language_loss": 0.65949869, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.6806038, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 4.0536627769470215 + }, + { + "auxiliary_loss_clip": 0.01085762, + "auxiliary_loss_mlp": 0.01042398, + "balance_loss_clip": 1.03486776, + "balance_loss_mlp": 1.02902818, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 2.120464015650248, + "language_loss": 0.6047858, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62606734, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.568415403366089 + }, + { + "auxiliary_loss_clip": 0.01105655, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.03573668, + "balance_loss_mlp": 1.01975739, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7122005789799628, + "language_loss": 0.76213372, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.7835114, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.525306463241577 + }, + { + "auxiliary_loss_clip": 0.01055165, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.03058958, + "balance_loss_mlp": 1.02940321, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 1.9196297688544675, + "language_loss": 0.70997608, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73095846, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 2.6495304107666016 + }, + { + "auxiliary_loss_clip": 0.01082607, + "auxiliary_loss_mlp": 0.0103284, + "balance_loss_clip": 1.03452182, + "balance_loss_mlp": 1.02150321, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.740983274715277, + "language_loss": 0.67592233, + "learning_rate": 1.976438113333184e-06, + "loss": 0.6970768, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.5785727500915527 + }, + { + "auxiliary_loss_clip": 0.01092716, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.03524327, + "balance_loss_mlp": 1.01881015, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 4.803770068215773, + "language_loss": 0.6990062, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72024465, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.5735037326812744 + }, + { + "auxiliary_loss_clip": 0.01110842, + "auxiliary_loss_mlp": 0.0074957, + "balance_loss_clip": 1.03880572, + "balance_loss_mlp": 1.00016892, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 3.9996455912809146, + "language_loss": 0.73013091, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.74873507, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 2.519425868988037 + }, + { + "auxiliary_loss_clip": 0.01084569, + "auxiliary_loss_mlp": 0.01029781, + "balance_loss_clip": 1.03800201, + "balance_loss_mlp": 1.01855755, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 1.7723844050845368, + "language_loss": 0.77323496, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79437852, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 2.5883352756500244 + }, + { + "auxiliary_loss_clip": 0.0109934, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.03819633, + "balance_loss_mlp": 1.01812077, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.274696197437407, + "language_loss": 0.74446595, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.7657665, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.5373010635375977 + }, + { + "auxiliary_loss_clip": 0.01094177, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.0347755, + "balance_loss_mlp": 1.02287531, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 2.17868276401251, + "language_loss": 0.80559504, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82689953, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 2.5805840492248535 + }, + { + "auxiliary_loss_clip": 0.01097682, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.03685772, + "balance_loss_mlp": 1.01843584, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.663759669579636, + "language_loss": 0.74593371, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76722169, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 2.58864426612854 + }, + { + "auxiliary_loss_clip": 0.01075167, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.03746152, + "balance_loss_mlp": 1.02003813, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 2.1230387764368284, + "language_loss": 0.79047817, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.81155407, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.610792875289917 + }, + { + "auxiliary_loss_clip": 0.01095973, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.03597677, + "balance_loss_mlp": 1.02011061, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 2.5973049455568815, + "language_loss": 0.80200797, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82328737, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.557985544204712 + }, + { + "auxiliary_loss_clip": 0.01103515, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.0366118, + "balance_loss_mlp": 1.01968169, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 2.1894882370602287, + "language_loss": 0.68788576, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.7092368, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.5617125034332275 + }, + { + "auxiliary_loss_clip": 0.01080805, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.03483033, + "balance_loss_mlp": 1.01992559, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.6307380768393647, + "language_loss": 0.77622598, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79735279, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 2.567929983139038 + }, + { + "auxiliary_loss_clip": 0.01106141, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03613329, + "balance_loss_mlp": 1.01814151, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 2.3594186115440268, + "language_loss": 0.71005851, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73141992, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 3.959780693054199 + }, + { + "auxiliary_loss_clip": 0.01071427, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.03540242, + "balance_loss_mlp": 1.02022398, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 2.252228335041602, + "language_loss": 0.76127934, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78231657, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 4.129587888717651 + }, + { + "auxiliary_loss_clip": 0.01071909, + "auxiliary_loss_mlp": 0.01029377, + "balance_loss_clip": 1.03332365, + "balance_loss_mlp": 1.01764059, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 2.0721720577520415, + "language_loss": 0.7520678, + "learning_rate": 1.971375543740272e-06, + "loss": 0.77308071, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 2.5782365798950195 + }, + { + "auxiliary_loss_clip": 0.01104958, + "auxiliary_loss_mlp": 0.01029059, + "balance_loss_clip": 1.03652918, + "balance_loss_mlp": 1.01679873, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.9366135980603054, + "language_loss": 0.77399504, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79533529, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.5075926780700684 + }, + { + "auxiliary_loss_clip": 0.01063468, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.03416014, + "balance_loss_mlp": 1.023121, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 2.0486953563746724, + "language_loss": 0.66536736, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68634957, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 2.649397850036621 + }, + { + "auxiliary_loss_clip": 0.01104151, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.03640974, + "balance_loss_mlp": 1.02286816, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.9024344893113248, + "language_loss": 0.76423597, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78561735, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 2.5736515522003174 + }, + { + "auxiliary_loss_clip": 0.01104073, + "auxiliary_loss_mlp": 0.0102983, + "balance_loss_clip": 1.03646386, + "balance_loss_mlp": 1.0183208, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.525890653091208, + "language_loss": 0.83261931, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85395837, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 2.619046926498413 + }, + { + "auxiliary_loss_clip": 0.01106697, + "auxiliary_loss_mlp": 0.01038316, + "balance_loss_clip": 1.0355885, + "balance_loss_mlp": 1.02577507, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.548451854044746, + "language_loss": 0.69959593, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72104609, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.6856160163879395 + }, + { + "auxiliary_loss_clip": 0.01094134, + "auxiliary_loss_mlp": 0.0074938, + "balance_loss_clip": 1.03545976, + "balance_loss_mlp": 1.00023627, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.5220975007350994, + "language_loss": 0.80425531, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82269037, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 2.62180757522583 + }, + { + "auxiliary_loss_clip": 0.01103889, + "auxiliary_loss_mlp": 0.01031581, + "balance_loss_clip": 1.03453159, + "balance_loss_mlp": 1.0195291, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.9428080336671787, + "language_loss": 0.77960432, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80095899, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 3.96701979637146 + }, + { + "auxiliary_loss_clip": 0.01096494, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.03700924, + "balance_loss_mlp": 1.02415609, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.7873047974377414, + "language_loss": 0.65782934, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.6791544, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 2.5376830101013184 + }, + { + "auxiliary_loss_clip": 0.01109566, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.03748417, + "balance_loss_mlp": 1.01948369, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 1.5696416142939835, + "language_loss": 0.71339047, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73481685, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.5465404987335205 + }, + { + "auxiliary_loss_clip": 0.01089123, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.03852892, + "balance_loss_mlp": 1.01780856, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.902059538721769, + "language_loss": 0.64249873, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66369843, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.585718870162964 + }, + { + "auxiliary_loss_clip": 0.01082813, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.03485298, + "balance_loss_mlp": 1.01794231, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.6731352430484427, + "language_loss": 0.70609206, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72723317, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.5874533653259277 + }, + { + "auxiliary_loss_clip": 0.01102161, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03419602, + "balance_loss_mlp": 1.01636434, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 2.1741889327110338, + "language_loss": 0.77596343, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79726648, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 2.5391998291015625 + }, + { + "auxiliary_loss_clip": 0.01052008, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.03558064, + "balance_loss_mlp": 1.02539659, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.660551328191122, + "language_loss": 0.78451312, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80541837, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.6760692596435547 + }, + { + "auxiliary_loss_clip": 0.01069304, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.03551495, + "balance_loss_mlp": 1.01778889, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 1.9049096147946816, + "language_loss": 0.70153791, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72254395, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 2.607210159301758 + }, + { + "auxiliary_loss_clip": 0.01072539, + "auxiliary_loss_mlp": 0.01038533, + "balance_loss_clip": 1.03428996, + "balance_loss_mlp": 1.026052, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 3.397947790680184, + "language_loss": 0.78821391, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80932462, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 2.6348838806152344 + }, + { + "auxiliary_loss_clip": 0.0109352, + "auxiliary_loss_mlp": 0.01039044, + "balance_loss_clip": 1.03576541, + "balance_loss_mlp": 1.02596104, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 1.8240229332218996, + "language_loss": 0.84402215, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86534786, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 2.577409505844116 + }, + { + "auxiliary_loss_clip": 0.01097007, + "auxiliary_loss_mlp": 0.01032884, + "balance_loss_clip": 1.03909314, + "balance_loss_mlp": 1.02142203, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 2.5675412853244177, + "language_loss": 0.66577888, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.68707776, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 2.501927614212036 + }, + { + "auxiliary_loss_clip": 0.01063219, + "auxiliary_loss_mlp": 0.01036459, + "balance_loss_clip": 1.03659272, + "balance_loss_mlp": 1.02432311, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 1.9646850115765564, + "language_loss": 0.7349937, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75599045, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.6407835483551025 + }, + { + "auxiliary_loss_clip": 0.01071566, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.03425539, + "balance_loss_mlp": 1.02498829, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.7118551230441097, + "language_loss": 0.71261424, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73370582, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 2.6332221031188965 + }, + { + "auxiliary_loss_clip": 0.01105795, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.03601038, + "balance_loss_mlp": 1.02351534, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.9469879437142794, + "language_loss": 0.83395821, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85537368, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 2.5692198276519775 + }, + { + "auxiliary_loss_clip": 0.01080413, + "auxiliary_loss_mlp": 0.01040143, + "balance_loss_clip": 1.03428566, + "balance_loss_mlp": 1.02625537, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 2.192131069718027, + "language_loss": 0.75287372, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77407926, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.5846168994903564 + }, + { + "auxiliary_loss_clip": 0.01104946, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.03698945, + "balance_loss_mlp": 1.02349353, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 2.270522617132737, + "language_loss": 0.77706301, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.798464, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 4.0812437534332275 + }, + { + "auxiliary_loss_clip": 0.010836, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.03369665, + "balance_loss_mlp": 1.02199697, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 3.889047736869623, + "language_loss": 0.70678759, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72796214, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 2.610278844833374 + }, + { + "auxiliary_loss_clip": 0.01087184, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.03425002, + "balance_loss_mlp": 1.01853967, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.642010342495608, + "language_loss": 0.69467521, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71586275, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 2.569754123687744 + }, + { + "auxiliary_loss_clip": 0.01073464, + "auxiliary_loss_mlp": 0.00749419, + "balance_loss_clip": 1.03344297, + "balance_loss_mlp": 1.00017262, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 1.540260168123416, + "language_loss": 0.76415813, + "learning_rate": 1.961640376626072e-06, + "loss": 0.78238702, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.6002728939056396 + }, + { + "auxiliary_loss_clip": 0.01081345, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.03623605, + "balance_loss_mlp": 1.02292168, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 8.460225632964612, + "language_loss": 0.76401377, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78517628, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.591557264328003 + }, + { + "auxiliary_loss_clip": 0.01095893, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.03840542, + "balance_loss_mlp": 1.02126074, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.844449780909931, + "language_loss": 0.71894288, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74022877, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 2.586505651473999 + }, + { + "auxiliary_loss_clip": 0.01092268, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.03767943, + "balance_loss_mlp": 1.0217123, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.7781669578361825, + "language_loss": 0.68059349, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70187461, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.586413860321045 + }, + { + "auxiliary_loss_clip": 0.01065923, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.03899908, + "balance_loss_mlp": 1.01968801, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.6313798441338863, + "language_loss": 0.8068673, + "learning_rate": 1.960082828259629e-06, + "loss": 0.82783771, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 2.6429221630096436 + }, + { + "auxiliary_loss_clip": 0.0108287, + "auxiliary_loss_mlp": 0.01027654, + "balance_loss_clip": 1.03393865, + "balance_loss_mlp": 1.0159055, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.035072163282444, + "language_loss": 0.63855577, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.659661, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 2.64119029045105 + }, + { + "auxiliary_loss_clip": 0.01083916, + "auxiliary_loss_mlp": 0.00749265, + "balance_loss_clip": 1.03812015, + "balance_loss_mlp": 1.0002439, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.7011693134215171, + "language_loss": 0.66628283, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68461466, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.6069769859313965 + }, + { + "auxiliary_loss_clip": 0.01066029, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.03357589, + "balance_loss_mlp": 1.02319908, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.012875682630006, + "language_loss": 0.75863314, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.77963728, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 2.6568033695220947 + }, + { + "auxiliary_loss_clip": 0.01077778, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.04072261, + "balance_loss_mlp": 1.02027047, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 2.1833368388317402, + "language_loss": 0.78131777, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80242348, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.638798236846924 + }, + { + "auxiliary_loss_clip": 0.01060928, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.03034234, + "balance_loss_mlp": 1.01835144, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.7904586724424894, + "language_loss": 0.71590734, + "learning_rate": 1.958135926969736e-06, + "loss": 0.7368089, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 2.6351478099823 + }, + { + "auxiliary_loss_clip": 0.01087493, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.03461289, + "balance_loss_mlp": 1.01790011, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.5811351637995605, + "language_loss": 0.74617141, + "learning_rate": 1.957746551415166e-06, + "loss": 0.76734686, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.5110292434692383 + }, + { + "auxiliary_loss_clip": 0.01077455, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.03305137, + "balance_loss_mlp": 1.01806843, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.061335033923913, + "language_loss": 0.85911155, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88019478, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.5647497177124023 + }, + { + "auxiliary_loss_clip": 0.01013472, + "auxiliary_loss_mlp": 0.010027, + "balance_loss_clip": 1.01417542, + "balance_loss_mlp": 1.00131691, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8695235677081972, + "language_loss": 0.63180327, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.6519649, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 4.541351795196533 + }, + { + "auxiliary_loss_clip": 0.01092205, + "auxiliary_loss_mlp": 0.01027853, + "balance_loss_clip": 1.0366919, + "balance_loss_mlp": 1.0165695, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.555905045196246, + "language_loss": 0.68981999, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71102059, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 2.622819662094116 + }, + { + "auxiliary_loss_clip": 0.01088824, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.03355527, + "balance_loss_mlp": 1.02004147, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.7670332993629034, + "language_loss": 0.65269828, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67390001, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 4.092331886291504 + }, + { + "auxiliary_loss_clip": 0.01080485, + "auxiliary_loss_mlp": 0.01038043, + "balance_loss_clip": 1.03261328, + "balance_loss_mlp": 1.02560353, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 4.857438996763209, + "language_loss": 0.69003129, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71121657, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 2.623244047164917 + }, + { + "auxiliary_loss_clip": 0.01107823, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.03959608, + "balance_loss_mlp": 1.02433014, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.6088143770869388, + "language_loss": 0.66193438, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.68337333, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.519045114517212 + }, + { + "auxiliary_loss_clip": 0.01103694, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.03557539, + "balance_loss_mlp": 1.023669, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.919586563886483, + "language_loss": 0.83041793, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85180914, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 2.4907474517822266 + }, + { + "auxiliary_loss_clip": 0.01080173, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.03514004, + "balance_loss_mlp": 1.01970387, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.8652749203449532, + "language_loss": 0.77784157, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79895216, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.5855093002319336 + }, + { + "auxiliary_loss_clip": 0.01061555, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.03077602, + "balance_loss_mlp": 1.02534842, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.5448594804838633, + "language_loss": 0.69203889, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71302009, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.7217483520507812 + }, + { + "auxiliary_loss_clip": 0.01071408, + "auxiliary_loss_mlp": 0.01039436, + "balance_loss_clip": 1.03401613, + "balance_loss_mlp": 1.02715111, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 2.1763387588242953, + "language_loss": 0.7612825, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78239095, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.6824376583099365 + }, + { + "auxiliary_loss_clip": 0.01080435, + "auxiliary_loss_mlp": 0.00749083, + "balance_loss_clip": 1.03110325, + "balance_loss_mlp": 1.00023198, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.6645942761822108, + "language_loss": 0.75812209, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77641726, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 2.6491732597351074 + }, + { + "auxiliary_loss_clip": 0.01085284, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.0389607, + "balance_loss_mlp": 1.02425027, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.7905592969169117, + "language_loss": 0.80511677, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.82632411, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 4.170694828033447 + }, + { + "auxiliary_loss_clip": 0.01082104, + "auxiliary_loss_mlp": 0.01028214, + "balance_loss_clip": 1.03575325, + "balance_loss_mlp": 1.01758671, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.7091020400416834, + "language_loss": 0.69845319, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.71955639, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.6424968242645264 + }, + { + "auxiliary_loss_clip": 0.01099757, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.03476989, + "balance_loss_mlp": 1.01990557, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.0994169345409253, + "language_loss": 0.8337121, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85501462, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 2.529118299484253 + }, + { + "auxiliary_loss_clip": 0.01090509, + "auxiliary_loss_mlp": 0.00749147, + "balance_loss_clip": 1.03501463, + "balance_loss_mlp": 1.00016749, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.572333870055082, + "language_loss": 0.72835159, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.74674809, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 2.5153305530548096 + }, + { + "auxiliary_loss_clip": 0.01075045, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.03147101, + "balance_loss_mlp": 1.01937962, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.9177097331593955, + "language_loss": 0.82899499, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85005581, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 2.644174337387085 + }, + { + "auxiliary_loss_clip": 0.01067919, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.03478503, + "balance_loss_mlp": 1.02322018, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.0577584925677863, + "language_loss": 0.79319727, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81422937, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.65444016456604 + }, + { + "auxiliary_loss_clip": 0.01098216, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03723335, + "balance_loss_mlp": 1.02119708, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 3.36742035048654, + "language_loss": 0.76673567, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78806186, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 2.6389546394348145 + }, + { + "auxiliary_loss_clip": 0.0108859, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.03440297, + "balance_loss_mlp": 1.01844883, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.5986825099189081, + "language_loss": 0.72652113, + "learning_rate": 1.950348737138691e-06, + "loss": 0.7476967, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 2.6098947525024414 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01038654, + "balance_loss_clip": 1.03766787, + "balance_loss_mlp": 1.02567828, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 1.8706337448631296, + "language_loss": 0.81984568, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84132195, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.5323753356933594 + }, + { + "auxiliary_loss_clip": 0.01000609, + "auxiliary_loss_mlp": 0.01024767, + "balance_loss_clip": 1.01200318, + "balance_loss_mlp": 1.02341425, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.8036047486617096, + "language_loss": 0.55697733, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57723105, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 3.2079551219940186 + }, + { + "auxiliary_loss_clip": 0.01043179, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.03446317, + "balance_loss_mlp": 1.02450764, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.610513166737519, + "language_loss": 0.73623085, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75702506, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 2.6782515048980713 + }, + { + "auxiliary_loss_clip": 0.01077942, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.03436565, + "balance_loss_mlp": 1.02313256, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.8994740204599487, + "language_loss": 0.70996559, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73109484, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 2.55688738822937 + }, + { + "auxiliary_loss_clip": 0.01074962, + "auxiliary_loss_mlp": 0.01029174, + "balance_loss_clip": 1.03617752, + "balance_loss_mlp": 1.0184449, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 2.0308722933126506, + "language_loss": 0.8047055, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82574683, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 2.6853575706481934 + }, + { + "auxiliary_loss_clip": 0.01088604, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.03307986, + "balance_loss_mlp": 1.0205735, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.7393185367189747, + "language_loss": 0.74403405, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76524216, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.58532977104187 + }, + { + "auxiliary_loss_clip": 0.01090549, + "auxiliary_loss_mlp": 0.0074941, + "balance_loss_clip": 1.03241563, + "balance_loss_mlp": 1.00026298, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 2.1189422703527687, + "language_loss": 0.73468888, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75308847, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 2.5462355613708496 + }, + { + "auxiliary_loss_clip": 0.01076035, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.03423595, + "balance_loss_mlp": 1.01972151, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.626883110158143, + "language_loss": 0.66761339, + "learning_rate": 1.947234065463318e-06, + "loss": 0.68869597, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 4.128460645675659 + }, + { + "auxiliary_loss_clip": 0.01086323, + "auxiliary_loss_mlp": 0.00749258, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.0002768, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 2.123986938596141, + "language_loss": 0.66875768, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68711346, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.6754989624023438 + }, + { + "auxiliary_loss_clip": 0.0107942, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.03406584, + "balance_loss_mlp": 1.01883507, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.8773151925646259, + "language_loss": 0.76501071, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78611261, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.6633553504943848 + }, + { + "auxiliary_loss_clip": 0.01098578, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.03616941, + "balance_loss_mlp": 1.02165735, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.052954320571404, + "language_loss": 0.77121055, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79255188, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 2.6901402473449707 + }, + { + "auxiliary_loss_clip": 0.01082009, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.03628635, + "balance_loss_mlp": 1.0216378, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.86256118408701, + "language_loss": 0.78420079, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80535042, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 2.5524420738220215 + }, + { + "auxiliary_loss_clip": 0.01088011, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.03750241, + "balance_loss_mlp": 1.01830649, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 2.1202265381163823, + "language_loss": 0.69998407, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.72117072, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 2.603250503540039 + }, + { + "auxiliary_loss_clip": 0.01022699, + "auxiliary_loss_mlp": 0.01005012, + "balance_loss_clip": 1.01120949, + "balance_loss_mlp": 1.00321221, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.7743455519465303, + "language_loss": 0.52501655, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54529363, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.193941354751587 + }, + { + "auxiliary_loss_clip": 0.01074961, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.03112936, + "balance_loss_mlp": 1.02179515, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.9101974810363331, + "language_loss": 0.74581277, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76689708, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.5666403770446777 + }, + { + "auxiliary_loss_clip": 0.01076754, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.03629351, + "balance_loss_mlp": 1.01492047, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.853680874652186, + "language_loss": 0.77250004, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79353291, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.602433204650879 + }, + { + "auxiliary_loss_clip": 0.0105142, + "auxiliary_loss_mlp": 0.01036057, + "balance_loss_clip": 1.02966321, + "balance_loss_mlp": 1.02241921, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 2.0091966361185865, + "language_loss": 0.83642817, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85730296, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 2.6856372356414795 + }, + { + "auxiliary_loss_clip": 0.01073406, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.03269935, + "balance_loss_mlp": 1.01857209, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.8096705749561353, + "language_loss": 0.69573808, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71677548, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 2.5800373554229736 + }, + { + "auxiliary_loss_clip": 0.01093945, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.03560448, + "balance_loss_mlp": 1.02235687, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.911982208059692, + "language_loss": 0.83022106, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85150433, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.5775420665740967 + }, + { + "auxiliary_loss_clip": 0.01105518, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.02831757, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.7544685192734122, + "language_loss": 0.6944139, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71587741, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.5501255989074707 + }, + { + "auxiliary_loss_clip": 0.01065224, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.03061831, + "balance_loss_mlp": 1.02232289, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.461225464261008, + "language_loss": 0.76520801, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.7862308, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.6242709159851074 + }, + { + "auxiliary_loss_clip": 0.01066865, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.03523993, + "balance_loss_mlp": 1.02270544, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 15.033924219906067, + "language_loss": 0.76065117, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78169256, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 4.092950105667114 + }, + { + "auxiliary_loss_clip": 0.01075709, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.03166842, + "balance_loss_mlp": 1.02183044, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.7602999203779939, + "language_loss": 0.71025378, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73134279, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.74265718460083 + }, + { + "auxiliary_loss_clip": 0.01103277, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.03537595, + "balance_loss_mlp": 1.02333426, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 7.4519885329500175, + "language_loss": 0.87138832, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89276087, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 2.584315299987793 + }, + { + "auxiliary_loss_clip": 0.01087563, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.03558433, + "balance_loss_mlp": 1.01976871, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 2.0348193911453154, + "language_loss": 0.61339962, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63458842, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 4.124040365219116 + }, + { + "auxiliary_loss_clip": 0.01074314, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.03433013, + "balance_loss_mlp": 1.02351213, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 2.318210974519009, + "language_loss": 0.72202277, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74312687, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.628181219100952 + }, + { + "auxiliary_loss_clip": 0.01087878, + "auxiliary_loss_mlp": 0.01029604, + "balance_loss_clip": 1.03355408, + "balance_loss_mlp": 1.0191319, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 2.0613453933123496, + "language_loss": 0.73050618, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75168097, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 2.570812940597534 + }, + { + "auxiliary_loss_clip": 0.01088694, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.03313041, + "balance_loss_mlp": 1.02508903, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6402700796262866, + "language_loss": 0.70198804, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72324598, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 2.6987860202789307 + }, + { + "auxiliary_loss_clip": 0.01043412, + "auxiliary_loss_mlp": 0.01037018, + "balance_loss_clip": 1.02977741, + "balance_loss_mlp": 1.02466786, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.8914520213154353, + "language_loss": 0.8676824, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88848674, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.718474864959717 + }, + { + "auxiliary_loss_clip": 0.01102365, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.03552532, + "balance_loss_mlp": 1.01793098, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.9312876727380468, + "language_loss": 0.79963303, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82095778, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.539290428161621 + }, + { + "auxiliary_loss_clip": 0.01092044, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.03807425, + "balance_loss_mlp": 1.02393878, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.0590647367963983, + "language_loss": 0.7493757, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77065837, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 2.572502374649048 + }, + { + "auxiliary_loss_clip": 0.01108329, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.0357784, + "balance_loss_mlp": 1.02030802, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.6052894775622137, + "language_loss": 0.70343763, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72485751, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.6262519359588623 + }, + { + "auxiliary_loss_clip": 0.00995854, + "auxiliary_loss_mlp": 0.01004812, + "balance_loss_clip": 1.00466847, + "balance_loss_mlp": 1.00283337, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7561831815450949, + "language_loss": 0.55644166, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57644832, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 4.728524208068848 + }, + { + "auxiliary_loss_clip": 0.01007168, + "auxiliary_loss_mlp": 0.01003021, + "balance_loss_clip": 1.01061153, + "balance_loss_mlp": 1.00154245, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.7915184035874421, + "language_loss": 0.58313954, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60324144, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.2235796451568604 + }, + { + "auxiliary_loss_clip": 0.01094818, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.03421617, + "balance_loss_mlp": 1.02078485, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3792331553001176, + "language_loss": 0.70439541, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72568357, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 2.7015724182128906 + }, + { + "auxiliary_loss_clip": 0.01092765, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.0340414, + "balance_loss_mlp": 1.01444411, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.5625318527117393, + "language_loss": 0.69427454, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71546197, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 2.651925802230835 + }, + { + "auxiliary_loss_clip": 0.01066894, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.03467476, + "balance_loss_mlp": 1.02296805, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.733598737639231, + "language_loss": 0.83381867, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85484052, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.677692174911499 + }, + { + "auxiliary_loss_clip": 0.01068364, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.03503931, + "balance_loss_mlp": 1.02349997, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.614690149081119, + "language_loss": 0.79671013, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81775343, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 2.7206814289093018 + }, + { + "auxiliary_loss_clip": 0.01088329, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.03390002, + "balance_loss_mlp": 1.01742625, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 2.140214339291668, + "language_loss": 0.8317858, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85295987, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.655609369277954 + }, + { + "auxiliary_loss_clip": 0.01093866, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.03596163, + "balance_loss_mlp": 1.02095485, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.5966204262171282, + "language_loss": 0.77587926, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79714155, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.596151113510132 + }, + { + "auxiliary_loss_clip": 0.01107938, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.0374043, + "balance_loss_mlp": 1.02091002, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 2.5075307150101724, + "language_loss": 0.81637883, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83779907, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 2.587937355041504 + }, + { + "auxiliary_loss_clip": 0.01077042, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.03505087, + "balance_loss_mlp": 1.01664114, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.4787508029284113, + "language_loss": 0.76972377, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79078054, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 2.650055170059204 + }, + { + "auxiliary_loss_clip": 0.01095153, + "auxiliary_loss_mlp": 0.01036597, + "balance_loss_clip": 1.03642273, + "balance_loss_mlp": 1.02546263, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.732310483692773, + "language_loss": 0.80139017, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82270765, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 2.634049654006958 + }, + { + "auxiliary_loss_clip": 0.01107485, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.03785181, + "balance_loss_mlp": 1.0199883, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.6943754011537133, + "language_loss": 0.69885683, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72026116, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 2.5989468097686768 + }, + { + "auxiliary_loss_clip": 0.0108256, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.03520942, + "balance_loss_mlp": 1.02051032, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.8733610112254455, + "language_loss": 0.77326155, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79441398, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 2.733292579650879 + }, + { + "auxiliary_loss_clip": 0.00998705, + "auxiliary_loss_mlp": 0.00746789, + "balance_loss_clip": 1.00654888, + "balance_loss_mlp": 0.99991369, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7405318020470799, + "language_loss": 0.54511231, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56256723, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.154151201248169 + }, + { + "auxiliary_loss_clip": 0.01075039, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.0345068, + "balance_loss_mlp": 1.01969647, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.564210928618323, + "language_loss": 0.85076714, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.87183571, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.745671033859253 + }, + { + "auxiliary_loss_clip": 0.01087761, + "auxiliary_loss_mlp": 0.00749489, + "balance_loss_clip": 1.03299928, + "balance_loss_mlp": 1.00023353, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 2.2556104195402473, + "language_loss": 0.69589263, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71426517, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.558229923248291 + }, + { + "auxiliary_loss_clip": 0.01085727, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.03609228, + "balance_loss_mlp": 1.02004957, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 2.1720247001280177, + "language_loss": 0.66434354, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68552804, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 4.210130453109741 + }, + { + "auxiliary_loss_clip": 0.01069964, + "auxiliary_loss_mlp": 0.01040428, + "balance_loss_clip": 1.03468847, + "balance_loss_mlp": 1.0265106, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.556442610855179, + "language_loss": 0.63350844, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65461236, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.6880533695220947 + }, + { + "auxiliary_loss_clip": 0.0101647, + "auxiliary_loss_mlp": 0.00999993, + "balance_loss_clip": 1.00521231, + "balance_loss_mlp": 0.99793112, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7715386344342383, + "language_loss": 0.54147696, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56164163, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 3.230442762374878 + }, + { + "auxiliary_loss_clip": 0.01091466, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.04027128, + "balance_loss_mlp": 1.0178628, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 3.6254383027100787, + "language_loss": 0.75737101, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77860457, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.6647756099700928 + }, + { + "auxiliary_loss_clip": 0.01090992, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.03451002, + "balance_loss_mlp": 1.02345371, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 2.0281148906862483, + "language_loss": 0.80948877, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83075166, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.6326236724853516 + }, + { + "auxiliary_loss_clip": 0.01091719, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.03453159, + "balance_loss_mlp": 1.02064657, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 1.819390332807058, + "language_loss": 0.75201195, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77325821, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": 2.61550235748291 + }, + { + "auxiliary_loss_clip": 0.01038211, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.03279471, + "balance_loss_mlp": 1.01764178, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 2.2602636282330963, + "language_loss": 0.82836878, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.8490479, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.726670026779175 + }, + { + "auxiliary_loss_clip": 0.01082367, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.03294706, + "balance_loss_mlp": 1.02028847, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 2.563121404887783, + "language_loss": 0.80601865, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82717645, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 2.642233371734619 + }, + { + "auxiliary_loss_clip": 0.01094519, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.03890538, + "balance_loss_mlp": 1.02297485, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.7494251026670444, + "language_loss": 0.7231555, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74445713, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 2.673799991607666 + }, + { + "auxiliary_loss_clip": 0.01082546, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.03409886, + "balance_loss_mlp": 1.02053165, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.6144364453329143, + "language_loss": 0.76196873, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78312147, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.712347984313965 + }, + { + "auxiliary_loss_clip": 0.01102053, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.03616655, + "balance_loss_mlp": 1.02303958, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 2.5579912556128717, + "language_loss": 0.75971639, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78108227, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.634511709213257 + }, + { + "auxiliary_loss_clip": 0.01097592, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.03738952, + "balance_loss_mlp": 1.01885557, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.8961848753321948, + "language_loss": 0.67974508, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70104551, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.618730068206787 + }, + { + "auxiliary_loss_clip": 0.01096321, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.03804374, + "balance_loss_mlp": 1.02172828, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.6014600654000921, + "language_loss": 0.84137082, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.8626681, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.6781132221221924 + }, + { + "auxiliary_loss_clip": 0.01094747, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.03740001, + "balance_loss_mlp": 1.02082109, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.163808357425774, + "language_loss": 0.86888134, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89016151, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.601217031478882 + }, + { + "auxiliary_loss_clip": 0.01001303, + "auxiliary_loss_mlp": 0.01012237, + "balance_loss_clip": 1.01192307, + "balance_loss_mlp": 1.01063919, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7214948442477429, + "language_loss": 0.58886766, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60900307, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 4.826445579528809 + }, + { + "auxiliary_loss_clip": 0.01073595, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.03310418, + "balance_loss_mlp": 1.02242374, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 2.079046145317864, + "language_loss": 0.70688498, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72797555, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 2.6524274349212646 + }, + { + "auxiliary_loss_clip": 0.01093762, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.03418624, + "balance_loss_mlp": 1.02024484, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.7693107185125516, + "language_loss": 0.88003659, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.90129793, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 4.309996128082275 + }, + { + "auxiliary_loss_clip": 0.01041025, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.03071642, + "balance_loss_mlp": 1.02045882, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 3.2181399384052964, + "language_loss": 0.76210302, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78284913, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 2.7357866764068604 + }, + { + "auxiliary_loss_clip": 0.01080329, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.03367507, + "balance_loss_mlp": 1.01506674, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 2.228909434626156, + "language_loss": 0.71465027, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.7357226, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 2.5953454971313477 + }, + { + "auxiliary_loss_clip": 0.0108498, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.03620625, + "balance_loss_mlp": 1.02130103, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 2.2419619521864598, + "language_loss": 0.75886577, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78006005, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.6460306644439697 + }, + { + "auxiliary_loss_clip": 0.01087205, + "auxiliary_loss_mlp": 0.00749356, + "balance_loss_clip": 1.03337932, + "balance_loss_mlp": 1.00020015, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.6268556535539793, + "language_loss": 0.70895481, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72732043, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.5926761627197266 + }, + { + "auxiliary_loss_clip": 0.01009961, + "auxiliary_loss_mlp": 0.0100287, + "balance_loss_clip": 1.0109446, + "balance_loss_mlp": 1.00140941, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9223992105315999, + "language_loss": 0.65366554, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67379385, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 3.0388293266296387 + }, + { + "auxiliary_loss_clip": 0.01093165, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.03485084, + "balance_loss_mlp": 1.01546645, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 5.774076011178883, + "language_loss": 0.70974159, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73095191, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 2.616882562637329 + }, + { + "auxiliary_loss_clip": 0.01057087, + "auxiliary_loss_mlp": 0.01035335, + "balance_loss_clip": 1.03114617, + "balance_loss_mlp": 1.02155435, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.7867655298951621, + "language_loss": 0.74262041, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76354462, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.616786241531372 + }, + { + "auxiliary_loss_clip": 0.01080685, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.0313623, + "balance_loss_mlp": 1.01774955, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.4317600233979004, + "language_loss": 0.85426199, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87537634, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 4.251254081726074 + }, + { + "auxiliary_loss_clip": 0.01109665, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.03889096, + "balance_loss_mlp": 1.02044785, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7792436877776066, + "language_loss": 0.79084754, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81228065, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 2.6011643409729004 + }, + { + "auxiliary_loss_clip": 0.01100056, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.03840756, + "balance_loss_mlp": 1.02176309, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 2.2055451265815744, + "language_loss": 0.73550534, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75685704, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.6535427570343018 + }, + { + "auxiliary_loss_clip": 0.01071814, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.0309068, + "balance_loss_mlp": 1.02633381, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 2.882201778293728, + "language_loss": 0.73716056, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.7582671, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 2.607691526412964 + }, + { + "auxiliary_loss_clip": 0.01065363, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.03246605, + "balance_loss_mlp": 1.01812768, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 2.213479408236827, + "language_loss": 0.73671633, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75767434, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.6268112659454346 + }, + { + "auxiliary_loss_clip": 0.0110747, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.03799713, + "balance_loss_mlp": 1.01840353, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 2.864946152187493, + "language_loss": 0.67657661, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.69795895, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.598283290863037 + }, + { + "auxiliary_loss_clip": 0.01092437, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.03555632, + "balance_loss_mlp": 1.02136493, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.7712367486662122, + "language_loss": 0.76853001, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78978813, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 2.622572183609009 + }, + { + "auxiliary_loss_clip": 0.01085087, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.03318512, + "balance_loss_mlp": 1.03087401, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.7773592642429268, + "language_loss": 0.66152298, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68281114, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 2.609309196472168 + }, + { + "auxiliary_loss_clip": 0.01073274, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.03503728, + "balance_loss_mlp": 1.02667701, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 2.1441025458275247, + "language_loss": 0.85615295, + "learning_rate": 1.91881954765502e-06, + "loss": 0.87726772, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 2.744418144226074 + }, + { + "auxiliary_loss_clip": 0.01069573, + "auxiliary_loss_mlp": 0.01028248, + "balance_loss_clip": 1.03276086, + "balance_loss_mlp": 1.01679158, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.4779943890260134, + "language_loss": 0.79842675, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.81940496, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 2.6778433322906494 + }, + { + "auxiliary_loss_clip": 0.01075387, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.03080273, + "balance_loss_mlp": 1.02325535, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.659192007576713, + "language_loss": 0.83268869, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85379815, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 2.6090123653411865 + }, + { + "auxiliary_loss_clip": 0.0107737, + "auxiliary_loss_mlp": 0.01027693, + "balance_loss_clip": 1.03430176, + "balance_loss_mlp": 1.01521754, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 2.56583799255906, + "language_loss": 0.68012512, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70117569, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.6037545204162598 + }, + { + "auxiliary_loss_clip": 0.0107669, + "auxiliary_loss_mlp": 0.01037276, + "balance_loss_clip": 1.03473437, + "balance_loss_mlp": 1.02541471, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 1.4304761458771484, + "language_loss": 0.82230616, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84344578, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.614116907119751 + }, + { + "auxiliary_loss_clip": 0.01096296, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.03770399, + "balance_loss_mlp": 1.02342415, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 1.9091970889296532, + "language_loss": 0.79289818, + "learning_rate": 1.916873882856013e-06, + "loss": 0.81422502, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.6140964031219482 + }, + { + "auxiliary_loss_clip": 0.01087284, + "auxiliary_loss_mlp": 0.0103342, + "balance_loss_clip": 1.03274977, + "balance_loss_mlp": 1.02244115, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.286596412324833, + "language_loss": 0.76600623, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78721333, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 2.593886613845825 + }, + { + "auxiliary_loss_clip": 0.01077436, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.03713107, + "balance_loss_mlp": 1.01485777, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.46731131736536, + "language_loss": 0.69050181, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71155447, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.79111385345459 + }, + { + "auxiliary_loss_clip": 0.01090148, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.03395534, + "balance_loss_mlp": 1.02468526, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.8666291904978816, + "language_loss": 0.72260761, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74386346, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 4.124256372451782 + }, + { + "auxiliary_loss_clip": 0.01076783, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.03422308, + "balance_loss_mlp": 1.02105331, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.8225725827919907, + "language_loss": 0.68331575, + "learning_rate": 1.915317407666982e-06, + "loss": 0.7044127, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.613607406616211 + }, + { + "auxiliary_loss_clip": 0.01100975, + "auxiliary_loss_mlp": 0.01044808, + "balance_loss_clip": 1.03757501, + "balance_loss_mlp": 1.0300914, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.7801390381421844, + "language_loss": 0.69797087, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71942872, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.777194023132324 + }, + { + "auxiliary_loss_clip": 0.0110773, + "auxiliary_loss_mlp": 0.01034513, + "balance_loss_clip": 1.03508067, + "balance_loss_mlp": 1.02087569, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.7023727695493682, + "language_loss": 0.74685782, + "learning_rate": 1.91453918928048e-06, + "loss": 0.76828027, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 2.5868735313415527 + }, + { + "auxiliary_loss_clip": 0.01095864, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.03629446, + "balance_loss_mlp": 1.01878154, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.675860486013062, + "language_loss": 0.83228433, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85356343, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.595731258392334 + }, + { + "auxiliary_loss_clip": 0.01065327, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.0318166, + "balance_loss_mlp": 1.01390338, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 2.870788628297091, + "language_loss": 0.83027434, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85117376, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.6634061336517334 + }, + { + "auxiliary_loss_clip": 0.0104747, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.03245306, + "balance_loss_mlp": 1.01578856, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.743870108893276, + "language_loss": 0.83492839, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85566986, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 2.705264091491699 + }, + { + "auxiliary_loss_clip": 0.01072113, + "auxiliary_loss_mlp": 0.01043025, + "balance_loss_clip": 1.0397923, + "balance_loss_mlp": 1.02926838, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 2.7435326427857776, + "language_loss": 0.7480762, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.76922762, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.8154237270355225 + }, + { + "auxiliary_loss_clip": 0.01094581, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.0354681, + "balance_loss_mlp": 1.02398014, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.5261833544132655, + "language_loss": 0.69830096, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71960843, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.6233248710632324 + }, + { + "auxiliary_loss_clip": 0.01100616, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.03467071, + "balance_loss_mlp": 1.01579964, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5605750342487688, + "language_loss": 0.78826213, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.80953836, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.566854953765869 + }, + { + "auxiliary_loss_clip": 0.01044634, + "auxiliary_loss_mlp": 0.01029114, + "balance_loss_clip": 1.03242993, + "balance_loss_mlp": 1.01735449, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 2.115294595595947, + "language_loss": 0.66481817, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68555564, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.683436632156372 + }, + { + "auxiliary_loss_clip": 0.01069577, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.03030539, + "balance_loss_mlp": 1.02522159, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 1.9195332087288266, + "language_loss": 0.79817677, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81924838, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 0.044983625411987305 + }, + { + "auxiliary_loss_clip": 0.01103933, + "auxiliary_loss_mlp": 0.01042245, + "balance_loss_clip": 1.03555059, + "balance_loss_mlp": 1.02922094, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 2.1048154035860644, + "language_loss": 0.84814042, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.8696022, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.481873035430908 + }, + { + "auxiliary_loss_clip": 0.01079616, + "auxiliary_loss_mlp": 0.01035088, + "balance_loss_clip": 1.03285599, + "balance_loss_mlp": 1.02156425, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 2.6632898927059836, + "language_loss": 0.67871463, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69986165, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.5500717163085938 + }, + { + "auxiliary_loss_clip": 0.01081526, + "auxiliary_loss_mlp": 0.01030253, + "balance_loss_clip": 1.03539205, + "balance_loss_mlp": 1.01820636, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 2.046228279471113, + "language_loss": 0.80433559, + "learning_rate": 1.910259223028374e-06, + "loss": 0.8254534, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.6044585704803467 + }, + { + "auxiliary_loss_clip": 0.01055603, + "auxiliary_loss_mlp": 0.01044833, + "balance_loss_clip": 1.03118646, + "balance_loss_mlp": 1.02924097, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 2.1532952116893957, + "language_loss": 0.69101006, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71201444, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 4.158057928085327 + }, + { + "auxiliary_loss_clip": 0.01081932, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.03317189, + "balance_loss_mlp": 1.0224613, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.7022077553618005, + "language_loss": 0.82548779, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84664512, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 4.086794376373291 + }, + { + "auxiliary_loss_clip": 0.01075839, + "auxiliary_loss_mlp": 0.00749733, + "balance_loss_clip": 1.03184247, + "balance_loss_mlp": 1.00024819, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 1.8789568996925334, + "language_loss": 0.70533258, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72358835, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 2.613525867462158 + }, + { + "auxiliary_loss_clip": 0.01091966, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.0381217, + "balance_loss_mlp": 1.02418566, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.9233263046838909, + "language_loss": 0.69570649, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71698427, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 2.6068673133850098 + }, + { + "auxiliary_loss_clip": 0.01004198, + "auxiliary_loss_mlp": 0.01003497, + "balance_loss_clip": 1.01167464, + "balance_loss_mlp": 1.00214434, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.9733648127295422, + "language_loss": 0.56942886, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.58950579, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 3.1137707233428955 + }, + { + "auxiliary_loss_clip": 0.01093398, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.04157698, + "balance_loss_mlp": 1.02382886, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5172599187291054, + "language_loss": 0.63537431, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.65666246, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.6494081020355225 + }, + { + "auxiliary_loss_clip": 0.01083538, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.03635645, + "balance_loss_mlp": 1.01729953, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.8626484909888505, + "language_loss": 0.6904918, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71162283, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 2.7850537300109863 + }, + { + "auxiliary_loss_clip": 0.01088655, + "auxiliary_loss_mlp": 0.00749345, + "balance_loss_clip": 1.03378463, + "balance_loss_mlp": 1.00014389, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.6749029153544646, + "language_loss": 0.7626034, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78098345, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.602834463119507 + }, + { + "auxiliary_loss_clip": 0.01019786, + "auxiliary_loss_mlp": 0.00999412, + "balance_loss_clip": 1.00750685, + "balance_loss_mlp": 0.99807703, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.8191928999606847, + "language_loss": 0.53003287, + "learning_rate": 1.906757737841291e-06, + "loss": 0.5502249, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 3.2234866619110107 + }, + { + "auxiliary_loss_clip": 0.01018695, + "auxiliary_loss_mlp": 0.01004732, + "balance_loss_clip": 1.00728893, + "balance_loss_mlp": 1.0030036, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7378191974377585, + "language_loss": 0.63857895, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65881324, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.130244493484497 + }, + { + "auxiliary_loss_clip": 0.0109642, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.03525114, + "balance_loss_mlp": 1.02024388, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 2.6365237817861096, + "language_loss": 0.72189403, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74317968, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 4.001817941665649 + }, + { + "auxiliary_loss_clip": 0.01064664, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.03591561, + "balance_loss_mlp": 1.01849151, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 4.39204185539261, + "language_loss": 0.68982899, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71076965, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 2.6498420238494873 + }, + { + "auxiliary_loss_clip": 0.01089862, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.03413761, + "balance_loss_mlp": 1.02157295, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 2.0006061210000476, + "language_loss": 0.86443764, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88566339, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.588839530944824 + }, + { + "auxiliary_loss_clip": 0.01098035, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.03670585, + "balance_loss_mlp": 1.02341819, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.587984168328821, + "language_loss": 0.64145058, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66279471, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 2.7491455078125 + }, + { + "auxiliary_loss_clip": 0.01101714, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.03542495, + "balance_loss_mlp": 1.02541304, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.5587338873043717, + "language_loss": 0.6804328, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70182103, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 2.657912015914917 + }, + { + "auxiliary_loss_clip": 0.00999959, + "auxiliary_loss_mlp": 0.01014388, + "balance_loss_clip": 1.01771975, + "balance_loss_mlp": 1.01302898, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.7026136524106114, + "language_loss": 0.53374594, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55388939, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 3.313002824783325 + }, + { + "auxiliary_loss_clip": 0.0101023, + "auxiliary_loss_mlp": 0.01005054, + "balance_loss_clip": 1.00892329, + "balance_loss_mlp": 1.00379086, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7282696479694543, + "language_loss": 0.56397069, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58412349, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 3.2016005516052246 + }, + { + "auxiliary_loss_clip": 0.0104278, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.03194165, + "balance_loss_mlp": 1.02118993, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 6.645369458981193, + "language_loss": 0.81793112, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.83869231, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 2.6989309787750244 + }, + { + "auxiliary_loss_clip": 0.01107981, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.03843045, + "balance_loss_mlp": 1.01611042, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.645477386242408, + "language_loss": 0.85042727, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87178642, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.604682207107544 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.0377351, + "balance_loss_mlp": 1.02197146, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.0271414543450135, + "language_loss": 0.66192174, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68327165, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.58151912689209 + }, + { + "auxiliary_loss_clip": 0.01077576, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.03319931, + "balance_loss_mlp": 1.02222753, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.430885413585662, + "language_loss": 0.71874928, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.73986483, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 2.811225414276123 + }, + { + "auxiliary_loss_clip": 0.01074225, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.03459322, + "balance_loss_mlp": 1.02006388, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6492134530159082, + "language_loss": 0.65459514, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67567313, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.64644193649292 + }, + { + "auxiliary_loss_clip": 0.01055679, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.03291309, + "balance_loss_mlp": 1.01493537, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 2.5669807357958025, + "language_loss": 0.74730718, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.7681458, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 2.7258942127227783 + }, + { + "auxiliary_loss_clip": 0.01068209, + "auxiliary_loss_mlp": 0.01037642, + "balance_loss_clip": 1.03440869, + "balance_loss_mlp": 1.02316391, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 1.7961927334477694, + "language_loss": 0.82403219, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84509075, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 2.6251449584960938 + }, + { + "auxiliary_loss_clip": 0.01080948, + "auxiliary_loss_mlp": 0.01027161, + "balance_loss_clip": 1.03353798, + "balance_loss_mlp": 1.01608062, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.402925080060273, + "language_loss": 0.72668326, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74776435, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 2.622835636138916 + }, + { + "auxiliary_loss_clip": 0.01074219, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.03494394, + "balance_loss_mlp": 1.01756346, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.481995250644398, + "language_loss": 0.74362242, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76465178, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 4.243749618530273 + }, + { + "auxiliary_loss_clip": 0.01067911, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.03158152, + "balance_loss_mlp": 1.02104902, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.886464846948151, + "language_loss": 0.67083472, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69185239, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.647944688796997 + }, + { + "auxiliary_loss_clip": 0.01107942, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.03737056, + "balance_loss_mlp": 1.02334774, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.5011433403370622, + "language_loss": 0.69552875, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71697176, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.673841953277588 + }, + { + "auxiliary_loss_clip": 0.01078522, + "auxiliary_loss_mlp": 0.00749178, + "balance_loss_clip": 1.03557193, + "balance_loss_mlp": 1.00014222, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 1.9552700239997538, + "language_loss": 0.76436573, + "learning_rate": 1.898977700702689e-06, + "loss": 0.78264272, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.5872814655303955 + }, + { + "auxiliary_loss_clip": 0.01022445, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.02916121, + "balance_loss_mlp": 1.02610707, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 4.587945486755799, + "language_loss": 0.86022097, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.88083112, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 2.6898446083068848 + }, + { + "auxiliary_loss_clip": 0.01101597, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.03611314, + "balance_loss_mlp": 1.01903677, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.4802505094703762, + "language_loss": 0.64222062, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66354775, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 2.53132700920105 + }, + { + "auxiliary_loss_clip": 0.01081062, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.0354352, + "balance_loss_mlp": 1.0265696, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.731493884277163, + "language_loss": 0.6003623, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62156498, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.7851948738098145 + }, + { + "auxiliary_loss_clip": 0.01093005, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.03481317, + "balance_loss_mlp": 1.01637661, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.972214572068755, + "language_loss": 0.81128925, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83250785, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.600694417953491 + }, + { + "auxiliary_loss_clip": 0.01085754, + "auxiliary_loss_mlp": 0.01029465, + "balance_loss_clip": 1.03583598, + "balance_loss_mlp": 1.01790166, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.3783546711004653, + "language_loss": 0.78152168, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80267382, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 2.67460560798645 + }, + { + "auxiliary_loss_clip": 0.01093129, + "auxiliary_loss_mlp": 0.010275, + "balance_loss_clip": 1.0361464, + "balance_loss_mlp": 1.01559734, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.079040914519544, + "language_loss": 0.81298578, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83419216, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.62845778465271 + }, + { + "auxiliary_loss_clip": 0.0109201, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.03526139, + "balance_loss_mlp": 1.01635432, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 2.4157812228995117, + "language_loss": 0.7307772, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75197905, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 2.614088296890259 + }, + { + "auxiliary_loss_clip": 0.01066376, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.03288686, + "balance_loss_mlp": 1.021698, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 3.035913634954264, + "language_loss": 0.75532311, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77633297, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.728271007537842 + }, + { + "auxiliary_loss_clip": 0.01070623, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.03216863, + "balance_loss_mlp": 1.01677847, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.8170692897922986, + "language_loss": 0.73511583, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75612253, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.6877329349517822 + }, + { + "auxiliary_loss_clip": 0.01109239, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.03675342, + "balance_loss_mlp": 1.02595973, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.9287628776383516, + "language_loss": 0.77604395, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79753375, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.5610060691833496 + }, + { + "auxiliary_loss_clip": 0.0107995, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.03444982, + "balance_loss_mlp": 1.02272666, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.5926247169556673, + "language_loss": 0.72399986, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74515963, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.6386544704437256 + }, + { + "auxiliary_loss_clip": 0.01086783, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.0365119, + "balance_loss_mlp": 1.01789117, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 2.7446510290658623, + "language_loss": 0.80750549, + "learning_rate": 1.894310406375987e-06, + "loss": 0.82868147, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 5.766769647598267 + }, + { + "auxiliary_loss_clip": 0.01092474, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.03848863, + "balance_loss_mlp": 1.01685631, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 2.142464342412463, + "language_loss": 0.85775948, + "learning_rate": 1.893921490881035e-06, + "loss": 0.87898231, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 2.6005468368530273 + }, + { + "auxiliary_loss_clip": 0.01079774, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.03429103, + "balance_loss_mlp": 1.01991642, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.7954925974316924, + "language_loss": 0.72698003, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74808669, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 2.6239142417907715 + }, + { + "auxiliary_loss_clip": 0.01081483, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.03182673, + "balance_loss_mlp": 1.02561569, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.6221453012593612, + "language_loss": 0.76904869, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79023945, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 2.6180026531219482 + }, + { + "auxiliary_loss_clip": 0.01070557, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.0358274, + "balance_loss_mlp": 1.01668108, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.686905881458355, + "language_loss": 0.77447093, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79547286, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 2.695087432861328 + }, + { + "auxiliary_loss_clip": 0.01010191, + "auxiliary_loss_mlp": 0.01003976, + "balance_loss_clip": 1.00830114, + "balance_loss_mlp": 1.00266492, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 1.9375055029709043, + "language_loss": 0.56784624, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.5879879, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 3.258014440536499 + }, + { + "auxiliary_loss_clip": 0.01083581, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.03517807, + "balance_loss_mlp": 1.02602792, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.8114470926924804, + "language_loss": 0.73973322, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76096076, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.653923988342285 + }, + { + "auxiliary_loss_clip": 0.01007597, + "auxiliary_loss_mlp": 0.01000954, + "balance_loss_clip": 1.00679469, + "balance_loss_mlp": 0.99966109, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8903088959011373, + "language_loss": 0.61027932, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63036478, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 3.2800467014312744 + }, + { + "auxiliary_loss_clip": 0.01017629, + "auxiliary_loss_mlp": 0.01004302, + "balance_loss_clip": 1.00557709, + "balance_loss_mlp": 1.0029726, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.8427691013919726, + "language_loss": 0.62179065, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64200997, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.1173043251037598 + }, + { + "auxiliary_loss_clip": 0.01069648, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.03218448, + "balance_loss_mlp": 1.0225668, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 2.929297147377796, + "language_loss": 0.75380439, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77486801, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 2.6409530639648438 + }, + { + "auxiliary_loss_clip": 0.01092028, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.03494716, + "balance_loss_mlp": 1.02003872, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 4.637537855322696, + "language_loss": 0.75402802, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77526009, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 4.124504089355469 + }, + { + "auxiliary_loss_clip": 0.01082335, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.03515995, + "balance_loss_mlp": 1.01911283, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.7039261223637856, + "language_loss": 0.87895012, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.90007627, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 2.597184419631958 + }, + { + "auxiliary_loss_clip": 0.01063978, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.0338186, + "balance_loss_mlp": 1.02550745, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.149811434504223, + "language_loss": 0.74395311, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76498979, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.6099612712860107 + }, + { + "auxiliary_loss_clip": 0.01087484, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.03230214, + "balance_loss_mlp": 1.01871228, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 2.059818377478991, + "language_loss": 0.79590583, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81709808, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 2.610194444656372 + }, + { + "auxiliary_loss_clip": 0.01102365, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.03417778, + "balance_loss_mlp": 1.01774907, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4940091525154615, + "language_loss": 0.54916394, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57048547, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 2.6182100772857666 + }, + { + "auxiliary_loss_clip": 0.01089898, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.03442621, + "balance_loss_mlp": 1.02125597, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.8213226632548358, + "language_loss": 0.68420863, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70543993, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 2.5694727897644043 + }, + { + "auxiliary_loss_clip": 0.01007278, + "auxiliary_loss_mlp": 0.00746813, + "balance_loss_clip": 1.00516748, + "balance_loss_mlp": 0.9998461, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8050627218975254, + "language_loss": 0.62801009, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64555097, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 3.103181838989258 + }, + { + "auxiliary_loss_clip": 0.0109343, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.03338957, + "balance_loss_mlp": 1.019382, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.4562148628136975, + "language_loss": 0.79620481, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81745535, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.5232086181640625 + }, + { + "auxiliary_loss_clip": 0.01073642, + "auxiliary_loss_mlp": 0.01026125, + "balance_loss_clip": 1.03414488, + "balance_loss_mlp": 1.01512825, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.8092293575522176, + "language_loss": 0.73224324, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75324094, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 2.6644046306610107 + }, + { + "auxiliary_loss_clip": 0.01073446, + "auxiliary_loss_mlp": 0.00749283, + "balance_loss_clip": 1.02966833, + "balance_loss_mlp": 1.00017643, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 2.5026258698875528, + "language_loss": 0.64746243, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66568971, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.6360559463500977 + }, + { + "auxiliary_loss_clip": 0.01083597, + "auxiliary_loss_mlp": 0.010368, + "balance_loss_clip": 1.03540194, + "balance_loss_mlp": 1.02315676, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 2.307543443261118, + "language_loss": 0.77673566, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79793966, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.63973331451416 + }, + { + "auxiliary_loss_clip": 0.01071952, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.03539646, + "balance_loss_mlp": 1.01840186, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 2.571279643297326, + "language_loss": 0.71211708, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73315012, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 2.6493513584136963 + }, + { + "auxiliary_loss_clip": 0.0108813, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.03397405, + "balance_loss_mlp": 1.02429986, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.732696120922875, + "language_loss": 0.69315958, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71441728, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.675837993621826 + }, + { + "auxiliary_loss_clip": 0.01090693, + "auxiliary_loss_mlp": 0.01025841, + "balance_loss_clip": 1.03629494, + "balance_loss_mlp": 1.0151298, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 7.073212284576619, + "language_loss": 0.69409639, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71526176, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 2.5529801845550537 + }, + { + "auxiliary_loss_clip": 0.01085601, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.03802359, + "balance_loss_mlp": 1.02019906, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.6805793916901093, + "language_loss": 0.78015858, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80133188, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.6593644618988037 + }, + { + "auxiliary_loss_clip": 0.01061174, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.03403628, + "balance_loss_mlp": 1.02544594, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 2.8578368650679704, + "language_loss": 0.85467851, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87567711, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 4.240210056304932 + }, + { + "auxiliary_loss_clip": 0.010795, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.03222895, + "balance_loss_mlp": 1.01982665, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 6.628914552967692, + "language_loss": 0.61935616, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.64048934, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.607203245162964 + }, + { + "auxiliary_loss_clip": 0.01078211, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.03655207, + "balance_loss_mlp": 1.0245533, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 2.106284980259538, + "language_loss": 0.73339248, + "learning_rate": 1.883811143046377e-06, + "loss": 0.75454682, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 2.6460320949554443 + }, + { + "auxiliary_loss_clip": 0.01102826, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.03480339, + "balance_loss_mlp": 1.02682829, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.6072279990682998, + "language_loss": 0.64324331, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66465765, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.6161820888519287 + }, + { + "auxiliary_loss_clip": 0.01093952, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.03486884, + "balance_loss_mlp": 1.01740491, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 7.139252172001478, + "language_loss": 0.78091002, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80214477, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 2.6235852241516113 + }, + { + "auxiliary_loss_clip": 0.01091462, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.0338707, + "balance_loss_mlp": 1.01675081, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 2.059693973448193, + "language_loss": 0.73377597, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75497675, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.545250177383423 + }, + { + "auxiliary_loss_clip": 0.01079619, + "auxiliary_loss_mlp": 0.01035905, + "balance_loss_clip": 1.03274846, + "balance_loss_mlp": 1.02263713, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.5633862590544032, + "language_loss": 0.71979433, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74094951, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 2.762312650680542 + }, + { + "auxiliary_loss_clip": 0.01071301, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.0332551, + "balance_loss_mlp": 1.01946092, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.9553462571884723, + "language_loss": 0.78572667, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80675882, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.6607377529144287 + }, + { + "auxiliary_loss_clip": 0.01095572, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.03469896, + "balance_loss_mlp": 1.0255506, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.8434034195342086, + "language_loss": 0.75564373, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.77697843, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.5807039737701416 + }, + { + "auxiliary_loss_clip": 0.01086639, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.03717172, + "balance_loss_mlp": 1.02264905, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.9597713310877032, + "language_loss": 0.75430834, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.775536, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.66979718208313 + }, + { + "auxiliary_loss_clip": 0.01082923, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.0358274, + "balance_loss_mlp": 1.01931536, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.8536727614038317, + "language_loss": 0.72163475, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74277472, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.6168148517608643 + }, + { + "auxiliary_loss_clip": 0.0107992, + "auxiliary_loss_mlp": 0.01038252, + "balance_loss_clip": 1.0377233, + "balance_loss_mlp": 1.0254972, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.967435507599632, + "language_loss": 0.65023983, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67142153, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.6142423152923584 + }, + { + "auxiliary_loss_clip": 0.0108033, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.03500712, + "balance_loss_mlp": 1.0267812, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 2.3618859832017143, + "language_loss": 0.79857588, + "learning_rate": 1.879923326631099e-06, + "loss": 0.81976008, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.574155807495117 + }, + { + "auxiliary_loss_clip": 0.01094678, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.03633237, + "balance_loss_mlp": 1.01759887, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.7349203312696024, + "language_loss": 0.69674814, + "learning_rate": 1.879534569789582e-06, + "loss": 0.7179898, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.575227975845337 + }, + { + "auxiliary_loss_clip": 0.01028425, + "auxiliary_loss_mlp": 0.00998831, + "balance_loss_clip": 1.00700021, + "balance_loss_mlp": 0.99750197, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7245967214309273, + "language_loss": 0.59670746, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61698002, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 4.737492561340332 + }, + { + "auxiliary_loss_clip": 0.01093601, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.03535342, + "balance_loss_mlp": 1.02215993, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.967127998298098, + "language_loss": 0.7477181, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76899529, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 4.044584274291992 + }, + { + "auxiliary_loss_clip": 0.0101653, + "auxiliary_loss_mlp": 0.00999712, + "balance_loss_clip": 1.00711083, + "balance_loss_mlp": 0.99825776, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7554588406689808, + "language_loss": 0.57222569, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59238815, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 3.0214431285858154 + }, + { + "auxiliary_loss_clip": 0.01106593, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.03504729, + "balance_loss_mlp": 1.01853776, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.608309555479871, + "language_loss": 0.72033578, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74171424, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.5385382175445557 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.03567994, + "balance_loss_mlp": 1.02042091, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 3.044027483319315, + "language_loss": 0.83639169, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85778177, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 2.513428211212158 + }, + { + "auxiliary_loss_clip": 0.01033108, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.03041434, + "balance_loss_mlp": 1.01896429, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.407979864188947, + "language_loss": 0.79219413, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81283343, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 2.6963908672332764 + }, + { + "auxiliary_loss_clip": 0.01007472, + "auxiliary_loss_mlp": 0.01002126, + "balance_loss_clip": 1.00853229, + "balance_loss_mlp": 1.00090456, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7956005925694158, + "language_loss": 0.59264553, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61274159, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 3.1108005046844482 + }, + { + "auxiliary_loss_clip": 0.01010076, + "auxiliary_loss_mlp": 0.0100543, + "balance_loss_clip": 1.00875664, + "balance_loss_mlp": 1.00423837, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8665862829966005, + "language_loss": 0.63603061, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65618569, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 2.948786497116089 + }, + { + "auxiliary_loss_clip": 0.0105601, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.03098512, + "balance_loss_mlp": 1.019274, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 3.7028819443369234, + "language_loss": 0.82316768, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84404832, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 2.720611333847046 + }, + { + "auxiliary_loss_clip": 0.0106328, + "auxiliary_loss_mlp": 0.01036597, + "balance_loss_clip": 1.03099918, + "balance_loss_mlp": 1.0241518, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.7215536811261263, + "language_loss": 0.72262436, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74362314, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 2.608276844024658 + }, + { + "auxiliary_loss_clip": 0.0108398, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.03244793, + "balance_loss_mlp": 1.01735687, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 4.124107543376813, + "language_loss": 0.78693843, + "learning_rate": 1.87525854926798e-06, + "loss": 0.80808449, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 2.5988845825195312 + }, + { + "auxiliary_loss_clip": 0.01066904, + "auxiliary_loss_mlp": 0.00749403, + "balance_loss_clip": 1.03397179, + "balance_loss_mlp": 1.00021982, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.597057548042757, + "language_loss": 0.7491377, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76730072, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 4.377425670623779 + }, + { + "auxiliary_loss_clip": 0.01074969, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03076804, + "balance_loss_mlp": 1.01947534, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 2.2142113788337934, + "language_loss": 0.69137275, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.71244144, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.667618989944458 + }, + { + "auxiliary_loss_clip": 0.01096591, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.03397393, + "balance_loss_mlp": 1.02235162, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 8.881908202650196, + "language_loss": 0.77181208, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79312766, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 2.5646421909332275 + }, + { + "auxiliary_loss_clip": 0.01104719, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.03648508, + "balance_loss_mlp": 1.02523637, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 1.8304249514326836, + "language_loss": 0.69677979, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71820271, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 2.52606463432312 + }, + { + "auxiliary_loss_clip": 0.01107005, + "auxiliary_loss_mlp": 0.01039537, + "balance_loss_clip": 1.03504312, + "balance_loss_mlp": 1.02617371, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 2.532999220955917, + "language_loss": 0.77248341, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79394889, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.4975669384002686 + }, + { + "auxiliary_loss_clip": 0.01081368, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.03111601, + "balance_loss_mlp": 1.02341199, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.8385977632612522, + "language_loss": 0.74314612, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76431912, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 2.6250054836273193 + }, + { + "auxiliary_loss_clip": 0.01088525, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.03843474, + "balance_loss_mlp": 1.02167487, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.8254274598865157, + "language_loss": 0.87903595, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90025645, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 2.6179425716400146 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.0332222, + "balance_loss_mlp": 1.02203703, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.9182905984526273, + "language_loss": 0.72503555, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74635518, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01091243, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.03355861, + "balance_loss_mlp": 1.01891279, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 1.6937403988448652, + "language_loss": 0.75015903, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77138042, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.603001356124878 + }, + { + "auxiliary_loss_clip": 0.0106788, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.03144276, + "balance_loss_mlp": 1.01867223, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 2.512727436824187, + "language_loss": 0.76854563, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.78954029, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 2.5837535858154297 + }, + { + "auxiliary_loss_clip": 0.01073492, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.03394985, + "balance_loss_mlp": 1.01682162, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.7423157732257029, + "language_loss": 0.78837657, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80939984, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 2.5998411178588867 + }, + { + "auxiliary_loss_clip": 0.01092119, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.03446341, + "balance_loss_mlp": 1.01944196, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 2.0215351200898977, + "language_loss": 0.75718868, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.77842343, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.5780673027038574 + }, + { + "auxiliary_loss_clip": 0.01020027, + "auxiliary_loss_mlp": 0.01010199, + "balance_loss_clip": 1.00863147, + "balance_loss_mlp": 1.00895298, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.842952858502108, + "language_loss": 0.57993335, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60023558, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.2839763164520264 + }, + { + "auxiliary_loss_clip": 0.01082572, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.03608346, + "balance_loss_mlp": 1.0177623, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.879996194846863, + "language_loss": 0.69787359, + "learning_rate": 1.869817171696868e-06, + "loss": 0.71899313, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.7123169898986816 + }, + { + "auxiliary_loss_clip": 0.01077288, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.03103745, + "balance_loss_mlp": 1.01806188, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.6236208493359765, + "language_loss": 0.71630609, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73737609, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.6319544315338135 + }, + { + "auxiliary_loss_clip": 0.0106437, + "auxiliary_loss_mlp": 0.01030797, + "balance_loss_clip": 1.02938855, + "balance_loss_mlp": 1.01848292, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 2.0295735981858827, + "language_loss": 0.77115571, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79210734, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 2.651487350463867 + }, + { + "auxiliary_loss_clip": 0.01068022, + "auxiliary_loss_mlp": 0.01034515, + "balance_loss_clip": 1.03266191, + "balance_loss_mlp": 1.02275538, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.3536860760706109, + "language_loss": 0.69856608, + "learning_rate": 1.868651286721281e-06, + "loss": 0.7195915, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 4.179778099060059 + }, + { + "auxiliary_loss_clip": 0.01093064, + "auxiliary_loss_mlp": 0.00749292, + "balance_loss_clip": 1.03396249, + "balance_loss_mlp": 1.00017381, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.8080742666343435, + "language_loss": 0.72408342, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74250698, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.594703197479248 + }, + { + "auxiliary_loss_clip": 0.01081982, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.03647447, + "balance_loss_mlp": 1.02336597, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.9545116744014028, + "language_loss": 0.73534691, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75652063, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 2.6245813369750977 + }, + { + "auxiliary_loss_clip": 0.01085787, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.03337836, + "balance_loss_mlp": 1.02423668, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4413105020129995, + "language_loss": 0.83382916, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85502964, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.6096231937408447 + }, + { + "auxiliary_loss_clip": 0.01093715, + "auxiliary_loss_mlp": 0.00749311, + "balance_loss_clip": 1.03478861, + "balance_loss_mlp": 1.00014615, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 1.9082461211075172, + "language_loss": 0.74020588, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.75863618, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.590357780456543 + }, + { + "auxiliary_loss_clip": 0.01084487, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.03264952, + "balance_loss_mlp": 1.02280951, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 2.1666609541601454, + "language_loss": 0.76326525, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78447318, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.559283971786499 + }, + { + "auxiliary_loss_clip": 0.01072764, + "auxiliary_loss_mlp": 0.00749241, + "balance_loss_clip": 1.03211117, + "balance_loss_mlp": 1.00015163, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 1.8625067063211382, + "language_loss": 0.73816669, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.75638676, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.5896260738372803 + }, + { + "auxiliary_loss_clip": 0.01058906, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.03461838, + "balance_loss_mlp": 1.02017379, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 2.0338755533021744, + "language_loss": 0.84300721, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86390996, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.6014156341552734 + }, + { + "auxiliary_loss_clip": 0.01081766, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.0335443, + "balance_loss_mlp": 1.015939, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.7289897875046696, + "language_loss": 0.81907547, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.84017575, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.6814820766448975 + }, + { + "auxiliary_loss_clip": 0.01059705, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.03028107, + "balance_loss_mlp": 1.02663934, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 2.5882981736539192, + "language_loss": 0.6889751, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.70996052, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.6771934032440186 + }, + { + "auxiliary_loss_clip": 0.01078517, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.03352165, + "balance_loss_mlp": 1.02051806, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 2.0830562864185866, + "language_loss": 0.71298146, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73408753, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 2.5960614681243896 + }, + { + "auxiliary_loss_clip": 0.01074768, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.0353694, + "balance_loss_mlp": 1.02063882, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.7028981258856917, + "language_loss": 0.72298771, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74405432, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.638639211654663 + }, + { + "auxiliary_loss_clip": 0.01081027, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.03570521, + "balance_loss_mlp": 1.02149284, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.0175527899000367, + "language_loss": 0.70712185, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72828096, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 2.650418519973755 + }, + { + "auxiliary_loss_clip": 0.01071349, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.03166366, + "balance_loss_mlp": 1.02447319, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 3.043494482212083, + "language_loss": 0.74536419, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.76644623, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 2.754572868347168 + }, + { + "auxiliary_loss_clip": 0.01062114, + "auxiliary_loss_mlp": 0.00749396, + "balance_loss_clip": 1.03307557, + "balance_loss_mlp": 1.00020278, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.0746721512546378, + "language_loss": 0.72103566, + "learning_rate": 1.863211089308289e-06, + "loss": 0.73915076, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 5.898691415786743 + }, + { + "auxiliary_loss_clip": 0.01081643, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.03438711, + "balance_loss_mlp": 1.02120018, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.044712497077433, + "language_loss": 0.71660352, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73775542, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 2.6245100498199463 + }, + { + "auxiliary_loss_clip": 0.01084292, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.03675532, + "balance_loss_mlp": 1.01824594, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.416043420340176, + "language_loss": 0.74786454, + "learning_rate": 1.862434000299067e-06, + "loss": 0.76900697, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 2.628589630126953 + }, + { + "auxiliary_loss_clip": 0.01074338, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.03226721, + "balance_loss_mlp": 1.01978612, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 2.898497024641449, + "language_loss": 0.71652782, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73758572, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 2.5320000648498535 + }, + { + "auxiliary_loss_clip": 0.0108306, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.03142834, + "balance_loss_mlp": 1.02082491, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 2.7159186277084992, + "language_loss": 0.68627912, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.7074542, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 2.7465903759002686 + }, + { + "auxiliary_loss_clip": 0.01092679, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.03557825, + "balance_loss_mlp": 1.02066374, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 2.0699075569221668, + "language_loss": 0.81920779, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84045875, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.6400699615478516 + }, + { + "auxiliary_loss_clip": 0.01093766, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.03461266, + "balance_loss_mlp": 1.01557398, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.8992331170015995, + "language_loss": 0.76790154, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78910828, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 2.5820133686065674 + }, + { + "auxiliary_loss_clip": 0.01074296, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.03256655, + "balance_loss_mlp": 1.01902139, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.4543903432320038, + "language_loss": 0.70490932, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72596931, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 2.698617458343506 + }, + { + "auxiliary_loss_clip": 0.0107284, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.03391421, + "balance_loss_mlp": 1.02201557, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.756965757502965, + "language_loss": 0.87376255, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89484614, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.6717007160186768 + }, + { + "auxiliary_loss_clip": 0.01104813, + "auxiliary_loss_mlp": 0.01027278, + "balance_loss_clip": 1.03458655, + "balance_loss_mlp": 1.01567256, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 2.1937290237650275, + "language_loss": 0.77875137, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80007225, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 2.6540749073028564 + }, + { + "auxiliary_loss_clip": 0.01067839, + "auxiliary_loss_mlp": 0.01025957, + "balance_loss_clip": 1.03537917, + "balance_loss_mlp": 1.01519251, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.4300899986914166, + "language_loss": 0.67069721, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69163513, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 4.27095890045166 + }, + { + "auxiliary_loss_clip": 0.01090812, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.03156447, + "balance_loss_mlp": 1.01673627, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.8563869178928412, + "language_loss": 0.73331535, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75451064, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 2.5903525352478027 + }, + { + "auxiliary_loss_clip": 0.01074712, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.0309422, + "balance_loss_mlp": 1.01726687, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 2.2845375487556483, + "language_loss": 0.62579453, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.6468308, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.7188634872436523 + }, + { + "auxiliary_loss_clip": 0.01092073, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.03407836, + "balance_loss_mlp": 1.0218997, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.6572434198585337, + "language_loss": 0.65924317, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68049896, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.6490042209625244 + }, + { + "auxiliary_loss_clip": 0.0105235, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.03090382, + "balance_loss_mlp": 1.01477826, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4853290974610969, + "language_loss": 0.67092419, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69171691, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 2.7254796028137207 + }, + { + "auxiliary_loss_clip": 0.01047572, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.03160536, + "balance_loss_mlp": 1.02095306, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.5309586139036255, + "language_loss": 0.75704819, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77786994, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 2.7346079349517822 + }, + { + "auxiliary_loss_clip": 0.01064865, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.03393483, + "balance_loss_mlp": 1.02125204, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.685347594675115, + "language_loss": 0.66004455, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68102682, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.739597797393799 + }, + { + "auxiliary_loss_clip": 0.01087631, + "auxiliary_loss_mlp": 0.00749361, + "balance_loss_clip": 1.03509629, + "balance_loss_mlp": 1.00023222, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.784768026454002, + "language_loss": 0.82935584, + "learning_rate": 1.856606505975565e-06, + "loss": 0.84772569, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 2.6533515453338623 + }, + { + "auxiliary_loss_clip": 0.01064991, + "auxiliary_loss_mlp": 0.0103231, + "balance_loss_clip": 1.03210926, + "balance_loss_mlp": 1.01984692, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.747185544094189, + "language_loss": 0.80019575, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82116872, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 2.623570680618286 + }, + { + "auxiliary_loss_clip": 0.01092001, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.03386188, + "balance_loss_mlp": 1.02754188, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.696166148302465, + "language_loss": 0.8374182, + "learning_rate": 1.855829598084659e-06, + "loss": 0.85873127, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.5740790367126465 + }, + { + "auxiliary_loss_clip": 0.0107137, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.03547263, + "balance_loss_mlp": 1.01805675, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.3446720973444954, + "language_loss": 0.72712779, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74813676, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.822305679321289 + }, + { + "auxiliary_loss_clip": 0.0107521, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.03103244, + "balance_loss_mlp": 1.0192368, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.155997562884368, + "language_loss": 0.81459624, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83568013, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.5596845149993896 + }, + { + "auxiliary_loss_clip": 0.01107055, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.0355823, + "balance_loss_mlp": 1.01714802, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.3468989763171875, + "language_loss": 0.80728436, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.8286382, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.51666259765625 + }, + { + "auxiliary_loss_clip": 0.00997952, + "auxiliary_loss_mlp": 0.01010659, + "balance_loss_clip": 1.00685608, + "balance_loss_mlp": 1.00905001, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7044159620819822, + "language_loss": 0.52503145, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54511756, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 3.1924548149108887 + }, + { + "auxiliary_loss_clip": 0.01063186, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.03293347, + "balance_loss_mlp": 1.01787174, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 1.8207317119678725, + "language_loss": 0.71894538, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73987478, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.62511944770813 + }, + { + "auxiliary_loss_clip": 0.01072416, + "auxiliary_loss_mlp": 0.01027318, + "balance_loss_clip": 1.03022146, + "balance_loss_mlp": 1.01614177, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.7080831231413691, + "language_loss": 0.79295385, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81395113, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.6296770572662354 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.03515983, + "balance_loss_mlp": 1.02484584, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.7804047750518477, + "language_loss": 0.70070446, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72212857, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 4.069922208786011 + }, + { + "auxiliary_loss_clip": 0.01016313, + "auxiliary_loss_mlp": 0.01002106, + "balance_loss_clip": 1.00491023, + "balance_loss_mlp": 1.00083685, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8372082946251685, + "language_loss": 0.59677565, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61695987, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.109086751937866 + }, + { + "auxiliary_loss_clip": 0.01061665, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03587627, + "balance_loss_mlp": 1.02015638, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 1.9165389136100885, + "language_loss": 0.77811652, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79906738, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 2.804810047149658 + }, + { + "auxiliary_loss_clip": 0.01094319, + "auxiliary_loss_mlp": 0.01030907, + "balance_loss_clip": 1.03380537, + "balance_loss_mlp": 1.019135, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.766896314102942, + "language_loss": 0.68885642, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.71010864, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.6129913330078125 + }, + { + "auxiliary_loss_clip": 0.01058889, + "auxiliary_loss_mlp": 0.01034592, + "balance_loss_clip": 1.0341692, + "balance_loss_mlp": 1.02275991, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.532727968137704, + "language_loss": 0.76880759, + "learning_rate": 1.851556998731498e-06, + "loss": 0.78974241, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.731816291809082 + }, + { + "auxiliary_loss_clip": 0.01092095, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.03426945, + "balance_loss_mlp": 1.01958025, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.9202398720743976, + "language_loss": 0.60233331, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62356389, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.677670955657959 + }, + { + "auxiliary_loss_clip": 0.01058913, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.03183246, + "balance_loss_mlp": 1.01803827, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.7319475064290621, + "language_loss": 0.79517281, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81605995, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.668496608734131 + }, + { + "auxiliary_loss_clip": 0.01054729, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.02851903, + "balance_loss_mlp": 1.02333224, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.8756512763187287, + "language_loss": 0.78136337, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80227792, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.6883487701416016 + }, + { + "auxiliary_loss_clip": 0.01088039, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.04066312, + "balance_loss_mlp": 1.01839173, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.8657704173295802, + "language_loss": 0.72446609, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74563831, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.750758409500122 + }, + { + "auxiliary_loss_clip": 0.0110118, + "auxiliary_loss_mlp": 0.00749469, + "balance_loss_clip": 1.03263474, + "balance_loss_mlp": 1.00023651, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.7985335070771242, + "language_loss": 0.75056738, + "learning_rate": 1.849615132097085e-06, + "loss": 0.76907384, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.5174567699432373 + }, + { + "auxiliary_loss_clip": 0.01080236, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.03558111, + "balance_loss_mlp": 1.01628566, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.3727953027230198, + "language_loss": 0.79857773, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81965977, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.641220808029175 + }, + { + "auxiliary_loss_clip": 0.01066092, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.03463221, + "balance_loss_mlp": 1.02128899, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 1.8755612248613178, + "language_loss": 0.80499691, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82600832, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.637507438659668 + }, + { + "auxiliary_loss_clip": 0.01105502, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.03702199, + "balance_loss_mlp": 1.01619411, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.2924637098386507, + "language_loss": 0.76229119, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78363252, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 2.5642242431640625 + }, + { + "auxiliary_loss_clip": 0.0108108, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.03498614, + "balance_loss_mlp": 1.02393866, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.7193311497011468, + "language_loss": 0.78658569, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80775803, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 4.195205211639404 + }, + { + "auxiliary_loss_clip": 0.0099923, + "auxiliary_loss_mlp": 0.01000319, + "balance_loss_clip": 1.00675964, + "balance_loss_mlp": 0.99913293, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8662710726687964, + "language_loss": 0.63445568, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65445125, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 4.681714296340942 + }, + { + "auxiliary_loss_clip": 0.00986542, + "auxiliary_loss_mlp": 0.00998452, + "balance_loss_clip": 1.00640905, + "balance_loss_mlp": 0.99714071, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7033219140184817, + "language_loss": 0.51645708, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53630698, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 3.2400755882263184 + }, + { + "auxiliary_loss_clip": 0.01098504, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.03892612, + "balance_loss_mlp": 1.0174998, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 3.1948754898791267, + "language_loss": 0.76771808, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.78900337, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 2.61588454246521 + }, + { + "auxiliary_loss_clip": 0.01058646, + "auxiliary_loss_mlp": 0.01028334, + "balance_loss_clip": 1.03383636, + "balance_loss_mlp": 1.01620436, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.0766322805168738, + "language_loss": 0.83716041, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.8580302, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 2.6381168365478516 + }, + { + "auxiliary_loss_clip": 0.01094078, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.03729081, + "balance_loss_mlp": 1.02129996, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.5700391624022747, + "language_loss": 0.78447247, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80574089, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 2.664311408996582 + }, + { + "auxiliary_loss_clip": 0.01070174, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.03335083, + "balance_loss_mlp": 1.02134705, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 3.4292171531326523, + "language_loss": 0.84487665, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86591089, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 2.683746814727783 + }, + { + "auxiliary_loss_clip": 0.01007963, + "auxiliary_loss_mlp": 0.01006605, + "balance_loss_clip": 1.00957346, + "balance_loss_mlp": 1.00549054, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7407201856490866, + "language_loss": 0.54210746, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56225312, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 3.0706944465637207 + }, + { + "auxiliary_loss_clip": 0.01007126, + "auxiliary_loss_mlp": 0.0100603, + "balance_loss_clip": 1.00580621, + "balance_loss_mlp": 1.00468862, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8030310655282858, + "language_loss": 0.63422608, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65435767, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 3.191749095916748 + }, + { + "auxiliary_loss_clip": 0.01042654, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.03134346, + "balance_loss_mlp": 1.02212632, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.4706076031874615, + "language_loss": 0.69815648, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.71892786, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 2.794844388961792 + }, + { + "auxiliary_loss_clip": 0.01077616, + "auxiliary_loss_mlp": 0.00749395, + "balance_loss_clip": 1.03532243, + "balance_loss_mlp": 1.00022388, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.318702707248915, + "language_loss": 0.82336307, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.8416332, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 2.5739009380340576 + }, + { + "auxiliary_loss_clip": 0.01104599, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.03719008, + "balance_loss_mlp": 1.01972723, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 2.355588858210406, + "language_loss": 0.72446597, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74582911, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 4.066432476043701 + }, + { + "auxiliary_loss_clip": 0.01080026, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.03218555, + "balance_loss_mlp": 1.02047455, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.5895512685809916, + "language_loss": 0.81908911, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.84020674, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.659506320953369 + }, + { + "auxiliary_loss_clip": 0.01071848, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.03350723, + "balance_loss_mlp": 1.01902592, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.4643308845637588, + "language_loss": 0.74020922, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.7612406, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 2.7321360111236572 + }, + { + "auxiliary_loss_clip": 0.01065196, + "auxiliary_loss_mlp": 0.00749531, + "balance_loss_clip": 1.02927065, + "balance_loss_mlp": 1.00023913, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 4.1962694358883965, + "language_loss": 0.82272768, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84087491, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.6966826915740967 + }, + { + "auxiliary_loss_clip": 0.01079686, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.03486764, + "balance_loss_mlp": 1.02045536, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.5187284740570512, + "language_loss": 0.75380749, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77492326, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 2.6632463932037354 + }, + { + "auxiliary_loss_clip": 0.01016975, + "auxiliary_loss_mlp": 0.01009176, + "balance_loss_clip": 1.00576031, + "balance_loss_mlp": 1.00757885, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8821707241550908, + "language_loss": 0.60308087, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62334239, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 3.1356325149536133 + }, + { + "auxiliary_loss_clip": 0.01091306, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.0330615, + "balance_loss_mlp": 1.02791309, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.6617713852655678, + "language_loss": 0.78899771, + "learning_rate": 1.841460870485045e-06, + "loss": 0.81031418, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 2.6105940341949463 + }, + { + "auxiliary_loss_clip": 0.0109969, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.03617096, + "balance_loss_mlp": 1.02322423, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 1.9533664922539853, + "language_loss": 0.73849249, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.75985426, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 2.6165144443511963 + }, + { + "auxiliary_loss_clip": 0.01026019, + "auxiliary_loss_mlp": 0.01001005, + "balance_loss_clip": 1.00478363, + "balance_loss_mlp": 0.99972975, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.725817118449633, + "language_loss": 0.51132321, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53159344, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.0652689933776855 + }, + { + "auxiliary_loss_clip": 0.01088943, + "auxiliary_loss_mlp": 0.0103867, + "balance_loss_clip": 1.03376961, + "balance_loss_mlp": 1.02653456, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.5986827208165224, + "language_loss": 0.7225368, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74381292, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.648256301879883 + }, + { + "auxiliary_loss_clip": 0.01087392, + "auxiliary_loss_mlp": 0.00749319, + "balance_loss_clip": 1.03280318, + "balance_loss_mlp": 1.00022113, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 2.1065106429720895, + "language_loss": 0.69822812, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71659517, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.5630929470062256 + }, + { + "auxiliary_loss_clip": 0.01035666, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.03451562, + "balance_loss_mlp": 1.0182476, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.585100102668884, + "language_loss": 0.7230345, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74370003, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 2.7581799030303955 + }, + { + "auxiliary_loss_clip": 0.01068049, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.0362072, + "balance_loss_mlp": 1.01764607, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 1.802620874872586, + "language_loss": 0.74398333, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76497567, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.6449387073516846 + }, + { + "auxiliary_loss_clip": 0.01049119, + "auxiliary_loss_mlp": 0.01049676, + "balance_loss_clip": 1.03284049, + "balance_loss_mlp": 1.0357703, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 2.126541452063155, + "language_loss": 0.76868773, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.78967565, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.7625370025634766 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.03425658, + "balance_loss_mlp": 1.02133584, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.83572968418686, + "language_loss": 0.81727737, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.83863217, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.6295547485351562 + }, + { + "auxiliary_loss_clip": 0.01092719, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.03247166, + "balance_loss_mlp": 1.02020395, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 2.1516183682175525, + "language_loss": 0.66453117, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68578899, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.5656747817993164 + }, + { + "auxiliary_loss_clip": 0.01071938, + "auxiliary_loss_mlp": 0.00749454, + "balance_loss_clip": 1.03644896, + "balance_loss_mlp": 1.00030506, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.9691813611306703, + "language_loss": 0.82677746, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84499139, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 4.297024726867676 + }, + { + "auxiliary_loss_clip": 0.01055834, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.03158712, + "balance_loss_mlp": 1.02519917, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.975941997316383, + "language_loss": 0.70686817, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72780859, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 2.70015025138855 + }, + { + "auxiliary_loss_clip": 0.0110918, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.03799248, + "balance_loss_mlp": 1.0186789, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.919680460670206, + "language_loss": 0.80318248, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82459235, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.5235800743103027 + }, + { + "auxiliary_loss_clip": 0.01057975, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.03100038, + "balance_loss_mlp": 1.01822543, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.448036852512145, + "language_loss": 0.79108644, + "learning_rate": 1.83641431418363e-06, + "loss": 0.81197727, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.6573967933654785 + }, + { + "auxiliary_loss_clip": 0.01084105, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.0313859, + "balance_loss_mlp": 1.02295566, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 3.2222476285003534, + "language_loss": 0.7660203, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.78722054, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.6109836101531982 + }, + { + "auxiliary_loss_clip": 0.0107782, + "auxiliary_loss_mlp": 0.01030704, + "balance_loss_clip": 1.03389812, + "balance_loss_mlp": 1.01848459, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 2.141167749788053, + "language_loss": 0.71580577, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73689097, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.617382049560547 + }, + { + "auxiliary_loss_clip": 0.01055834, + "auxiliary_loss_mlp": 0.01037512, + "balance_loss_clip": 1.03213954, + "balance_loss_mlp": 1.02509642, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.5940149128976926, + "language_loss": 0.67711473, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69804811, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.74118709564209 + }, + { + "auxiliary_loss_clip": 0.01091866, + "auxiliary_loss_mlp": 0.01039174, + "balance_loss_clip": 1.03367341, + "balance_loss_mlp": 1.02644813, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.550927325620733, + "language_loss": 0.77755958, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79886997, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.55843186378479 + }, + { + "auxiliary_loss_clip": 0.01089403, + "auxiliary_loss_mlp": 0.01028419, + "balance_loss_clip": 1.03330243, + "balance_loss_mlp": 1.01716518, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.656677389959631, + "language_loss": 0.69343752, + "learning_rate": 1.834473608367745e-06, + "loss": 0.7146157, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.5572774410247803 + }, + { + "auxiliary_loss_clip": 0.01038136, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.02765405, + "balance_loss_mlp": 1.02000141, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.7433030299196957, + "language_loss": 0.76264322, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78335893, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 2.6986732482910156 + }, + { + "auxiliary_loss_clip": 0.01074879, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.03157592, + "balance_loss_mlp": 1.0229454, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.367563082947785, + "language_loss": 0.76228309, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78339654, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.5953779220581055 + }, + { + "auxiliary_loss_clip": 0.01087381, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.03138041, + "balance_loss_mlp": 1.01934695, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.9409565942418747, + "language_loss": 0.70261812, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72380459, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 2.6142849922180176 + }, + { + "auxiliary_loss_clip": 0.01086967, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.0331111, + "balance_loss_mlp": 1.01687789, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.943890850744643, + "language_loss": 0.75499058, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77615547, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 2.6056785583496094 + }, + { + "auxiliary_loss_clip": 0.01090032, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.03473282, + "balance_loss_mlp": 1.02226353, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.8973322548780782, + "language_loss": 0.73552358, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75675607, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 4.107463836669922 + }, + { + "auxiliary_loss_clip": 0.01055211, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.03116834, + "balance_loss_mlp": 1.02268052, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.8453639245536522, + "language_loss": 0.73242044, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75331527, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 4.242637872695923 + }, + { + "auxiliary_loss_clip": 0.01102181, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.03469849, + "balance_loss_mlp": 1.01941061, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.006144218787103, + "language_loss": 0.71305126, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.73438787, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 2.5731754302978516 + }, + { + "auxiliary_loss_clip": 0.01067891, + "auxiliary_loss_mlp": 0.01040479, + "balance_loss_clip": 1.03216684, + "balance_loss_mlp": 1.02840316, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 2.123046180558943, + "language_loss": 0.70417088, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.7252546, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 2.8958404064178467 + }, + { + "auxiliary_loss_clip": 0.01077117, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.03381515, + "balance_loss_mlp": 1.02016926, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.0048522131837263, + "language_loss": 0.79710311, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.81820017, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 2.6725480556488037 + }, + { + "auxiliary_loss_clip": 0.01049648, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.03051877, + "balance_loss_mlp": 1.01889968, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.546197188984193, + "language_loss": 0.7294029, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.7502172, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 2.701547145843506 + }, + { + "auxiliary_loss_clip": 0.01066252, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.02988505, + "balance_loss_mlp": 1.02024162, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.4262013591320675, + "language_loss": 0.85072315, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87171876, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 2.589369058609009 + }, + { + "auxiliary_loss_clip": 0.01051633, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.03163826, + "balance_loss_mlp": 1.01975083, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.7684423871173702, + "language_loss": 0.78085375, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80167729, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.6812493801116943 + }, + { + "auxiliary_loss_clip": 0.01089082, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.03370905, + "balance_loss_mlp": 1.01838267, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 1.9489116864874612, + "language_loss": 0.69921845, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.72041965, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 2.59407639503479 + }, + { + "auxiliary_loss_clip": 0.01018554, + "auxiliary_loss_mlp": 0.0100049, + "balance_loss_clip": 1.00747967, + "balance_loss_mlp": 0.99920237, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9764649327784968, + "language_loss": 0.5913502, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61154062, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 3.245013952255249 + }, + { + "auxiliary_loss_clip": 0.01106004, + "auxiliary_loss_mlp": 0.00749545, + "balance_loss_clip": 1.0369966, + "balance_loss_mlp": 1.00025368, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 2.3433856519033553, + "language_loss": 0.78449398, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.80304945, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 2.571948766708374 + }, + { + "auxiliary_loss_clip": 0.01082112, + "auxiliary_loss_mlp": 0.01035267, + "balance_loss_clip": 1.0354073, + "balance_loss_mlp": 1.02423453, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.725128740958138, + "language_loss": 0.82997608, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85114992, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 4.178510904312134 + }, + { + "auxiliary_loss_clip": 0.01091191, + "auxiliary_loss_mlp": 0.01033989, + "balance_loss_clip": 1.03626561, + "balance_loss_mlp": 1.02141237, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 1.8513453564189213, + "language_loss": 0.67203975, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69329154, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 2.655566930770874 + }, + { + "auxiliary_loss_clip": 0.01108958, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.03691745, + "balance_loss_mlp": 1.0180043, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.1373231092147704, + "language_loss": 0.73823458, + "learning_rate": 1.827488379924234e-06, + "loss": 0.75963628, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 2.5242178440093994 + }, + { + "auxiliary_loss_clip": 0.01057317, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.03571677, + "balance_loss_mlp": 1.02257872, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.449099713872072, + "language_loss": 0.87605643, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89698482, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 2.6518869400024414 + }, + { + "auxiliary_loss_clip": 0.01103482, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.03567696, + "balance_loss_mlp": 1.02592528, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 2.4135366112436385, + "language_loss": 0.65307522, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67449015, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 2.641303300857544 + }, + { + "auxiliary_loss_clip": 0.0109398, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.02711058, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.303391462778753, + "language_loss": 0.78683102, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.80815923, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.6810715198516846 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.03460371, + "balance_loss_mlp": 1.02058232, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.8500407803398484, + "language_loss": 0.74391025, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76527315, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.561030864715576 + }, + { + "auxiliary_loss_clip": 0.01066917, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.03326964, + "balance_loss_mlp": 1.0187099, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.016558337864608, + "language_loss": 0.72261333, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74359494, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.686307668685913 + }, + { + "auxiliary_loss_clip": 0.01082284, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.03494215, + "balance_loss_mlp": 1.02174735, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 2.0008360264031797, + "language_loss": 0.80603063, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82719129, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.5485918521881104 + }, + { + "auxiliary_loss_clip": 0.01099515, + "auxiliary_loss_mlp": 0.01037262, + "balance_loss_clip": 1.03781021, + "balance_loss_mlp": 1.02458966, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 3.7761379759870777, + "language_loss": 0.81800842, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83937621, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 2.4928746223449707 + }, + { + "auxiliary_loss_clip": 0.01104598, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.03646445, + "balance_loss_mlp": 1.019485, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 2.2048883330394697, + "language_loss": 0.81590664, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83726174, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.421588659286499 + }, + { + "auxiliary_loss_clip": 0.01101345, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.03479624, + "balance_loss_mlp": 1.02118349, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.5584085132544279, + "language_loss": 0.77429771, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.7956419, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.439807415008545 + }, + { + "auxiliary_loss_clip": 0.0110406, + "auxiliary_loss_mlp": 0.01039367, + "balance_loss_clip": 1.0328449, + "balance_loss_mlp": 1.0263555, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 2.4292986514954573, + "language_loss": 0.66516066, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68659496, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 2.8393681049346924 + }, + { + "auxiliary_loss_clip": 0.01082075, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.03195941, + "balance_loss_mlp": 1.02084374, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.5181626496542031, + "language_loss": 0.69932401, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72047162, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.682131052017212 + }, + { + "auxiliary_loss_clip": 0.01063475, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.03058505, + "balance_loss_mlp": 1.02342856, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.487698090598217, + "language_loss": 0.80590576, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82689101, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.6800546646118164 + }, + { + "auxiliary_loss_clip": 0.01056089, + "auxiliary_loss_mlp": 0.01034898, + "balance_loss_clip": 1.03438592, + "balance_loss_mlp": 1.02294731, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.562598031542047, + "language_loss": 0.78879797, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80970776, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 2.64823055267334 + }, + { + "auxiliary_loss_clip": 0.01067275, + "auxiliary_loss_mlp": 0.00749433, + "balance_loss_clip": 1.03095949, + "balance_loss_mlp": 1.00027609, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 2.622516234734277, + "language_loss": 0.82169259, + "learning_rate": 1.822056885403915e-06, + "loss": 0.83985966, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 4.204733371734619 + }, + { + "auxiliary_loss_clip": 0.01088487, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.03435755, + "balance_loss_mlp": 1.01699257, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.537880503432786, + "language_loss": 0.7140159, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73518682, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.517026662826538 + }, + { + "auxiliary_loss_clip": 0.01092093, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.03306532, + "balance_loss_mlp": 1.0221802, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 10.274736103688683, + "language_loss": 0.65219915, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67345929, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.5901851654052734 + }, + { + "auxiliary_loss_clip": 0.01078622, + "auxiliary_loss_mlp": 0.0074921, + "balance_loss_clip": 1.03914523, + "balance_loss_mlp": 1.00023425, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 1.8294105863918333, + "language_loss": 0.74070722, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.75898558, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.6938693523406982 + }, + { + "auxiliary_loss_clip": 0.01076219, + "auxiliary_loss_mlp": 0.01037986, + "balance_loss_clip": 1.03026056, + "balance_loss_mlp": 1.02477813, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.8782316290006478, + "language_loss": 0.78614771, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80728972, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.6348280906677246 + }, + { + "auxiliary_loss_clip": 0.00996571, + "auxiliary_loss_mlp": 0.01016557, + "balance_loss_clip": 1.00684452, + "balance_loss_mlp": 1.01543033, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7523090336182688, + "language_loss": 0.56523794, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58536923, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.24957275390625 + }, + { + "auxiliary_loss_clip": 0.01060523, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.03498459, + "balance_loss_mlp": 1.01476288, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 1.9210469199906273, + "language_loss": 0.77812862, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.79901087, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.6756134033203125 + }, + { + "auxiliary_loss_clip": 0.01055475, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.03252769, + "balance_loss_mlp": 1.01379752, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.946598815139114, + "language_loss": 0.82963711, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85046095, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 2.715080738067627 + }, + { + "auxiliary_loss_clip": 0.01101279, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.03530002, + "balance_loss_mlp": 1.01972675, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 2.226697928958198, + "language_loss": 0.74663615, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.7679612, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.546355962753296 + }, + { + "auxiliary_loss_clip": 0.01080047, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.03129339, + "balance_loss_mlp": 1.0189954, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 2.4605786372141245, + "language_loss": 0.85081369, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87192243, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 2.6169395446777344 + }, + { + "auxiliary_loss_clip": 0.01085725, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.03518009, + "balance_loss_mlp": 1.02208877, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.7469717736211445, + "language_loss": 0.74249673, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.7636981, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 2.5754337310791016 + }, + { + "auxiliary_loss_clip": 0.01066747, + "auxiliary_loss_mlp": 0.01036035, + "balance_loss_clip": 1.03351736, + "balance_loss_mlp": 1.02320814, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.6536424941651569, + "language_loss": 0.75628519, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77731299, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 2.7419626712799072 + }, + { + "auxiliary_loss_clip": 0.01062784, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.03464711, + "balance_loss_mlp": 1.01775479, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.6127458233865521, + "language_loss": 0.84000289, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86091602, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 2.6146132946014404 + }, + { + "auxiliary_loss_clip": 0.00996382, + "auxiliary_loss_mlp": 0.01000277, + "balance_loss_clip": 1.00557923, + "balance_loss_mlp": 0.99901313, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7160215129181174, + "language_loss": 0.55846226, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57842886, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 3.163550615310669 + }, + { + "auxiliary_loss_clip": 0.01048626, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.03285599, + "balance_loss_mlp": 1.01963782, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.7013665702109533, + "language_loss": 0.75260139, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77339828, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 4.161325216293335 + }, + { + "auxiliary_loss_clip": 0.01073354, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.03366876, + "balance_loss_mlp": 1.01834619, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.7986713434166481, + "language_loss": 0.66459483, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.6856271, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 4.1675591468811035 + }, + { + "auxiliary_loss_clip": 0.01086145, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.03080535, + "balance_loss_mlp": 1.0193336, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.8249755512435009, + "language_loss": 0.77981126, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80097413, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 2.5880074501037598 + }, + { + "auxiliary_loss_clip": 0.01054781, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.02274871, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.654779136649486, + "language_loss": 0.76703793, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78793275, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 2.7177469730377197 + }, + { + "auxiliary_loss_clip": 0.01007327, + "auxiliary_loss_mlp": 0.00998723, + "balance_loss_clip": 1.00596189, + "balance_loss_mlp": 0.99743575, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6704440769344903, + "language_loss": 0.52424586, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54430628, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.1382670402526855 + }, + { + "auxiliary_loss_clip": 0.01077555, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.03372192, + "balance_loss_mlp": 1.02121401, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.6353262036602028, + "language_loss": 0.76123214, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78233314, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 2.7648937702178955 + }, + { + "auxiliary_loss_clip": 0.01065753, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.03241134, + "balance_loss_mlp": 1.01753092, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.9563276720446907, + "language_loss": 0.67668664, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69762671, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.6756465435028076 + }, + { + "auxiliary_loss_clip": 0.0105754, + "auxiliary_loss_mlp": 0.01027488, + "balance_loss_clip": 1.02922082, + "balance_loss_mlp": 1.01627612, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.5320588479902033, + "language_loss": 0.84071362, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86156392, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 2.6901049613952637 + }, + { + "auxiliary_loss_clip": 0.01105821, + "auxiliary_loss_mlp": 0.0103127, + "balance_loss_clip": 1.03572464, + "balance_loss_mlp": 1.01899099, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 1.6041962715375044, + "language_loss": 0.61836785, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.63973874, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.5678040981292725 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.03555739, + "balance_loss_mlp": 1.0189178, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 2.140678530797868, + "language_loss": 0.70065463, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72199059, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.570661783218384 + }, + { + "auxiliary_loss_clip": 0.01098544, + "auxiliary_loss_mlp": 0.01026198, + "balance_loss_clip": 1.03344977, + "balance_loss_mlp": 1.01518846, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.5686577764395009, + "language_loss": 0.77086723, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79211462, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.5710179805755615 + }, + { + "auxiliary_loss_clip": 0.01073284, + "auxiliary_loss_mlp": 0.01038416, + "balance_loss_clip": 1.0326488, + "balance_loss_mlp": 1.02526784, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.644808460394388, + "language_loss": 0.72498167, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74609864, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 4.070141077041626 + }, + { + "auxiliary_loss_clip": 0.01045883, + "auxiliary_loss_mlp": 0.0104074, + "balance_loss_clip": 1.02888691, + "balance_loss_mlp": 1.02619696, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.215545291463462, + "language_loss": 0.9316414, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95250773, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 2.6713058948516846 + }, + { + "auxiliary_loss_clip": 0.01087763, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.03179252, + "balance_loss_mlp": 1.02059782, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.741597218584075, + "language_loss": 0.74029422, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76148045, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.578733444213867 + }, + { + "auxiliary_loss_clip": 0.0109344, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.03607547, + "balance_loss_mlp": 1.01521099, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 2.0223822611480604, + "language_loss": 0.67426509, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69546366, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 2.6348819732666016 + }, + { + "auxiliary_loss_clip": 0.0106211, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.03082752, + "balance_loss_mlp": 1.02238274, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.6278843810194197, + "language_loss": 0.67438525, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69534242, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.7441298961639404 + }, + { + "auxiliary_loss_clip": 0.01104035, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.03541756, + "balance_loss_mlp": 1.01911998, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.8345214200273388, + "language_loss": 0.92859995, + "learning_rate": 1.810422473773436e-06, + "loss": 0.94994867, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.5010452270507812 + }, + { + "auxiliary_loss_clip": 0.01074959, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.03339875, + "balance_loss_mlp": 1.02446222, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.0292211285312876, + "language_loss": 0.83842218, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85953236, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.533012628555298 + }, + { + "auxiliary_loss_clip": 0.01068705, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.03251243, + "balance_loss_mlp": 1.02076399, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.0101843780537823, + "language_loss": 0.68563783, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70665205, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 2.5765457153320312 + }, + { + "auxiliary_loss_clip": 0.00987443, + "auxiliary_loss_mlp": 0.01002908, + "balance_loss_clip": 1.00580049, + "balance_loss_mlp": 1.00179374, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7378909712873929, + "language_loss": 0.57673216, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59663564, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.173894166946411 + }, + { + "auxiliary_loss_clip": 0.01069169, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.0346235, + "balance_loss_mlp": 1.02255726, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.727765583035859, + "language_loss": 0.69495296, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71599436, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 2.586132049560547 + }, + { + "auxiliary_loss_clip": 0.01090793, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.03510714, + "balance_loss_mlp": 1.02188516, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.044173436955047, + "language_loss": 0.75095081, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77218801, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.6200172901153564 + }, + { + "auxiliary_loss_clip": 0.00997248, + "auxiliary_loss_mlp": 0.01020097, + "balance_loss_clip": 1.00919676, + "balance_loss_mlp": 1.01890457, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.9175303619173464, + "language_loss": 0.62617588, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64634931, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.233508348464966 + }, + { + "auxiliary_loss_clip": 0.01088459, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.03330135, + "balance_loss_mlp": 1.01954222, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.8926690565976774, + "language_loss": 0.79336667, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81456214, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.540053606033325 + }, + { + "auxiliary_loss_clip": 0.0109276, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.03442025, + "balance_loss_mlp": 1.01898146, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.728109188848299, + "language_loss": 0.795632, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81686926, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.645824432373047 + }, + { + "auxiliary_loss_clip": 0.01090475, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.03469014, + "balance_loss_mlp": 1.01763535, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.7129142575439924, + "language_loss": 0.87147486, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89267302, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 2.5297656059265137 + }, + { + "auxiliary_loss_clip": 0.01070946, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.03113937, + "balance_loss_mlp": 1.02190614, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.9865903837689782, + "language_loss": 0.82114524, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84221393, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 4.0736401081085205 + }, + { + "auxiliary_loss_clip": 0.01102179, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.03449821, + "balance_loss_mlp": 1.01878262, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.6504198117074473, + "language_loss": 0.63394469, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65528059, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.4564170837402344 + }, + { + "auxiliary_loss_clip": 0.01105312, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.03627634, + "balance_loss_mlp": 1.02262664, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.6656776692107065, + "language_loss": 0.79664189, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81804764, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 2.542663812637329 + }, + { + "auxiliary_loss_clip": 0.01065244, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.03319693, + "balance_loss_mlp": 1.01919818, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 1.8388251318028404, + "language_loss": 0.78148127, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80243373, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.5969152450561523 + }, + { + "auxiliary_loss_clip": 0.01087679, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.03271914, + "balance_loss_mlp": 1.01897144, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.6348734008551622, + "language_loss": 0.7562803, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.77747285, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.5956945419311523 + }, + { + "auxiliary_loss_clip": 0.0107093, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.03539944, + "balance_loss_mlp": 1.02065289, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 1.9042510879124719, + "language_loss": 0.62906021, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.6501177, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.715057373046875 + }, + { + "auxiliary_loss_clip": 0.01062501, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.03680825, + "balance_loss_mlp": 1.02655256, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.5280209761721173, + "language_loss": 0.72152227, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74252868, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 2.660900831222534 + }, + { + "auxiliary_loss_clip": 0.01100539, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.03655958, + "balance_loss_mlp": 1.01588297, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.8525737173948937, + "language_loss": 0.73978686, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.7610563, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 2.4614381790161133 + }, + { + "auxiliary_loss_clip": 0.01087012, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.0336988, + "balance_loss_mlp": 1.01736033, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.919697685913893, + "language_loss": 0.60839039, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62955135, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 2.5784146785736084 + }, + { + "auxiliary_loss_clip": 0.01028967, + "auxiliary_loss_mlp": 0.00999962, + "balance_loss_clip": 1.00789118, + "balance_loss_mlp": 0.998716, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.7014245223301692, + "language_loss": 0.57132798, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59161729, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.188906669616699 + }, + { + "auxiliary_loss_clip": 0.01069478, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.03149438, + "balance_loss_mlp": 1.01983154, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.7156675870628668, + "language_loss": 0.6988529, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.71987474, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 2.5949511528015137 + }, + { + "auxiliary_loss_clip": 0.0107547, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.03179872, + "balance_loss_mlp": 1.02123225, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 2.3037886678258626, + "language_loss": 0.71993405, + "learning_rate": 1.802282211606627e-06, + "loss": 0.74100959, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 2.5976767539978027 + }, + { + "auxiliary_loss_clip": 0.01088842, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.03346825, + "balance_loss_mlp": 1.02449477, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 2.4249607223982417, + "language_loss": 0.68456846, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70581686, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 2.5827276706695557 + }, + { + "auxiliary_loss_clip": 0.01089443, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.03476906, + "balance_loss_mlp": 1.02017212, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.7381194206134727, + "language_loss": 0.80542886, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.82662994, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 2.581780195236206 + }, + { + "auxiliary_loss_clip": 0.01090303, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03287339, + "balance_loss_mlp": 1.01959062, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 4.480854357040741, + "language_loss": 0.80192804, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82313669, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 4.032148361206055 + }, + { + "auxiliary_loss_clip": 0.01084767, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.01784122, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 2.6543316318347383, + "language_loss": 0.67947572, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.70061409, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 4.127180337905884 + }, + { + "auxiliary_loss_clip": 0.01094414, + "auxiliary_loss_mlp": 0.01034255, + "balance_loss_clip": 1.03480089, + "balance_loss_mlp": 1.02220297, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 5.303836715987075, + "language_loss": 0.81087446, + "learning_rate": 1.800344536188764e-06, + "loss": 0.83216119, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 2.5659730434417725 + }, + { + "auxiliary_loss_clip": 0.011066, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.03525639, + "balance_loss_mlp": 1.02112567, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 5.763719932600256, + "language_loss": 0.75760651, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77901351, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 2.5393450260162354 + }, + { + "auxiliary_loss_clip": 0.0106208, + "auxiliary_loss_mlp": 0.01036842, + "balance_loss_clip": 1.0312469, + "balance_loss_mlp": 1.02419376, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 1.9759752920926899, + "language_loss": 0.83061361, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85160285, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 2.6242434978485107 + }, + { + "auxiliary_loss_clip": 0.01105936, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.03595209, + "balance_loss_mlp": 1.01816797, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.6268600904235442, + "language_loss": 0.6991536, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72051442, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.472520351409912 + }, + { + "auxiliary_loss_clip": 0.01097298, + "auxiliary_loss_mlp": 0.0102437, + "balance_loss_clip": 1.03241301, + "balance_loss_mlp": 1.0131284, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 2.1281584069217314, + "language_loss": 0.66441619, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68563294, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 2.5873947143554688 + }, + { + "auxiliary_loss_clip": 0.01075722, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.03414619, + "balance_loss_mlp": 1.01685667, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.5050552525045182, + "language_loss": 0.78770071, + "learning_rate": 1.798407050044766e-06, + "loss": 0.80873775, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.6045374870300293 + }, + { + "auxiliary_loss_clip": 0.01092154, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.03405201, + "balance_loss_mlp": 1.02035761, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 1.7551388227771703, + "language_loss": 0.75447226, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.7757138, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.5567026138305664 + }, + { + "auxiliary_loss_clip": 0.01076998, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.03127313, + "balance_loss_mlp": 1.01933885, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 2.4436102158711535, + "language_loss": 0.7466166, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76770073, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.5951945781707764 + }, + { + "auxiliary_loss_clip": 0.01088339, + "auxiliary_loss_mlp": 0.01028157, + "balance_loss_clip": 1.03435063, + "balance_loss_mlp": 1.01624227, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.6559690208525906, + "language_loss": 0.76849812, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.78966308, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.609922409057617 + }, + { + "auxiliary_loss_clip": 0.0109511, + "auxiliary_loss_mlp": 0.01039434, + "balance_loss_clip": 1.03562272, + "balance_loss_mlp": 1.02645183, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.9174638010084062, + "language_loss": 0.77747494, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79882038, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 4.146601915359497 + }, + { + "auxiliary_loss_clip": 0.00977692, + "auxiliary_loss_mlp": 0.01000317, + "balance_loss_clip": 1.01480699, + "balance_loss_mlp": 0.99895209, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7281753197100506, + "language_loss": 0.57739216, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.5971722, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.5337467193603516 + }, + { + "auxiliary_loss_clip": 0.01065209, + "auxiliary_loss_mlp": 0.01035489, + "balance_loss_clip": 1.03256631, + "balance_loss_mlp": 1.02366376, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.683923495106028, + "language_loss": 0.76721263, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78821957, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 3.1443023681640625 + }, + { + "auxiliary_loss_clip": 0.010837, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.03099751, + "balance_loss_mlp": 1.02151513, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 4.497767847642081, + "language_loss": 0.73464882, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75583786, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.5313920974731445 + }, + { + "auxiliary_loss_clip": 0.01084805, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.03607297, + "balance_loss_mlp": 1.02475166, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.7597274530754017, + "language_loss": 0.77899873, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80021369, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 2.6189966201782227 + }, + { + "auxiliary_loss_clip": 0.01105343, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.03583241, + "balance_loss_mlp": 1.02019536, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.6655000139068523, + "language_loss": 0.74927849, + "learning_rate": 1.794920057818476e-06, + "loss": 0.77065802, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.5020229816436768 + }, + { + "auxiliary_loss_clip": 0.01093064, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.03319001, + "balance_loss_mlp": 1.02264071, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 2.8449459430974713, + "language_loss": 0.68903255, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71032691, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.517841339111328 + }, + { + "auxiliary_loss_clip": 0.01082094, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.03541517, + "balance_loss_mlp": 1.02275038, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 2.7484319401716926, + "language_loss": 0.66912878, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.69029152, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.58620548248291 + }, + { + "auxiliary_loss_clip": 0.01068963, + "auxiliary_loss_mlp": 0.01034551, + "balance_loss_clip": 1.03490877, + "balance_loss_mlp": 1.0231967, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.3693466209014207, + "language_loss": 0.66517234, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.68620747, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.6718313694000244 + }, + { + "auxiliary_loss_clip": 0.01000856, + "auxiliary_loss_mlp": 0.01003474, + "balance_loss_clip": 1.00948358, + "balance_loss_mlp": 1.0022999, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7481308560867781, + "language_loss": 0.57540315, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59544647, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.2656381130218506 + }, + { + "auxiliary_loss_clip": 0.01023353, + "auxiliary_loss_mlp": 0.01012886, + "balance_loss_clip": 1.01484752, + "balance_loss_mlp": 1.01179504, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9163957838403978, + "language_loss": 0.64869559, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66905797, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.0619630813598633 + }, + { + "auxiliary_loss_clip": 0.01095325, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.0355978, + "balance_loss_mlp": 1.02235579, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.5762482829810296, + "language_loss": 0.73091853, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75221908, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.5541460514068604 + }, + { + "auxiliary_loss_clip": 0.01079444, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.03407574, + "balance_loss_mlp": 1.02267456, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.9028688685051403, + "language_loss": 0.72344238, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74457097, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.6842310428619385 + }, + { + "auxiliary_loss_clip": 0.01088275, + "auxiliary_loss_mlp": 0.00749312, + "balance_loss_clip": 1.03441429, + "balance_loss_mlp": 1.00018537, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.4648062852690074, + "language_loss": 0.67651021, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69488609, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.726151466369629 + }, + { + "auxiliary_loss_clip": 0.01103345, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.0351516, + "balance_loss_mlp": 1.01900983, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 4.463662205928115, + "language_loss": 0.77943993, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80078065, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 2.5259547233581543 + }, + { + "auxiliary_loss_clip": 0.01066233, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.03363228, + "balance_loss_mlp": 1.02451253, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.4692575423520482, + "language_loss": 0.72516191, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74619013, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.660050392150879 + }, + { + "auxiliary_loss_clip": 0.01067757, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.03321123, + "balance_loss_mlp": 1.01604152, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.1790688830302984, + "language_loss": 0.65158039, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67253512, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 4.6916303634643555 + }, + { + "auxiliary_loss_clip": 0.01091902, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.03703308, + "balance_loss_mlp": 1.01639652, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.712614772924298, + "language_loss": 0.81737053, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83858401, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 2.543039083480835 + }, + { + "auxiliary_loss_clip": 0.01100995, + "auxiliary_loss_mlp": 0.01029032, + "balance_loss_clip": 1.03437495, + "balance_loss_mlp": 1.01776671, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.7165428311349133, + "language_loss": 0.80438769, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82568794, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 2.585057020187378 + }, + { + "auxiliary_loss_clip": 0.01095217, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.0370779, + "balance_loss_mlp": 1.02358675, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 2.0771085895640127, + "language_loss": 0.69978619, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.72109151, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.6368095874786377 + }, + { + "auxiliary_loss_clip": 0.01094842, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.03445411, + "balance_loss_mlp": 1.01998925, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.8246976393780705, + "language_loss": 0.63093233, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65219998, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 2.5316200256347656 + }, + { + "auxiliary_loss_clip": 0.01101127, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.03418684, + "balance_loss_mlp": 1.01622772, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 2.1876764762191683, + "language_loss": 0.74884921, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77013361, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 2.517448902130127 + }, + { + "auxiliary_loss_clip": 0.01070812, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.03250408, + "balance_loss_mlp": 1.01644731, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.8441783402774152, + "language_loss": 0.77903283, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.80002987, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.5815200805664062 + }, + { + "auxiliary_loss_clip": 0.01088354, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.03434479, + "balance_loss_mlp": 1.017277, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 2.2184690935312656, + "language_loss": 0.70996189, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73112893, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.5972378253936768 + }, + { + "auxiliary_loss_clip": 0.01092344, + "auxiliary_loss_mlp": 0.01034305, + "balance_loss_clip": 1.03474426, + "balance_loss_mlp": 1.02246141, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 2.0354012796253036, + "language_loss": 0.71362388, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73489034, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 2.55039644241333 + }, + { + "auxiliary_loss_clip": 0.01050329, + "auxiliary_loss_mlp": 0.01030801, + "balance_loss_clip": 1.03268194, + "balance_loss_mlp": 1.01910686, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 1.9117280005064254, + "language_loss": 0.88184106, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90265238, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 2.6859984397888184 + }, + { + "auxiliary_loss_clip": 0.01055253, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.03611231, + "balance_loss_mlp": 1.01624346, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 2.0437788377350965, + "language_loss": 0.73135889, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75220144, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 2.7268612384796143 + }, + { + "auxiliary_loss_clip": 0.01073503, + "auxiliary_loss_mlp": 0.0074931, + "balance_loss_clip": 1.03037751, + "balance_loss_mlp": 1.00018203, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.8292366447056192, + "language_loss": 0.72075737, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.73898548, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 2.6506755352020264 + }, + { + "auxiliary_loss_clip": 0.01062501, + "auxiliary_loss_mlp": 0.00749797, + "balance_loss_clip": 1.03004122, + "balance_loss_mlp": 1.00023437, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.55711275896575, + "language_loss": 0.71904778, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.7371707, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 4.280977010726929 + }, + { + "auxiliary_loss_clip": 0.01076331, + "auxiliary_loss_mlp": 0.01034319, + "balance_loss_clip": 1.03434336, + "balance_loss_mlp": 1.02246332, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 1.846455047678595, + "language_loss": 0.76434255, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78544903, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 2.6533663272857666 + }, + { + "auxiliary_loss_clip": 0.01051899, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.02897215, + "balance_loss_mlp": 1.02031684, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.6786149879917134, + "language_loss": 0.62633479, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64717519, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 4.304571628570557 + }, + { + "auxiliary_loss_clip": 0.01106897, + "auxiliary_loss_mlp": 0.01029722, + "balance_loss_clip": 1.03730798, + "balance_loss_mlp": 1.01696038, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 1.693505385535638, + "language_loss": 0.79143798, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81280422, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 2.5150985717773438 + }, + { + "auxiliary_loss_clip": 0.01076325, + "auxiliary_loss_mlp": 0.00749222, + "balance_loss_clip": 1.03485227, + "balance_loss_mlp": 1.0001986, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.662772057215716, + "language_loss": 0.82526183, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84351724, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.6486947536468506 + }, + { + "auxiliary_loss_clip": 0.01072412, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.03476894, + "balance_loss_mlp": 1.02337408, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.6755702822043834, + "language_loss": 0.80392861, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82501411, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 2.616919755935669 + }, + { + "auxiliary_loss_clip": 0.01058107, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.03257132, + "balance_loss_mlp": 1.02269721, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 1.782912222206573, + "language_loss": 0.6115551, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63249451, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.7358920574188232 + }, + { + "auxiliary_loss_clip": 0.0108685, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03943825, + "balance_loss_mlp": 1.02172709, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.700527443838597, + "language_loss": 0.71681201, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73800141, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 2.7207143306732178 + }, + { + "auxiliary_loss_clip": 0.01103012, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.03523302, + "balance_loss_mlp": 1.01767468, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 2.257636608875769, + "language_loss": 0.83530247, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85662019, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.5037009716033936 + }, + { + "auxiliary_loss_clip": 0.01080276, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.03798032, + "balance_loss_mlp": 1.02053285, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.562348249627809, + "language_loss": 0.80307931, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.8242054, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 2.6633365154266357 + }, + { + "auxiliary_loss_clip": 0.01094041, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.03579366, + "balance_loss_mlp": 1.01605248, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 1.9624428975706496, + "language_loss": 0.74534422, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76656455, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.5263798236846924 + }, + { + "auxiliary_loss_clip": 0.01088642, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.03338194, + "balance_loss_mlp": 1.02112436, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.5057260694620096, + "language_loss": 0.66621113, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.6874423, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 2.571065902709961 + }, + { + "auxiliary_loss_clip": 0.01057899, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.03217864, + "balance_loss_mlp": 1.02016461, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.6788497258766801, + "language_loss": 0.83357453, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85448366, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 4.208374977111816 + }, + { + "auxiliary_loss_clip": 0.01053338, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.03043425, + "balance_loss_mlp": 1.02158856, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 2.1271324172971497, + "language_loss": 0.73885059, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.75972366, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 2.6738433837890625 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.03121257, + "balance_loss_mlp": 1.01501179, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 4.704192579469214, + "language_loss": 0.63331485, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65415931, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 2.604132890701294 + }, + { + "auxiliary_loss_clip": 0.01105537, + "auxiliary_loss_mlp": 0.00749368, + "balance_loss_clip": 1.03601003, + "balance_loss_mlp": 1.00019836, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 1.9392649717268524, + "language_loss": 0.63367474, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65222383, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.547712802886963 + }, + { + "auxiliary_loss_clip": 0.01089328, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.03261745, + "balance_loss_mlp": 1.02074814, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.6689403952704684, + "language_loss": 0.74981254, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.77104294, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.515244483947754 + }, + { + "auxiliary_loss_clip": 0.01088013, + "auxiliary_loss_mlp": 0.0102726, + "balance_loss_clip": 1.03073001, + "balance_loss_mlp": 1.01606035, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.5482749187241254, + "language_loss": 0.81531918, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83647192, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.5632283687591553 + }, + { + "auxiliary_loss_clip": 0.01078586, + "auxiliary_loss_mlp": 0.00749353, + "balance_loss_clip": 1.03363287, + "balance_loss_mlp": 1.00019598, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 1.8625935184650473, + "language_loss": 0.70350158, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72178096, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.618387222290039 + }, + { + "auxiliary_loss_clip": 0.01066933, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.0330894, + "balance_loss_mlp": 1.02019715, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 3.924981475266302, + "language_loss": 0.61426568, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63525259, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 2.82017183303833 + }, + { + "auxiliary_loss_clip": 0.01094786, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.03470182, + "balance_loss_mlp": 1.01976347, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 2.347145172823006, + "language_loss": 0.72342145, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74469948, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.5806777477264404 + }, + { + "auxiliary_loss_clip": 0.01037938, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.0309149, + "balance_loss_mlp": 1.02244973, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 2.0749638871269918, + "language_loss": 0.6795215, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70025826, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.7104756832122803 + }, + { + "auxiliary_loss_clip": 0.01021983, + "auxiliary_loss_mlp": 0.01001428, + "balance_loss_clip": 1.01044416, + "balance_loss_mlp": 1.00019979, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.7455477204671354, + "language_loss": 0.65277052, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67300463, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.1468393802642822 + }, + { + "auxiliary_loss_clip": 0.01091339, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.03525949, + "balance_loss_mlp": 1.01993513, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 2.212773770333482, + "language_loss": 0.75288022, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77411592, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.5788028240203857 + }, + { + "auxiliary_loss_clip": 0.01093341, + "auxiliary_loss_mlp": 0.01028227, + "balance_loss_clip": 1.03468084, + "balance_loss_mlp": 1.01653194, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.7630383262977767, + "language_loss": 0.71250236, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73371804, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.505988597869873 + }, + { + "auxiliary_loss_clip": 0.01072305, + "auxiliary_loss_mlp": 0.01035846, + "balance_loss_clip": 1.03033924, + "balance_loss_mlp": 1.0233407, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.7074018529251898, + "language_loss": 0.76518989, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78627133, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 2.5996391773223877 + }, + { + "auxiliary_loss_clip": 0.01064985, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.03498411, + "balance_loss_mlp": 1.02124774, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.7498467607667385, + "language_loss": 0.75126398, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77223545, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.6455368995666504 + }, + { + "auxiliary_loss_clip": 0.01077227, + "auxiliary_loss_mlp": 0.01034636, + "balance_loss_clip": 1.03498721, + "balance_loss_mlp": 1.02193403, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 3.049192691143644, + "language_loss": 0.7672168, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78833544, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 2.6068625450134277 + }, + { + "auxiliary_loss_clip": 0.01069884, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.03125668, + "balance_loss_mlp": 1.02035856, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 3.0893350122230783, + "language_loss": 0.79554594, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.81657195, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 4.051831960678101 + }, + { + "auxiliary_loss_clip": 0.01092518, + "auxiliary_loss_mlp": 0.01033659, + "balance_loss_clip": 1.03584552, + "balance_loss_mlp": 1.02185166, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 2.0590704694293347, + "language_loss": 0.71028513, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73154694, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 2.612971305847168 + }, + { + "auxiliary_loss_clip": 0.01087757, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.03269434, + "balance_loss_mlp": 1.01866257, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.54286829145266, + "language_loss": 0.69989562, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72106791, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.634681224822998 + }, + { + "auxiliary_loss_clip": 0.01074169, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.03243852, + "balance_loss_mlp": 1.01746714, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.9296649079390926, + "language_loss": 0.63926709, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66028953, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 2.589433431625366 + }, + { + "auxiliary_loss_clip": 0.01103128, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.03700876, + "balance_loss_mlp": 1.02027762, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 4.133831307403357, + "language_loss": 0.80641162, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.8277598, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.510849714279175 + }, + { + "auxiliary_loss_clip": 0.01082504, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.03507471, + "balance_loss_mlp": 1.02548981, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.6352409659010856, + "language_loss": 0.78723574, + "learning_rate": 1.773237789559453e-06, + "loss": 0.80843008, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 2.6421895027160645 + }, + { + "auxiliary_loss_clip": 0.01065691, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.03324223, + "balance_loss_mlp": 1.01752901, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 1.9177722070468832, + "language_loss": 0.71345663, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.73440444, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 2.6031010150909424 + }, + { + "auxiliary_loss_clip": 0.01080323, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.03219342, + "balance_loss_mlp": 1.01747584, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 1.6501657891915447, + "language_loss": 0.74921095, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77031267, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 2.609278440475464 + }, + { + "auxiliary_loss_clip": 0.01072865, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.03198957, + "balance_loss_mlp": 1.0193516, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.9565156005902233, + "language_loss": 0.76207668, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78310686, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 2.582979917526245 + }, + { + "auxiliary_loss_clip": 0.01079242, + "auxiliary_loss_mlp": 0.01037415, + "balance_loss_clip": 1.03403997, + "balance_loss_mlp": 1.0260129, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 4.564792442083014, + "language_loss": 0.82285249, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84401906, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 2.598381757736206 + }, + { + "auxiliary_loss_clip": 0.01091158, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.03604078, + "balance_loss_mlp": 1.02003253, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 2.221743709647622, + "language_loss": 0.74103516, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76226127, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 2.65386700630188 + }, + { + "auxiliary_loss_clip": 0.01082182, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.03320599, + "balance_loss_mlp": 1.0211513, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.7256398680983416, + "language_loss": 0.72575009, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74691147, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 4.184408187866211 + }, + { + "auxiliary_loss_clip": 0.01007584, + "auxiliary_loss_mlp": 0.01003386, + "balance_loss_clip": 1.0059104, + "balance_loss_mlp": 1.00212193, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7449439700023196, + "language_loss": 0.55348796, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57359761, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 3.2524139881134033 + }, + { + "auxiliary_loss_clip": 0.01088628, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.03310466, + "balance_loss_mlp": 1.02264488, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.7116139097760505, + "language_loss": 0.82791471, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84914017, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 4.0643792152404785 + }, + { + "auxiliary_loss_clip": 0.01108235, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.03696716, + "balance_loss_mlp": 1.02027822, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.5294282102978554, + "language_loss": 0.75324404, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77465808, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.529020071029663 + }, + { + "auxiliary_loss_clip": 0.01066848, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.04095113, + "balance_loss_mlp": 1.02145267, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.5847705885799626, + "language_loss": 0.6973778, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71837306, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 2.688291311264038 + }, + { + "auxiliary_loss_clip": 0.01053454, + "auxiliary_loss_mlp": 0.00749198, + "balance_loss_clip": 1.03256965, + "balance_loss_mlp": 1.00013876, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 2.1640553006182937, + "language_loss": 0.68323737, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.7012639, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 2.7472445964813232 + }, + { + "auxiliary_loss_clip": 0.0110083, + "auxiliary_loss_mlp": 0.01032624, + "balance_loss_clip": 1.03567576, + "balance_loss_mlp": 1.02114439, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 2.2684668681122675, + "language_loss": 0.71792084, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.73925543, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.4856619834899902 + }, + { + "auxiliary_loss_clip": 0.01092775, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.03586245, + "balance_loss_mlp": 1.02393317, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 1.645243556311395, + "language_loss": 0.69848287, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71976489, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.581110954284668 + }, + { + "auxiliary_loss_clip": 0.01099388, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.0337013, + "balance_loss_mlp": 1.02025747, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6056738221308682, + "language_loss": 0.85705906, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87836665, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.552130699157715 + }, + { + "auxiliary_loss_clip": 0.01068005, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.03351474, + "balance_loss_mlp": 1.01911569, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.5460510407897732, + "language_loss": 0.80611742, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.82709944, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 2.573065996170044 + }, + { + "auxiliary_loss_clip": 0.01085261, + "auxiliary_loss_mlp": 0.01024867, + "balance_loss_clip": 1.03691292, + "balance_loss_mlp": 1.0128386, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.855970555238591, + "language_loss": 0.73451138, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75561267, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 2.619959831237793 + }, + { + "auxiliary_loss_clip": 0.01080734, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.03091741, + "balance_loss_mlp": 1.01807404, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 2.6099418158248384, + "language_loss": 0.78962648, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81073707, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.4768669605255127 + }, + { + "auxiliary_loss_clip": 0.0107129, + "auxiliary_loss_mlp": 0.01026826, + "balance_loss_clip": 1.03425527, + "balance_loss_mlp": 1.01535201, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.268050427696447, + "language_loss": 0.76153702, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78251815, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 2.556523084640503 + }, + { + "auxiliary_loss_clip": 0.010909, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.03423095, + "balance_loss_mlp": 1.0132575, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.497075506983856, + "language_loss": 0.80654013, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82770133, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 4.272291660308838 + }, + { + "auxiliary_loss_clip": 0.01094454, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.03606439, + "balance_loss_mlp": 1.02295351, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.6699257621578245, + "language_loss": 0.6859321, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.70722365, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 2.568514347076416 + }, + { + "auxiliary_loss_clip": 0.01085181, + "auxiliary_loss_mlp": 0.01023582, + "balance_loss_clip": 1.03174996, + "balance_loss_mlp": 1.01288891, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.0495452651682613, + "language_loss": 0.85577631, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87686396, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.5675976276397705 + }, + { + "auxiliary_loss_clip": 0.01005527, + "auxiliary_loss_mlp": 0.01003624, + "balance_loss_clip": 1.00994492, + "balance_loss_mlp": 1.0022589, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7805944992069133, + "language_loss": 0.59883618, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.6189276, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.195232391357422 + }, + { + "auxiliary_loss_clip": 0.01065377, + "auxiliary_loss_mlp": 0.0103513, + "balance_loss_clip": 1.03234398, + "balance_loss_mlp": 1.02354908, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.5840504930740624, + "language_loss": 0.70430005, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72530508, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.6365723609924316 + }, + { + "auxiliary_loss_clip": 0.0109896, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.03308845, + "balance_loss_mlp": 1.01628399, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.7450615145322017, + "language_loss": 0.75665528, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77792418, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.5093746185302734 + }, + { + "auxiliary_loss_clip": 0.01066251, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.03354502, + "balance_loss_mlp": 1.01742029, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.6507674747642016, + "language_loss": 0.75060868, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.77156115, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.6255569458007812 + }, + { + "auxiliary_loss_clip": 0.01082405, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.03421748, + "balance_loss_mlp": 1.01795912, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.7514975352604047, + "language_loss": 0.72593206, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74705118, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.603152275085449 + }, + { + "auxiliary_loss_clip": 0.01092979, + "auxiliary_loss_mlp": 0.01034371, + "balance_loss_clip": 1.03551996, + "balance_loss_mlp": 1.02310586, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.8215227865598447, + "language_loss": 0.69474179, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71601522, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.5659327507019043 + }, + { + "auxiliary_loss_clip": 0.0109357, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.03722429, + "balance_loss_mlp": 1.01942945, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.5721940606922753, + "language_loss": 0.70875061, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.72999561, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.5898234844207764 + }, + { + "auxiliary_loss_clip": 0.01093634, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.03714621, + "balance_loss_mlp": 1.01660347, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.9556095914425562, + "language_loss": 0.80097812, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82219225, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.5403614044189453 + }, + { + "auxiliary_loss_clip": 0.01042166, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.03487587, + "balance_loss_mlp": 1.02946055, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.573025370826102, + "language_loss": 0.75120819, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77205265, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.6996328830718994 + }, + { + "auxiliary_loss_clip": 0.01094751, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.03694558, + "balance_loss_mlp": 1.02516627, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 2.2953025395215647, + "language_loss": 0.70130658, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72262043, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 2.692216634750366 + }, + { + "auxiliary_loss_clip": 0.01085887, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.03375363, + "balance_loss_mlp": 1.024575, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 3.85884917633517, + "language_loss": 0.67285538, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69408298, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.5431294441223145 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.03629589, + "balance_loss_mlp": 1.02206111, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.9170644494454023, + "language_loss": 0.790205, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81161547, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 2.534569501876831 + }, + { + "auxiliary_loss_clip": 0.01068566, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.03453398, + "balance_loss_mlp": 1.01704931, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 2.686207383352333, + "language_loss": 0.83451176, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.85548627, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.7041733264923096 + }, + { + "auxiliary_loss_clip": 0.01081571, + "auxiliary_loss_mlp": 0.01026751, + "balance_loss_clip": 1.03479648, + "balance_loss_mlp": 1.01514626, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.2500068489757774, + "language_loss": 0.67034864, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69143188, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 4.128661870956421 + }, + { + "auxiliary_loss_clip": 0.01091633, + "auxiliary_loss_mlp": 0.01024215, + "balance_loss_clip": 1.03482044, + "balance_loss_mlp": 1.01234138, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.502695852702679, + "language_loss": 0.76302737, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78418589, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.6976678371429443 + }, + { + "auxiliary_loss_clip": 0.01061632, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.03235531, + "balance_loss_mlp": 1.01938796, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.9645462342464564, + "language_loss": 0.74127114, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76220536, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 2.6886227130889893 + }, + { + "auxiliary_loss_clip": 0.01076383, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.03808117, + "balance_loss_mlp": 1.02324057, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 1.9867008513096664, + "language_loss": 0.66962326, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.69072759, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 2.6203184127807617 + }, + { + "auxiliary_loss_clip": 0.01084382, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.03756273, + "balance_loss_mlp": 1.01588905, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.66229283690318, + "language_loss": 0.77549994, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79661888, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 2.571153402328491 + }, + { + "auxiliary_loss_clip": 0.01075957, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.03354943, + "balance_loss_mlp": 1.02153039, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 2.0222057579781434, + "language_loss": 0.81563729, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83672953, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 2.5727365016937256 + }, + { + "auxiliary_loss_clip": 0.01090081, + "auxiliary_loss_mlp": 0.00749133, + "balance_loss_clip": 1.03729272, + "balance_loss_mlp": 1.00021124, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 2.043058918563345, + "language_loss": 0.76683468, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78522676, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.686553955078125 + }, + { + "auxiliary_loss_clip": 0.01107593, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.03646278, + "balance_loss_mlp": 1.022861, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 2.2896587364081498, + "language_loss": 0.79048645, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81192291, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 2.592482805252075 + }, + { + "auxiliary_loss_clip": 0.01029319, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.02926993, + "balance_loss_mlp": 1.02196717, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 1.8308197312923309, + "language_loss": 0.68516731, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70580232, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 2.7285470962524414 + }, + { + "auxiliary_loss_clip": 0.01091661, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.03607249, + "balance_loss_mlp": 1.0195744, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 2.17208801802428, + "language_loss": 0.77508307, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79629731, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 2.564227819442749 + }, + { + "auxiliary_loss_clip": 0.01062518, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.02984273, + "balance_loss_mlp": 1.02581048, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.5603806460678566, + "language_loss": 0.78659338, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80759311, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 2.6558539867401123 + }, + { + "auxiliary_loss_clip": 0.01071517, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.03620529, + "balance_loss_mlp": 1.02192461, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 2.050729785503662, + "language_loss": 0.70014203, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.72119629, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 2.799255132675171 + }, + { + "auxiliary_loss_clip": 0.01088462, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.03590417, + "balance_loss_mlp": 1.02062011, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.002144453147732, + "language_loss": 0.74283445, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76404941, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 4.071580171585083 + }, + { + "auxiliary_loss_clip": 0.01082822, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.03398871, + "balance_loss_mlp": 1.02406061, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5850236110082379, + "language_loss": 0.77045065, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.7916348, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 4.023122549057007 + }, + { + "auxiliary_loss_clip": 0.01081103, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.03369403, + "balance_loss_mlp": 1.01762629, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.5783011202643673, + "language_loss": 0.76198125, + "learning_rate": 1.754287837093407e-06, + "loss": 0.783077, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.8394882678985596 + }, + { + "auxiliary_loss_clip": 0.01099232, + "auxiliary_loss_mlp": 0.01023488, + "balance_loss_clip": 1.03320944, + "balance_loss_mlp": 1.01294398, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.4839357464300424, + "language_loss": 0.7904529, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81168008, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 2.549156427383423 + }, + { + "auxiliary_loss_clip": 0.01060639, + "auxiliary_loss_mlp": 0.01039612, + "balance_loss_clip": 1.03367198, + "balance_loss_mlp": 1.02719092, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 2.4616311629438155, + "language_loss": 0.63719988, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.65820241, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.6317007541656494 + }, + { + "auxiliary_loss_clip": 0.01085958, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.03664696, + "balance_loss_mlp": 1.01792395, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.429341953770012, + "language_loss": 0.66173685, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68289852, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 2.7167561054229736 + }, + { + "auxiliary_loss_clip": 0.01086878, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.03366256, + "balance_loss_mlp": 1.01843822, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 9.35740943635843, + "language_loss": 0.60981375, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63098621, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.592633008956909 + }, + { + "auxiliary_loss_clip": 0.01090016, + "auxiliary_loss_mlp": 0.00749071, + "balance_loss_clip": 1.03519487, + "balance_loss_mlp": 1.00015724, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.6235382098461943, + "language_loss": 0.64586717, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.664258, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 2.5229949951171875 + }, + { + "auxiliary_loss_clip": 0.01089917, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.03346872, + "balance_loss_mlp": 1.01861119, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.6831502028997893, + "language_loss": 0.63902009, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.66021192, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.56099271774292 + }, + { + "auxiliary_loss_clip": 0.01087644, + "auxiliary_loss_mlp": 0.01024999, + "balance_loss_clip": 1.03356171, + "balance_loss_mlp": 1.01483059, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.4692700402569043, + "language_loss": 0.77264804, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79377449, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.6223304271698 + }, + { + "auxiliary_loss_clip": 0.01037823, + "auxiliary_loss_mlp": 0.01038652, + "balance_loss_clip": 1.02892637, + "balance_loss_mlp": 1.02668321, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.45270830214289, + "language_loss": 0.72227895, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74304366, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 2.7637648582458496 + }, + { + "auxiliary_loss_clip": 0.01051615, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.03205466, + "balance_loss_mlp": 1.02298295, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.8291607987943057, + "language_loss": 0.7545051, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77536106, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 2.606735944747925 + }, + { + "auxiliary_loss_clip": 0.01073907, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.03632534, + "balance_loss_mlp": 1.01860595, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.273976530495513, + "language_loss": 0.62023222, + "learning_rate": 1.750423192272189e-06, + "loss": 0.64127743, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 4.051743507385254 + }, + { + "auxiliary_loss_clip": 0.01101529, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.0344907, + "balance_loss_mlp": 1.02092695, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.069807678817196, + "language_loss": 0.64274603, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66407931, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.4722328186035156 + }, + { + "auxiliary_loss_clip": 0.0106533, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.03394473, + "balance_loss_mlp": 1.02153373, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.9417127926477766, + "language_loss": 0.82638079, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84737003, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.654841184616089 + }, + { + "auxiliary_loss_clip": 0.01079668, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.03353047, + "balance_loss_mlp": 1.01727533, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 7.523588027648396, + "language_loss": 0.73079324, + "learning_rate": 1.74926398270663e-06, + "loss": 0.7518667, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.617677688598633 + }, + { + "auxiliary_loss_clip": 0.01071585, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.03331995, + "balance_loss_mlp": 1.02208483, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.7968836320844783, + "language_loss": 0.66881239, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68987465, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 2.6191396713256836 + }, + { + "auxiliary_loss_clip": 0.01075417, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.03651142, + "balance_loss_mlp": 1.0130024, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.4398710443438016, + "language_loss": 0.51568854, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53669918, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.797478437423706 + }, + { + "auxiliary_loss_clip": 0.01075031, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.03644085, + "balance_loss_mlp": 1.02002978, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.9168515536802555, + "language_loss": 0.8583132, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.8793813, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.6836256980895996 + }, + { + "auxiliary_loss_clip": 0.01088985, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.02034104, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.7888925205385955, + "language_loss": 0.70385665, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72505593, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.6621830463409424 + }, + { + "auxiliary_loss_clip": 0.01072855, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.0330478, + "balance_loss_mlp": 1.01591825, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.821772751624995, + "language_loss": 0.72817969, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.74918765, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.7144510746002197 + }, + { + "auxiliary_loss_clip": 0.01076646, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.0341624, + "balance_loss_mlp": 1.01707172, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 2.4910327592104404, + "language_loss": 0.71611923, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73717022, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.633291244506836 + }, + { + "auxiliary_loss_clip": 0.01081417, + "auxiliary_loss_mlp": 0.01026294, + "balance_loss_clip": 1.03268719, + "balance_loss_mlp": 1.01560664, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.6691098187492963, + "language_loss": 0.78201234, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80308938, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.5235564708709717 + }, + { + "auxiliary_loss_clip": 0.01056783, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.03006637, + "balance_loss_mlp": 1.02001333, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.736906119311492, + "language_loss": 0.72260249, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74350053, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 2.638232469558716 + }, + { + "auxiliary_loss_clip": 0.0108888, + "auxiliary_loss_mlp": 0.01039526, + "balance_loss_clip": 1.03691566, + "balance_loss_mlp": 1.02802849, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.5103547208091561, + "language_loss": 0.71362174, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73490584, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 2.519015312194824 + }, + { + "auxiliary_loss_clip": 0.01100535, + "auxiliary_loss_mlp": 0.01025502, + "balance_loss_clip": 1.03493452, + "balance_loss_mlp": 1.01492262, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.7608735304780498, + "language_loss": 0.79138339, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81264377, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.4831745624542236 + }, + { + "auxiliary_loss_clip": 0.0107006, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.03822827, + "balance_loss_mlp": 1.0182879, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 2.1814403920424916, + "language_loss": 0.8348704, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.85586703, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.620457410812378 + }, + { + "auxiliary_loss_clip": 0.01081974, + "auxiliary_loss_mlp": 0.00749312, + "balance_loss_clip": 1.04119325, + "balance_loss_mlp": 1.00018382, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.6543941952795198, + "language_loss": 0.75125384, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.76956671, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.691265344619751 + }, + { + "auxiliary_loss_clip": 0.01071283, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.03284752, + "balance_loss_mlp": 1.01513648, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.8064164560793887, + "language_loss": 0.82128567, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.8422755, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 4.222588300704956 + }, + { + "auxiliary_loss_clip": 0.01088583, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.03594482, + "balance_loss_mlp": 1.02827454, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 1.7519677984839395, + "language_loss": 0.57323867, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59451956, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.5732736587524414 + }, + { + "auxiliary_loss_clip": 0.01094096, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.03500688, + "balance_loss_mlp": 1.02329862, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5184792742370006, + "language_loss": 0.67857039, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69986093, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 2.6113364696502686 + }, + { + "auxiliary_loss_clip": 0.01063011, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.02927053, + "balance_loss_mlp": 1.01818275, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.8433795326862752, + "language_loss": 0.7431618, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76408279, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 2.6304666996002197 + }, + { + "auxiliary_loss_clip": 0.01066967, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.03518403, + "balance_loss_mlp": 1.02067029, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 2.1563065924932645, + "language_loss": 0.73578227, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75678092, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.6798646450042725 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.03433251, + "balance_loss_mlp": 1.01695919, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 1.635704180371344, + "language_loss": 0.75937247, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.78067505, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 2.5273613929748535 + }, + { + "auxiliary_loss_clip": 0.01092976, + "auxiliary_loss_mlp": 0.0074929, + "balance_loss_clip": 1.03684831, + "balance_loss_mlp": 1.00019729, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.899902630722678, + "language_loss": 0.68443459, + "learning_rate": 1.741924325613172e-06, + "loss": 0.7028572, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.601649761199951 + }, + { + "auxiliary_loss_clip": 0.0105785, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.03540564, + "balance_loss_mlp": 1.02254438, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.126819407019468, + "language_loss": 0.68187565, + "learning_rate": 1.741538124855163e-06, + "loss": 0.7028023, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 2.7050909996032715 + }, + { + "auxiliary_loss_clip": 0.01105993, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0354116, + "balance_loss_mlp": 1.0203743, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.8845574110385206, + "language_loss": 0.781214, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80260497, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 2.5133213996887207 + }, + { + "auxiliary_loss_clip": 0.01053189, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.02918148, + "balance_loss_mlp": 1.0211159, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.641593398113633, + "language_loss": 0.82401246, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84486532, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 2.6270837783813477 + }, + { + "auxiliary_loss_clip": 0.01087444, + "auxiliary_loss_mlp": 0.01037042, + "balance_loss_clip": 1.03265524, + "balance_loss_mlp": 1.02528787, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.0720206651532447, + "language_loss": 0.74986553, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77111042, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 2.513402223587036 + }, + { + "auxiliary_loss_clip": 0.01076054, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.03171945, + "balance_loss_mlp": 1.0164876, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 1.82418731566553, + "language_loss": 0.6472255, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.6682601, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 2.5582313537597656 + }, + { + "auxiliary_loss_clip": 0.01039789, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.0263176, + "balance_loss_mlp": 1.01926255, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.9639536888692501, + "language_loss": 0.67560679, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.69631815, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 4.090883493423462 + }, + { + "auxiliary_loss_clip": 0.01097495, + "auxiliary_loss_mlp": 0.01024654, + "balance_loss_clip": 1.03364551, + "balance_loss_mlp": 1.01392496, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 2.5452549443631507, + "language_loss": 0.86420763, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88542914, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 4.036989688873291 + }, + { + "auxiliary_loss_clip": 0.01088744, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.03326809, + "balance_loss_mlp": 1.01860666, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.6972752229118317, + "language_loss": 0.73567748, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75686151, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 2.5499203205108643 + }, + { + "auxiliary_loss_clip": 0.0108456, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.03138494, + "balance_loss_mlp": 1.01766622, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 2.148796572387102, + "language_loss": 0.78323489, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80437309, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 2.797214984893799 + }, + { + "auxiliary_loss_clip": 0.01082138, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.03496039, + "balance_loss_mlp": 1.01644492, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.8603647322878338, + "language_loss": 0.79876566, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.81986362, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.5861518383026123 + }, + { + "auxiliary_loss_clip": 0.0107723, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.03209877, + "balance_loss_mlp": 1.01808453, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.887762725868638, + "language_loss": 0.65074313, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67180824, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.571800947189331 + }, + { + "auxiliary_loss_clip": 0.01092564, + "auxiliary_loss_mlp": 0.00749256, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.0002799, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 5.965486510196011, + "language_loss": 0.72977841, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.7481966, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 2.5450878143310547 + }, + { + "auxiliary_loss_clip": 0.01088823, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.03882122, + "balance_loss_mlp": 1.01897979, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 3.5163224096516874, + "language_loss": 0.63808858, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.65929031, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 2.616732597351074 + }, + { + "auxiliary_loss_clip": 0.01075906, + "auxiliary_loss_mlp": 0.00749163, + "balance_loss_clip": 1.03627348, + "balance_loss_mlp": 1.00024486, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 2.293120004502629, + "language_loss": 0.75214183, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77039242, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 2.627671003341675 + }, + { + "auxiliary_loss_clip": 0.01071313, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.03022611, + "balance_loss_mlp": 1.02007067, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.2749525030388833, + "language_loss": 0.74392247, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76493835, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 2.5937514305114746 + }, + { + "auxiliary_loss_clip": 0.01078442, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.03387308, + "balance_loss_mlp": 1.01794612, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.2263400695276663, + "language_loss": 0.79542804, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.8165127, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 2.6339473724365234 + }, + { + "auxiliary_loss_clip": 0.01101824, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.03481817, + "balance_loss_mlp": 1.02235746, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.8170792887257894, + "language_loss": 0.73857903, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.75993025, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 2.475087881088257 + }, + { + "auxiliary_loss_clip": 0.01075763, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.03382337, + "balance_loss_mlp": 1.02177608, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 3.3350884455249634, + "language_loss": 0.75491405, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.77601129, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 4.15275764465332 + }, + { + "auxiliary_loss_clip": 0.0098801, + "auxiliary_loss_mlp": 0.0101472, + "balance_loss_clip": 1.0063796, + "balance_loss_mlp": 1.01336098, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.845079045317156, + "language_loss": 0.59411126, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.6141386, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.306577682495117 + }, + { + "auxiliary_loss_clip": 0.01098162, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.0316025, + "balance_loss_mlp": 1.01930368, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 2.2284135115344763, + "language_loss": 0.79597497, + "learning_rate": 1.734202189316832e-06, + "loss": 0.8172636, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.470735788345337 + }, + { + "auxiliary_loss_clip": 0.01080323, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.03339851, + "balance_loss_mlp": 1.01774645, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 2.419764810677252, + "language_loss": 0.6911962, + "learning_rate": 1.733816187358836e-06, + "loss": 0.71229488, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.5574867725372314 + }, + { + "auxiliary_loss_clip": 0.01090407, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.03328729, + "balance_loss_mlp": 1.01676083, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.6113699943066317, + "language_loss": 0.75616324, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77734631, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.5627825260162354 + }, + { + "auxiliary_loss_clip": 0.01086343, + "auxiliary_loss_mlp": 0.01039092, + "balance_loss_clip": 1.0327673, + "balance_loss_mlp": 1.02612853, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.612391095062654, + "language_loss": 0.72830975, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74956411, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 2.572169542312622 + }, + { + "auxiliary_loss_clip": 0.01073735, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.03597832, + "balance_loss_mlp": 1.01824236, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 2.621536716805197, + "language_loss": 0.83555478, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85658306, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.6480212211608887 + }, + { + "auxiliary_loss_clip": 0.00998808, + "auxiliary_loss_mlp": 0.01000533, + "balance_loss_clip": 1.00754857, + "balance_loss_mlp": 0.99926913, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.867876629171466, + "language_loss": 0.64864659, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66864002, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 3.0027496814727783 + }, + { + "auxiliary_loss_clip": 0.01092644, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.03679419, + "balance_loss_mlp": 1.02228522, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.7795017082110618, + "language_loss": 0.69305545, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71431696, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.5616796016693115 + }, + { + "auxiliary_loss_clip": 0.01065643, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.03201425, + "balance_loss_mlp": 1.01890421, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 2.8049557982637277, + "language_loss": 0.7583431, + "learning_rate": 1.73150038809119e-06, + "loss": 0.77929544, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 2.5951225757598877 + }, + { + "auxiliary_loss_clip": 0.01057227, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.032969, + "balance_loss_mlp": 1.02121353, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 1.9684272339568674, + "language_loss": 0.60638666, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.6272794, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.730419874191284 + }, + { + "auxiliary_loss_clip": 0.01063463, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.03055418, + "balance_loss_mlp": 1.01831341, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.6851620902548579, + "language_loss": 0.78673464, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.8076756, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 2.694852352142334 + }, + { + "auxiliary_loss_clip": 0.0107273, + "auxiliary_loss_mlp": 0.01031664, + "balance_loss_clip": 1.03446126, + "balance_loss_mlp": 1.01973724, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.886585092873807, + "language_loss": 0.81281835, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.8338623, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.6233885288238525 + }, + { + "auxiliary_loss_clip": 0.01102297, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.03531134, + "balance_loss_mlp": 1.02161956, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.5772379249398576, + "language_loss": 0.68776637, + "learning_rate": 1.729956725348256e-06, + "loss": 0.70912176, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.592432737350464 + }, + { + "auxiliary_loss_clip": 0.00998126, + "auxiliary_loss_mlp": 0.01001267, + "balance_loss_clip": 1.00686598, + "balance_loss_mlp": 0.99988443, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7298286300539355, + "language_loss": 0.61157966, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63157362, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 3.1389732360839844 + }, + { + "auxiliary_loss_clip": 0.01091183, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.0335536, + "balance_loss_mlp": 1.02327836, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.7382340332785398, + "language_loss": 0.64608639, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66734284, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 2.599396228790283 + }, + { + "auxiliary_loss_clip": 0.01075641, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.03144717, + "balance_loss_mlp": 1.01452911, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 1.7336789193297086, + "language_loss": 0.73340929, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75442302, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 4.025383949279785 + }, + { + "auxiliary_loss_clip": 0.01076894, + "auxiliary_loss_mlp": 0.01025867, + "balance_loss_clip": 1.03957474, + "balance_loss_mlp": 1.0144887, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 2.140378482541645, + "language_loss": 0.7618233, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78285092, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 2.5961859226226807 + }, + { + "auxiliary_loss_clip": 0.01073889, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.03443122, + "balance_loss_mlp": 1.01926005, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.3456111321487443, + "language_loss": 0.71154886, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73258209, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.58402943611145 + }, + { + "auxiliary_loss_clip": 0.01074986, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.03189206, + "balance_loss_mlp": 1.02304947, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.9453838332521274, + "language_loss": 0.68221939, + "learning_rate": 1.727641538728533e-06, + "loss": 0.70331013, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 2.6221401691436768 + }, + { + "auxiliary_loss_clip": 0.01085957, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.03268886, + "balance_loss_mlp": 1.02602053, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 2.210102755969541, + "language_loss": 0.7445246, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76575071, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.5316574573516846 + }, + { + "auxiliary_loss_clip": 0.01083987, + "auxiliary_loss_mlp": 0.00749057, + "balance_loss_clip": 1.0341053, + "balance_loss_mlp": 1.00024509, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 1.9171811494263848, + "language_loss": 0.7508496, + "learning_rate": 1.726869892322104e-06, + "loss": 0.76918006, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.6127240657806396 + }, + { + "auxiliary_loss_clip": 0.01060987, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.02993071, + "balance_loss_mlp": 1.02201676, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.611330618301546, + "language_loss": 0.82881856, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84977305, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 2.647104024887085 + }, + { + "auxiliary_loss_clip": 0.01060363, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.03506684, + "balance_loss_mlp": 1.02000225, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.987210285861477, + "language_loss": 0.79456288, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81548482, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 2.705292224884033 + }, + { + "auxiliary_loss_clip": 0.01079641, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.03281748, + "balance_loss_mlp": 1.01625824, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.8586122512511767, + "language_loss": 0.90241349, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92348695, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.6066155433654785 + }, + { + "auxiliary_loss_clip": 0.01066418, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.03284478, + "balance_loss_mlp": 1.0166136, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 2.4172223745849424, + "language_loss": 0.83165157, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.85259509, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 2.6915550231933594 + }, + { + "auxiliary_loss_clip": 0.01093172, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.03536892, + "balance_loss_mlp": 1.02169847, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 1.9713662885816554, + "language_loss": 0.74481881, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76609462, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 2.577054738998413 + }, + { + "auxiliary_loss_clip": 0.01082323, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.03731489, + "balance_loss_mlp": 1.02238941, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 3.0270539079272276, + "language_loss": 0.78659666, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.80777454, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.522800922393799 + }, + { + "auxiliary_loss_clip": 0.01078366, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.03561425, + "balance_loss_mlp": 1.01582527, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 2.1719619339258602, + "language_loss": 0.74827218, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76933181, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 4.01890754699707 + }, + { + "auxiliary_loss_clip": 0.01078481, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.03261232, + "balance_loss_mlp": 1.02005696, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.6683221559970864, + "language_loss": 0.75439578, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77549231, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 4.081066608428955 + }, + { + "auxiliary_loss_clip": 0.01098764, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.03373957, + "balance_loss_mlp": 1.02065277, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.568257203805903, + "language_loss": 0.71647549, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.7377767, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.520707130432129 + }, + { + "auxiliary_loss_clip": 0.01066498, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.03436852, + "balance_loss_mlp": 1.0195967, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 3.307015403753447, + "language_loss": 0.75666457, + "learning_rate": 1.723012284057868e-06, + "loss": 0.7776435, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.6363525390625 + }, + { + "auxiliary_loss_clip": 0.01074263, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.0300982, + "balance_loss_mlp": 1.02068496, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 2.0823316460632504, + "language_loss": 0.67562944, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69669104, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 2.583221912384033 + }, + { + "auxiliary_loss_clip": 0.010865, + "auxiliary_loss_mlp": 0.01034221, + "balance_loss_clip": 1.03247786, + "balance_loss_mlp": 1.02271748, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.6043811207103595, + "language_loss": 0.73125154, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75245875, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.577305555343628 + }, + { + "auxiliary_loss_clip": 0.01068098, + "auxiliary_loss_mlp": 0.00749365, + "balance_loss_clip": 1.0328877, + "balance_loss_mlp": 1.00029027, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 3.5872679359889954, + "language_loss": 0.75823057, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77640522, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.634988784790039 + }, + { + "auxiliary_loss_clip": 0.01040849, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.03208947, + "balance_loss_mlp": 1.01724124, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.806180158506875, + "language_loss": 0.66113967, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68183655, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.6690917015075684 + }, + { + "auxiliary_loss_clip": 0.01071301, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.03673458, + "balance_loss_mlp": 1.0172435, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 2.2830821066641867, + "language_loss": 0.8270694, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.84805852, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 2.6048996448516846 + }, + { + "auxiliary_loss_clip": 0.01081103, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.03466988, + "balance_loss_mlp": 1.02280903, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.1976414342994905, + "language_loss": 0.85068846, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87184161, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 2.5734243392944336 + }, + { + "auxiliary_loss_clip": 0.01077014, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.03394318, + "balance_loss_mlp": 1.02142859, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 2.161671682476369, + "language_loss": 0.73925716, + "learning_rate": 1.720312582354912e-06, + "loss": 0.76035416, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 2.5968782901763916 + }, + { + "auxiliary_loss_clip": 0.01103642, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.03553557, + "balance_loss_mlp": 1.01914346, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.7216832382723337, + "language_loss": 0.74157572, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76291466, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.6018381118774414 + }, + { + "auxiliary_loss_clip": 0.01067053, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.03431368, + "balance_loss_mlp": 1.01915336, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.845677799834858, + "language_loss": 0.7460435, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.76703352, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 4.2087767124176025 + }, + { + "auxiliary_loss_clip": 0.01081101, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.03539026, + "balance_loss_mlp": 1.02497089, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 2.5746232309494848, + "language_loss": 0.77627587, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79746771, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.585907220840454 + }, + { + "auxiliary_loss_clip": 0.01075135, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.03556895, + "balance_loss_mlp": 1.01886213, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.7417142235336764, + "language_loss": 0.6095677, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63062966, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.7067360877990723 + }, + { + "auxiliary_loss_clip": 0.01052314, + "auxiliary_loss_mlp": 0.01028532, + "balance_loss_clip": 1.03186917, + "balance_loss_mlp": 1.01678991, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 2.7450393929277874, + "language_loss": 0.67725086, + "learning_rate": 1.7183845418764e-06, + "loss": 0.69805932, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 2.664973735809326 + }, + { + "auxiliary_loss_clip": 0.01073375, + "auxiliary_loss_mlp": 0.01035099, + "balance_loss_clip": 1.0343715, + "balance_loss_mlp": 1.02283263, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 1.7167124507680656, + "language_loss": 0.83868742, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.85977221, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.6574926376342773 + }, + { + "auxiliary_loss_clip": 0.01073443, + "auxiliary_loss_mlp": 0.01039399, + "balance_loss_clip": 1.03400934, + "balance_loss_mlp": 1.0274899, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 2.0143430492248675, + "language_loss": 0.73922455, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.76035297, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 2.6758458614349365 + }, + { + "auxiliary_loss_clip": 0.01068893, + "auxiliary_loss_mlp": 0.01035375, + "balance_loss_clip": 1.03391385, + "balance_loss_mlp": 1.02454448, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 3.161020909746229, + "language_loss": 0.72582895, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.74687165, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 2.6332147121429443 + }, + { + "auxiliary_loss_clip": 0.01082242, + "auxiliary_loss_mlp": 0.00749344, + "balance_loss_clip": 1.03551579, + "balance_loss_mlp": 1.00024641, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 6.724439621594341, + "language_loss": 0.68775946, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70607531, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.6526222229003906 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.03759718, + "balance_loss_mlp": 1.02055752, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.7577779312121316, + "language_loss": 0.80637729, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82774615, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.552269220352173 + }, + { + "auxiliary_loss_clip": 0.01091544, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.03554666, + "balance_loss_mlp": 1.01773667, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.5817550666681772, + "language_loss": 0.6521585, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67336297, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.5245320796966553 + }, + { + "auxiliary_loss_clip": 0.01074456, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.03718734, + "balance_loss_mlp": 1.02424669, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.7020685436834329, + "language_loss": 0.75264764, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77376187, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.6214542388916016 + }, + { + "auxiliary_loss_clip": 0.01009315, + "auxiliary_loss_mlp": 0.01023529, + "balance_loss_clip": 1.00803697, + "balance_loss_mlp": 1.02196777, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6864430743961508, + "language_loss": 0.52399421, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54432261, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 3.178696870803833 + }, + { + "auxiliary_loss_clip": 0.01083842, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.03357339, + "balance_loss_mlp": 1.01994741, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.9682314260963838, + "language_loss": 0.68262982, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70377743, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.7055552005767822 + }, + { + "auxiliary_loss_clip": 0.0104195, + "auxiliary_loss_mlp": 0.01052736, + "balance_loss_clip": 1.02856195, + "balance_loss_mlp": 1.03829944, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 2.503580262815979, + "language_loss": 0.81938553, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84033239, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 2.6712636947631836 + }, + { + "auxiliary_loss_clip": 0.01102267, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.03449094, + "balance_loss_mlp": 1.01500154, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 2.0277413587903705, + "language_loss": 0.67759812, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69889128, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 2.547299861907959 + }, + { + "auxiliary_loss_clip": 0.01067888, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.03436482, + "balance_loss_mlp": 1.01432514, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 2.0730756263640275, + "language_loss": 0.70676363, + "learning_rate": 1.713758337453878e-06, + "loss": 0.7277118, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.5991368293762207 + }, + { + "auxiliary_loss_clip": 0.01024368, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.02959073, + "balance_loss_mlp": 1.02601242, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.9220641560144571, + "language_loss": 0.72636652, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74699926, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 4.372623682022095 + }, + { + "auxiliary_loss_clip": 0.01089072, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.03283942, + "balance_loss_mlp": 1.01734066, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.8591685690078328, + "language_loss": 0.77817059, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.79934633, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.6119279861450195 + }, + { + "auxiliary_loss_clip": 0.01054317, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.03442383, + "balance_loss_mlp": 1.01642752, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.5342644845459805, + "language_loss": 0.69153023, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.71234781, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 2.6518659591674805 + }, + { + "auxiliary_loss_clip": 0.01009757, + "auxiliary_loss_mlp": 0.01006363, + "balance_loss_clip": 1.01127672, + "balance_loss_mlp": 1.00515866, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9217970477525955, + "language_loss": 0.60287786, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62303901, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.254835605621338 + }, + { + "auxiliary_loss_clip": 0.01087432, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.03610349, + "balance_loss_mlp": 1.02109742, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.6369111419472657, + "language_loss": 0.74075782, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76195717, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 2.567453622817993 + }, + { + "auxiliary_loss_clip": 0.0102557, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.02607656, + "balance_loss_mlp": 1.02284992, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.8950502731897723, + "language_loss": 0.696962, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71757936, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 2.7353463172912598 + }, + { + "auxiliary_loss_clip": 0.01080297, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.0361402, + "balance_loss_mlp": 1.01765656, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 1.9042192805883187, + "language_loss": 0.75175798, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77287239, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 2.6216835975646973 + }, + { + "auxiliary_loss_clip": 0.01095058, + "auxiliary_loss_mlp": 0.01035372, + "balance_loss_clip": 1.03661227, + "balance_loss_mlp": 1.02224672, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 2.781516091370349, + "language_loss": 0.6926291, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71393341, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 2.590494155883789 + }, + { + "auxiliary_loss_clip": 0.0108803, + "auxiliary_loss_mlp": 0.01027645, + "balance_loss_clip": 1.03277969, + "balance_loss_mlp": 1.01638603, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.7688385677354546, + "language_loss": 0.7244581, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74561489, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 2.5423150062561035 + }, + { + "auxiliary_loss_clip": 0.01068321, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.03817058, + "balance_loss_mlp": 1.01880169, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 2.100592322410095, + "language_loss": 0.8937037, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91469264, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.6414530277252197 + }, + { + "auxiliary_loss_clip": 0.01060328, + "auxiliary_loss_mlp": 0.01038272, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.02586293, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.3859031536830348, + "language_loss": 0.77743989, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79842591, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 2.6692392826080322 + }, + { + "auxiliary_loss_clip": 0.01068016, + "auxiliary_loss_mlp": 0.01026888, + "balance_loss_clip": 1.03410256, + "balance_loss_mlp": 1.01516974, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 2.1380917907933314, + "language_loss": 0.70458281, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72553182, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 4.234712600708008 + }, + { + "auxiliary_loss_clip": 0.01081302, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.03299332, + "balance_loss_mlp": 1.01805139, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 1.6826530309299161, + "language_loss": 0.66732323, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.6884405, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 2.7315852642059326 + }, + { + "auxiliary_loss_clip": 0.01063088, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.03204584, + "balance_loss_mlp": 1.02125156, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 5.086974617302974, + "language_loss": 0.86381578, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88479733, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 4.072376489639282 + }, + { + "auxiliary_loss_clip": 0.01095218, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.03494787, + "balance_loss_mlp": 1.02150345, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.634611202334249, + "language_loss": 0.77040839, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79171264, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 2.5541000366210938 + }, + { + "auxiliary_loss_clip": 0.01089827, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.03495502, + "balance_loss_mlp": 1.02505064, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.53939476655574, + "language_loss": 0.76220679, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78346276, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.5458433628082275 + }, + { + "auxiliary_loss_clip": 0.01089187, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.03473198, + "balance_loss_mlp": 1.02222276, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.7882006538078639, + "language_loss": 0.85293508, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87416071, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.587468385696411 + }, + { + "auxiliary_loss_clip": 0.01019294, + "auxiliary_loss_mlp": 0.00998057, + "balance_loss_clip": 1.00781727, + "balance_loss_mlp": 0.99678725, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.748313180813913, + "language_loss": 0.52576482, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54593831, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 2.922452688217163 + }, + { + "auxiliary_loss_clip": 0.01078922, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.03424811, + "balance_loss_mlp": 1.01895154, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.376115665466353, + "language_loss": 0.74443465, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76552308, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 2.593644618988037 + }, + { + "auxiliary_loss_clip": 0.01102998, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.03543377, + "balance_loss_mlp": 1.02001119, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.563968890143453, + "language_loss": 0.73810244, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.75945675, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 2.652745008468628 + }, + { + "auxiliary_loss_clip": 0.01082234, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.03704834, + "balance_loss_mlp": 1.02048063, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.642966185011087, + "language_loss": 0.61946094, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.64061159, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.5862820148468018 + }, + { + "auxiliary_loss_clip": 0.01047686, + "auxiliary_loss_mlp": 0.01034765, + "balance_loss_clip": 1.02976787, + "balance_loss_mlp": 1.02172971, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 2.0984519273476714, + "language_loss": 0.8777023, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89852679, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 2.6039443016052246 + }, + { + "auxiliary_loss_clip": 0.01077949, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.03249586, + "balance_loss_mlp": 1.02168298, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 2.087925593626408, + "language_loss": 0.73825246, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.75937879, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 2.59673810005188 + }, + { + "auxiliary_loss_clip": 0.01076974, + "auxiliary_loss_mlp": 0.01027399, + "balance_loss_clip": 1.03134775, + "balance_loss_mlp": 1.01384544, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 2.127514784520743, + "language_loss": 0.78316104, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.8042047, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.658963203430176 + }, + { + "auxiliary_loss_clip": 0.0108949, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.03835297, + "balance_loss_mlp": 1.01886916, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.499965962754733, + "language_loss": 0.78425902, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80547082, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 4.139268636703491 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.03477037, + "balance_loss_mlp": 1.01834702, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.3992130909593303, + "language_loss": 0.73318207, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75450945, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.5417675971984863 + }, + { + "auxiliary_loss_clip": 0.01083735, + "auxiliary_loss_mlp": 0.00749703, + "balance_loss_clip": 1.03569627, + "balance_loss_mlp": 1.00036621, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.650914466885943, + "language_loss": 0.83753967, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85587406, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.6538174152374268 + }, + { + "auxiliary_loss_clip": 0.01026007, + "auxiliary_loss_mlp": 0.01002312, + "balance_loss_clip": 1.00505924, + "balance_loss_mlp": 1.00108385, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7268100449888374, + "language_loss": 0.5787577, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59904087, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 3.0886588096618652 + }, + { + "auxiliary_loss_clip": 0.0106025, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.03464961, + "balance_loss_mlp": 1.02130067, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 2.068180211340003, + "language_loss": 0.81783164, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.83877182, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 2.640831232070923 + }, + { + "auxiliary_loss_clip": 0.01087706, + "auxiliary_loss_mlp": 0.01037741, + "balance_loss_clip": 1.03422928, + "balance_loss_mlp": 1.02409208, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.9205771192538865, + "language_loss": 0.81398165, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83523607, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.5787806510925293 + }, + { + "auxiliary_loss_clip": 0.0110257, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.0344851, + "balance_loss_mlp": 1.01673484, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.6265695742059563, + "language_loss": 0.72749031, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.74879849, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.49923038482666 + }, + { + "auxiliary_loss_clip": 0.01079245, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.03757954, + "balance_loss_mlp": 1.02407479, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 2.064833227412734, + "language_loss": 0.71280444, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73395777, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.571901321411133 + }, + { + "auxiliary_loss_clip": 0.01078005, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.03276896, + "balance_loss_mlp": 1.01795137, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.7968340557167384, + "language_loss": 0.76660639, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78768486, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 2.5643928050994873 + }, + { + "auxiliary_loss_clip": 0.01091318, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.0356437, + "balance_loss_mlp": 1.02179849, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.109576602251205, + "language_loss": 0.64154732, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66279393, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.5958542823791504 + }, + { + "auxiliary_loss_clip": 0.01008432, + "auxiliary_loss_mlp": 0.01003763, + "balance_loss_clip": 1.00716496, + "balance_loss_mlp": 1.00270164, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.8924051377111224, + "language_loss": 0.62638617, + "learning_rate": 1.700274261035102e-06, + "loss": 0.6465081, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 3.1509218215942383 + }, + { + "auxiliary_loss_clip": 0.01069279, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.0319891, + "balance_loss_mlp": 1.01795721, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.9957998507000199, + "language_loss": 0.6608693, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.68185842, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 2.722001552581787 + }, + { + "auxiliary_loss_clip": 0.01083848, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.0331713, + "balance_loss_mlp": 1.02283931, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 1.7746159040318223, + "language_loss": 0.70145786, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72265601, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 2.516697883605957 + }, + { + "auxiliary_loss_clip": 0.01056939, + "auxiliary_loss_mlp": 0.01028292, + "balance_loss_clip": 1.03298497, + "balance_loss_mlp": 1.01728904, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.8062929374042707, + "language_loss": 0.77332813, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79418045, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 2.6543850898742676 + }, + { + "auxiliary_loss_clip": 0.01051563, + "auxiliary_loss_mlp": 0.01035489, + "balance_loss_clip": 1.03086519, + "balance_loss_mlp": 1.02259612, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.6298922290993296, + "language_loss": 0.79705024, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81792074, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 2.799217700958252 + }, + { + "auxiliary_loss_clip": 0.01072634, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03282642, + "balance_loss_mlp": 1.01938426, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 4.771197585033499, + "language_loss": 0.75720417, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.77824891, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 4.151091575622559 + }, + { + "auxiliary_loss_clip": 0.01061334, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.03606248, + "balance_loss_mlp": 1.02374315, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.5773514193741593, + "language_loss": 0.68991613, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.71089029, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.7505760192871094 + }, + { + "auxiliary_loss_clip": 0.01104205, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.03557563, + "balance_loss_mlp": 1.01999402, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.2751299837476413, + "language_loss": 0.66165495, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68302017, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.5606689453125 + }, + { + "auxiliary_loss_clip": 0.01081224, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.03568578, + "balance_loss_mlp": 1.01607966, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.810155609932887, + "language_loss": 0.87475014, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89583576, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 2.579509973526001 + }, + { + "auxiliary_loss_clip": 0.01076719, + "auxiliary_loss_mlp": 0.01032933, + "balance_loss_clip": 1.03405547, + "balance_loss_mlp": 1.02061915, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 3.2529311430773484, + "language_loss": 0.58935869, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61045521, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 2.62028169631958 + }, + { + "auxiliary_loss_clip": 0.01096861, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.036919, + "balance_loss_mlp": 1.02081025, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.3574343705681406, + "language_loss": 0.68785763, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.70916957, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 2.5635411739349365 + }, + { + "auxiliary_loss_clip": 0.01057811, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.03408861, + "balance_loss_mlp": 1.01530457, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.7361978274869316, + "language_loss": 0.7917487, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81261444, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 2.7158079147338867 + }, + { + "auxiliary_loss_clip": 0.01046027, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.03149164, + "balance_loss_mlp": 1.02007794, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.66868907675546, + "language_loss": 0.67716324, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69794804, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 2.7956879138946533 + }, + { + "auxiliary_loss_clip": 0.0104853, + "auxiliary_loss_mlp": 0.01034506, + "balance_loss_clip": 1.03310513, + "balance_loss_mlp": 1.02152371, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.1452553168854736, + "language_loss": 0.7881794, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80900967, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.6528868675231934 + }, + { + "auxiliary_loss_clip": 0.0107928, + "auxiliary_loss_mlp": 0.00749512, + "balance_loss_clip": 1.03239548, + "balance_loss_mlp": 1.00030661, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.7267870528307085, + "language_loss": 0.59183985, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.61012775, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.618666410446167 + }, + { + "auxiliary_loss_clip": 0.01087779, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.03364658, + "balance_loss_mlp": 1.01994085, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.2938486786903287, + "language_loss": 0.71803164, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73922384, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 2.572654962539673 + }, + { + "auxiliary_loss_clip": 0.01081558, + "auxiliary_loss_mlp": 0.01027239, + "balance_loss_clip": 1.03475499, + "balance_loss_mlp": 1.0154438, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 2.3109668460267967, + "language_loss": 0.7610243, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.78211224, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 2.5363829135894775 + }, + { + "auxiliary_loss_clip": 0.01073686, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.03526652, + "balance_loss_mlp": 1.02255344, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 1.7715953688201331, + "language_loss": 0.73083192, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.75191486, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 4.081360578536987 + }, + { + "auxiliary_loss_clip": 0.01090404, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.0357399, + "balance_loss_mlp": 1.01703978, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.7787740918266608, + "language_loss": 0.73504788, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75624454, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.593036413192749 + }, + { + "auxiliary_loss_clip": 0.01100934, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.03478885, + "balance_loss_mlp": 1.02108192, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 1.8936991948733917, + "language_loss": 0.83307147, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85441065, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 4.054833650588989 + }, + { + "auxiliary_loss_clip": 0.01090129, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.03372717, + "balance_loss_mlp": 1.0215919, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 2.2235313316157934, + "language_loss": 0.7204771, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74170983, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.570888042449951 + }, + { + "auxiliary_loss_clip": 0.011016, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.03499687, + "balance_loss_mlp": 1.02866709, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 1.6048668182418786, + "language_loss": 0.77533615, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79675674, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.5404598712921143 + }, + { + "auxiliary_loss_clip": 0.01078713, + "auxiliary_loss_mlp": 0.01030169, + "balance_loss_clip": 1.03239417, + "balance_loss_mlp": 1.01892185, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 2.204834848104241, + "language_loss": 0.70507193, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72616076, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.6166651248931885 + }, + { + "auxiliary_loss_clip": 0.0098062, + "auxiliary_loss_mlp": 0.01002888, + "balance_loss_clip": 1.00918233, + "balance_loss_mlp": 1.00178528, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7744373768060981, + "language_loss": 0.55602807, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57586312, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 3.1322574615478516 + }, + { + "auxiliary_loss_clip": 0.01078475, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.03484082, + "balance_loss_mlp": 1.0231638, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.6245328009022448, + "language_loss": 0.82055497, + "learning_rate": 1.691036046141018e-06, + "loss": 0.84168011, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.64208984375 + }, + { + "auxiliary_loss_clip": 0.01067659, + "auxiliary_loss_mlp": 0.00749372, + "balance_loss_clip": 1.03395092, + "balance_loss_mlp": 1.00032413, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.7327255049501822, + "language_loss": 0.74607939, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76424974, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.7705459594726562 + }, + { + "auxiliary_loss_clip": 0.01094733, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.03472137, + "balance_loss_mlp": 1.0187583, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.9199600229037466, + "language_loss": 0.82910037, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85035384, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.617506265640259 + }, + { + "auxiliary_loss_clip": 0.01059981, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.031883, + "balance_loss_mlp": 1.02441597, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.0194588425372264, + "language_loss": 0.65076894, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67172527, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 2.6095356941223145 + }, + { + "auxiliary_loss_clip": 0.01082098, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.03537309, + "balance_loss_mlp": 1.01875448, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 2.6541394647779404, + "language_loss": 0.80901968, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83015549, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.615367889404297 + }, + { + "auxiliary_loss_clip": 0.01102342, + "auxiliary_loss_mlp": 0.01024595, + "balance_loss_clip": 1.03641963, + "balance_loss_mlp": 1.01393223, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4998161419930263, + "language_loss": 0.73086143, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75213081, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.545163631439209 + }, + { + "auxiliary_loss_clip": 0.01008279, + "auxiliary_loss_mlp": 0.01006138, + "balance_loss_clip": 1.00731933, + "balance_loss_mlp": 1.00499964, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6236517828865008, + "language_loss": 0.5352152, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55535942, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 4.770559310913086 + }, + { + "auxiliary_loss_clip": 0.01104227, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.0370872, + "balance_loss_mlp": 1.02420723, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.7689114890894337, + "language_loss": 0.688537, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.70994008, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 2.5337729454040527 + }, + { + "auxiliary_loss_clip": 0.01061964, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02884889, + "balance_loss_mlp": 1.02228081, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.7142495544098921, + "language_loss": 0.75387681, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77484417, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.726661443710327 + }, + { + "auxiliary_loss_clip": 0.01078245, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.03429747, + "balance_loss_mlp": 1.01753807, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 2.4674862410676464, + "language_loss": 0.75473815, + "learning_rate": 1.687573444537108e-06, + "loss": 0.77582407, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.6388182640075684 + }, + { + "auxiliary_loss_clip": 0.01089548, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.03408599, + "balance_loss_mlp": 1.02120256, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 2.458534323209355, + "language_loss": 0.75935781, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78057826, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.5798637866973877 + }, + { + "auxiliary_loss_clip": 0.01073872, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.03360438, + "balance_loss_mlp": 1.02037954, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 1.8939567135187705, + "language_loss": 0.71647704, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73753488, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.593735456466675 + }, + { + "auxiliary_loss_clip": 0.01071182, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.03574336, + "balance_loss_mlp": 1.02181351, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3395390739511717, + "language_loss": 0.82555228, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84661776, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 2.6135077476501465 + }, + { + "auxiliary_loss_clip": 0.01085942, + "auxiliary_loss_mlp": 0.01027493, + "balance_loss_clip": 1.0301156, + "balance_loss_mlp": 1.01603734, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.7087418891510957, + "language_loss": 0.66350484, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68463922, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.569850444793701 + }, + { + "auxiliary_loss_clip": 0.01065837, + "auxiliary_loss_mlp": 0.00749247, + "balance_loss_clip": 1.03415346, + "balance_loss_mlp": 1.0002656, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 2.7614870622652945, + "language_loss": 0.8098827, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.82803357, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.6090307235717773 + }, + { + "auxiliary_loss_clip": 0.01082529, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.03263164, + "balance_loss_mlp": 1.01776767, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.427895315512214, + "language_loss": 0.69231892, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71344733, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 2.7573795318603516 + }, + { + "auxiliary_loss_clip": 0.01055243, + "auxiliary_loss_mlp": 0.01026912, + "balance_loss_clip": 1.03256571, + "balance_loss_mlp": 1.01559877, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 2.1463247760304816, + "language_loss": 0.74680686, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76762843, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 2.6214613914489746 + }, + { + "auxiliary_loss_clip": 0.01107161, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.03418231, + "balance_loss_mlp": 1.02191019, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.9315290081549232, + "language_loss": 0.82438505, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84580445, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.5035367012023926 + }, + { + "auxiliary_loss_clip": 0.01077764, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.03202772, + "balance_loss_mlp": 1.0189476, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.2939954211603353, + "language_loss": 0.71649468, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73758495, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 2.624042272567749 + }, + { + "auxiliary_loss_clip": 0.01066961, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.03672767, + "balance_loss_mlp": 1.01939535, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 2.105869311000594, + "language_loss": 0.74430096, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76529419, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.6413090229034424 + }, + { + "auxiliary_loss_clip": 0.01045589, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.02836299, + "balance_loss_mlp": 1.02054501, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 1.8500152544875008, + "language_loss": 0.72401273, + "learning_rate": 1.683342680176499e-06, + "loss": 0.7447952, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.620781421661377 + }, + { + "auxiliary_loss_clip": 0.01026661, + "auxiliary_loss_mlp": 0.01004939, + "balance_loss_clip": 1.00586605, + "balance_loss_mlp": 1.00367498, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7942576018861548, + "language_loss": 0.54387444, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56419045, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.2035269737243652 + }, + { + "auxiliary_loss_clip": 0.01090223, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.03443384, + "balance_loss_mlp": 1.0149895, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 2.6955386705275335, + "language_loss": 0.70571744, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.72689283, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 4.067910194396973 + }, + { + "auxiliary_loss_clip": 0.01082893, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.03513312, + "balance_loss_mlp": 1.02002454, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 1.8549072731241227, + "language_loss": 0.75983036, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78098512, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 2.6286861896514893 + }, + { + "auxiliary_loss_clip": 0.01085497, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.03120899, + "balance_loss_mlp": 1.02063787, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 1.9546070586403026, + "language_loss": 0.82071263, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84189624, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 2.543862819671631 + }, + { + "auxiliary_loss_clip": 0.010934, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.03781962, + "balance_loss_mlp": 1.02270484, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.079548633837966, + "language_loss": 0.70300835, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72429931, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.559129476547241 + }, + { + "auxiliary_loss_clip": 0.01093251, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.03429973, + "balance_loss_mlp": 1.01935661, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.8992523167454136, + "language_loss": 0.74658602, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76783049, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 2.882539987564087 + }, + { + "auxiliary_loss_clip": 0.01089291, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.03485417, + "balance_loss_mlp": 1.01940346, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.70513744626713, + "language_loss": 0.82299376, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84418821, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 2.567011594772339 + }, + { + "auxiliary_loss_clip": 0.01065643, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.03205776, + "balance_loss_mlp": 1.02068377, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 2.284457534550137, + "language_loss": 0.643444, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66443741, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.6114325523376465 + }, + { + "auxiliary_loss_clip": 0.01079073, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.03489661, + "balance_loss_mlp": 1.01702762, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.6734881368653787, + "language_loss": 0.9213112, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.9423784, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.5710835456848145 + }, + { + "auxiliary_loss_clip": 0.01100404, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.03728676, + "balance_loss_mlp": 1.02061009, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 2.582125715390414, + "language_loss": 0.6075325, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62887824, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 2.584731101989746 + }, + { + "auxiliary_loss_clip": 0.01051731, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.02972853, + "balance_loss_mlp": 1.01426125, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 1.9823786448278287, + "language_loss": 0.81282061, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83361316, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 2.64412784576416 + }, + { + "auxiliary_loss_clip": 0.01082334, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03590059, + "balance_loss_mlp": 1.01925349, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 2.1251828156704895, + "language_loss": 0.87356126, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.894696, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.5959527492523193 + }, + { + "auxiliary_loss_clip": 0.01089899, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.03577685, + "balance_loss_mlp": 1.019593, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 2.002729939028408, + "language_loss": 0.84639364, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.86760628, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 2.561884641647339 + }, + { + "auxiliary_loss_clip": 0.01014546, + "auxiliary_loss_mlp": 0.01004645, + "balance_loss_clip": 1.00406027, + "balance_loss_mlp": 1.00354195, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.7969465835574611, + "language_loss": 0.58305675, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60324866, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 4.65715217590332 + }, + { + "auxiliary_loss_clip": 0.01084236, + "auxiliary_loss_mlp": 0.0102876, + "balance_loss_clip": 1.03517318, + "balance_loss_mlp": 1.01674366, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.116632097580485, + "language_loss": 0.69822931, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.71935928, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 4.204939126968384 + }, + { + "auxiliary_loss_clip": 0.0107123, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.03497946, + "balance_loss_mlp": 1.01712811, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.8608189346207267, + "language_loss": 0.66953957, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69054043, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.640573740005493 + }, + { + "auxiliary_loss_clip": 0.01005814, + "auxiliary_loss_mlp": 0.00999656, + "balance_loss_clip": 1.00525522, + "balance_loss_mlp": 0.99845821, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.9380798475066127, + "language_loss": 0.58098525, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60103995, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 3.154306411743164 + }, + { + "auxiliary_loss_clip": 0.0105654, + "auxiliary_loss_mlp": 0.01037657, + "balance_loss_clip": 1.02952075, + "balance_loss_mlp": 1.02370918, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.7447675287962507, + "language_loss": 0.7337997, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75474167, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.7104272842407227 + }, + { + "auxiliary_loss_clip": 0.01076084, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.03533697, + "balance_loss_mlp": 1.01989532, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 2.9259590105785915, + "language_loss": 0.6106078, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63170075, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 2.5974907875061035 + }, + { + "auxiliary_loss_clip": 0.01060178, + "auxiliary_loss_mlp": 0.01027083, + "balance_loss_clip": 1.03082442, + "balance_loss_mlp": 1.01532316, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8281573366765302, + "language_loss": 0.8103075, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83118016, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.588200092315674 + }, + { + "auxiliary_loss_clip": 0.01040247, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.02622366, + "balance_loss_mlp": 1.02405882, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.6506508392690475, + "language_loss": 0.77762485, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.79839903, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 2.6574349403381348 + }, + { + "auxiliary_loss_clip": 0.01043455, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.02936578, + "balance_loss_mlp": 1.01802897, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.581522974505008, + "language_loss": 0.68886685, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.70961797, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.6459949016571045 + }, + { + "auxiliary_loss_clip": 0.01065092, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.03268957, + "balance_loss_mlp": 1.02345228, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 2.091174342414684, + "language_loss": 0.66756988, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.68856382, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 2.53144907951355 + }, + { + "auxiliary_loss_clip": 0.01070188, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.03327298, + "balance_loss_mlp": 1.01992702, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.953263381020001, + "language_loss": 0.74515247, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76617026, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 2.633045196533203 + }, + { + "auxiliary_loss_clip": 0.0104087, + "auxiliary_loss_mlp": 0.01037242, + "balance_loss_clip": 1.03224599, + "balance_loss_mlp": 1.02285349, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 2.071838039965678, + "language_loss": 0.7984401, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81922126, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 2.650391101837158 + }, + { + "auxiliary_loss_clip": 0.01059493, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.03116584, + "balance_loss_mlp": 1.02717638, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.6892264604046339, + "language_loss": 0.7102735, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73126388, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 4.244629144668579 + }, + { + "auxiliary_loss_clip": 0.01042774, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.03255808, + "balance_loss_mlp": 1.02026868, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 1.8844124540741094, + "language_loss": 0.81406176, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83481288, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 2.6222755908966064 + }, + { + "auxiliary_loss_clip": 0.01055377, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.03038931, + "balance_loss_mlp": 1.01625657, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.706444417315091, + "language_loss": 0.78225815, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80309069, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.5995960235595703 + }, + { + "auxiliary_loss_clip": 0.01103882, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.03557086, + "balance_loss_mlp": 1.02238119, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 4.56975024260731, + "language_loss": 0.82879889, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85017818, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.4851996898651123 + }, + { + "auxiliary_loss_clip": 0.01094553, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.03385842, + "balance_loss_mlp": 1.02008975, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.866375273739463, + "language_loss": 0.67101246, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69228899, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 2.514916181564331 + }, + { + "auxiliary_loss_clip": 0.01086611, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.03303182, + "balance_loss_mlp": 1.01762497, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.5178129737880983, + "language_loss": 0.58164155, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60278171, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.603665828704834 + }, + { + "auxiliary_loss_clip": 0.010169, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.02605796, + "balance_loss_mlp": 1.02617884, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.6634703474737633, + "language_loss": 0.68940252, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.70996863, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 2.7076621055603027 + }, + { + "auxiliary_loss_clip": 0.01032106, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.03544593, + "balance_loss_mlp": 1.0170449, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 1.5259076216041851, + "language_loss": 0.77919543, + "learning_rate": 1.670659182280247e-06, + "loss": 0.79979408, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 2.7847063541412354 + }, + { + "auxiliary_loss_clip": 0.01007827, + "auxiliary_loss_mlp": 0.00999568, + "balance_loss_clip": 1.00716674, + "balance_loss_mlp": 0.99844706, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.7027760437955437, + "language_loss": 0.49203607, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51210999, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.313115119934082 + }, + { + "auxiliary_loss_clip": 0.01093342, + "auxiliary_loss_mlp": 0.00749298, + "balance_loss_clip": 1.03643656, + "balance_loss_mlp": 1.0003264, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 3.4811966033705612, + "language_loss": 0.62732995, + "learning_rate": 1.6698909172706e-06, + "loss": 0.64575636, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 2.657496929168701 + }, + { + "auxiliary_loss_clip": 0.01081361, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.01907563, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.7560717887153605, + "language_loss": 0.69274843, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71386862, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.6110103130340576 + }, + { + "auxiliary_loss_clip": 0.01087301, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.03182435, + "balance_loss_mlp": 1.02096319, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.7944968520614164, + "language_loss": 0.64527261, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66648585, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.575211524963379 + }, + { + "auxiliary_loss_clip": 0.00966245, + "auxiliary_loss_mlp": 0.0101178, + "balance_loss_clip": 1.01116228, + "balance_loss_mlp": 1.01057649, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7523026477652953, + "language_loss": 0.59760308, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61738336, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.4736526012420654 + }, + { + "auxiliary_loss_clip": 0.01079227, + "auxiliary_loss_mlp": 0.00749042, + "balance_loss_clip": 1.03274643, + "balance_loss_mlp": 1.00030649, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.5405423558357885, + "language_loss": 0.74475384, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76303649, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 2.910216808319092 + }, + { + "auxiliary_loss_clip": 0.01069021, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.03327227, + "balance_loss_mlp": 1.02242446, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.9819900294713901, + "language_loss": 0.7307092, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.75174415, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.6118838787078857 + }, + { + "auxiliary_loss_clip": 0.01088095, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.0352962, + "balance_loss_mlp": 1.022434, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.6756773558606513, + "language_loss": 0.81802446, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83923101, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 2.594142436981201 + }, + { + "auxiliary_loss_clip": 0.01072041, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.03250849, + "balance_loss_mlp": 1.02390957, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.68058883462905, + "language_loss": 0.80726647, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82834435, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 4.364210367202759 + }, + { + "auxiliary_loss_clip": 0.0110557, + "auxiliary_loss_mlp": 0.0074936, + "balance_loss_clip": 1.03563666, + "balance_loss_mlp": 1.00028038, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 2.450415548226917, + "language_loss": 0.788625, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80717432, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.6056149005889893 + }, + { + "auxiliary_loss_clip": 0.01075524, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.03574991, + "balance_loss_mlp": 1.02120769, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 2.084939510764174, + "language_loss": 0.58828521, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.60936236, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.6162285804748535 + }, + { + "auxiliary_loss_clip": 0.01093721, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.03465033, + "balance_loss_mlp": 1.01745462, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 1.8560040362682162, + "language_loss": 0.81840861, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83963549, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 2.638655424118042 + }, + { + "auxiliary_loss_clip": 0.01101504, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.03674245, + "balance_loss_mlp": 1.02323151, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.9236570052491357, + "language_loss": 0.86200833, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88337183, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 2.6188156604766846 + }, + { + "auxiliary_loss_clip": 0.0107865, + "auxiliary_loss_mlp": 0.01036331, + "balance_loss_clip": 1.03443182, + "balance_loss_mlp": 1.02402878, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.147151913319104, + "language_loss": 0.73557329, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75672317, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.6138784885406494 + }, + { + "auxiliary_loss_clip": 0.01084232, + "auxiliary_loss_mlp": 0.00749356, + "balance_loss_clip": 1.03479779, + "balance_loss_mlp": 1.00026405, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.6985951098851433, + "language_loss": 0.75331038, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77164626, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.612510919570923 + }, + { + "auxiliary_loss_clip": 0.01103648, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.03553247, + "balance_loss_mlp": 1.01913214, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 2.4928877890855374, + "language_loss": 0.72712696, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74847353, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 2.56186580657959 + }, + { + "auxiliary_loss_clip": 0.01048784, + "auxiliary_loss_mlp": 0.01027138, + "balance_loss_clip": 1.03084171, + "balance_loss_mlp": 1.01680231, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.7810872869070007, + "language_loss": 0.7377404, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75849962, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 2.630220651626587 + }, + { + "auxiliary_loss_clip": 0.01042699, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.02871513, + "balance_loss_mlp": 1.02120423, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.5895227019535072, + "language_loss": 0.78278387, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80353576, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 2.6356098651885986 + }, + { + "auxiliary_loss_clip": 0.01106512, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.03604662, + "balance_loss_mlp": 1.01860225, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 3.2362052564570996, + "language_loss": 0.63449341, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65588367, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.4832212924957275 + }, + { + "auxiliary_loss_clip": 0.01088064, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.03282082, + "balance_loss_mlp": 1.01706362, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 2.256468102147688, + "language_loss": 0.6654979, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68666011, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 2.62957763671875 + }, + { + "auxiliary_loss_clip": 0.01074359, + "auxiliary_loss_mlp": 0.00749226, + "balance_loss_clip": 1.03175426, + "balance_loss_mlp": 1.00026882, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.4485960380273948, + "language_loss": 0.71527869, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73351455, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 4.069199085235596 + }, + { + "auxiliary_loss_clip": 0.01103003, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.03464949, + "balance_loss_mlp": 1.01896441, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.6498823315721718, + "language_loss": 0.74147916, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76281381, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 4.123257160186768 + }, + { + "auxiliary_loss_clip": 0.01098482, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.04024398, + "balance_loss_mlp": 1.02382123, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 1.7853090985469127, + "language_loss": 0.60700595, + "learning_rate": 1.661827179985277e-06, + "loss": 0.62835467, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 2.583641529083252 + }, + { + "auxiliary_loss_clip": 0.01077146, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.03022778, + "balance_loss_mlp": 1.01736403, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.6603794929492184, + "language_loss": 0.75070739, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77176332, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.676584243774414 + }, + { + "auxiliary_loss_clip": 0.010787, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.0357641, + "balance_loss_mlp": 1.01664197, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 2.120565329462271, + "language_loss": 0.83668685, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.8577702, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.5549027919769287 + }, + { + "auxiliary_loss_clip": 0.01069799, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.03204942, + "balance_loss_mlp": 1.02399206, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 3.397797623779028, + "language_loss": 0.75705516, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77812123, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.608220100402832 + }, + { + "auxiliary_loss_clip": 0.01046746, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.02976787, + "balance_loss_mlp": 1.02018571, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 1.9492057281790989, + "language_loss": 0.83205187, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85283339, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 2.6612942218780518 + }, + { + "auxiliary_loss_clip": 0.01066244, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.0347569, + "balance_loss_mlp": 1.02008104, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 1.8651289910944484, + "language_loss": 0.74464917, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.7656247, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 2.654658794403076 + }, + { + "auxiliary_loss_clip": 0.01074613, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.03466606, + "balance_loss_mlp": 1.01992536, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 1.9790269624950763, + "language_loss": 0.77328479, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79434568, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 2.594877243041992 + }, + { + "auxiliary_loss_clip": 0.0106507, + "auxiliary_loss_mlp": 0.01037816, + "balance_loss_clip": 1.03501284, + "balance_loss_mlp": 1.02572846, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.6383015831380519, + "language_loss": 0.8091718, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83020067, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.676361322402954 + }, + { + "auxiliary_loss_clip": 0.01099026, + "auxiliary_loss_mlp": 0.01026679, + "balance_loss_clip": 1.03145599, + "balance_loss_mlp": 1.01443076, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.2624599777449712, + "language_loss": 0.70998251, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73123956, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.608652353286743 + }, + { + "auxiliary_loss_clip": 0.01064205, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.03113103, + "balance_loss_mlp": 1.0178194, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.986452398543848, + "language_loss": 0.73563123, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75657338, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 4.2519025802612305 + }, + { + "auxiliary_loss_clip": 0.01077268, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.03190517, + "balance_loss_mlp": 1.01901865, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 1.6800838524602122, + "language_loss": 0.7496748, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77075338, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 2.6373250484466553 + }, + { + "auxiliary_loss_clip": 0.0105938, + "auxiliary_loss_mlp": 0.01042394, + "balance_loss_clip": 1.03475094, + "balance_loss_mlp": 1.0280056, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.143319400490857, + "language_loss": 0.75987738, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78089511, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.7540199756622314 + }, + { + "auxiliary_loss_clip": 0.01071737, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.0302031, + "balance_loss_mlp": 1.02540195, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.5955208257905609, + "language_loss": 0.74518907, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76629215, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 2.6883327960968018 + }, + { + "auxiliary_loss_clip": 0.01082074, + "auxiliary_loss_mlp": 0.01036112, + "balance_loss_clip": 1.0342505, + "balance_loss_mlp": 1.02410209, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 1.7601081330041288, + "language_loss": 0.66719985, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.68838167, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.6234660148620605 + }, + { + "auxiliary_loss_clip": 0.01074253, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.0305624, + "balance_loss_mlp": 1.02243412, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 1.8399514206277516, + "language_loss": 0.72054017, + "learning_rate": 1.656454488573026e-06, + "loss": 0.74164426, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 2.605111837387085 + }, + { + "auxiliary_loss_clip": 0.01056013, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.03032899, + "balance_loss_mlp": 1.01553428, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.4644394066668163, + "language_loss": 0.70291156, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72374183, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 2.656402587890625 + }, + { + "auxiliary_loss_clip": 0.01058839, + "auxiliary_loss_mlp": 0.00749108, + "balance_loss_clip": 1.03342223, + "balance_loss_mlp": 1.00029874, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 1.5899700313629546, + "language_loss": 0.69315428, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71123374, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 2.7001144886016846 + }, + { + "auxiliary_loss_clip": 0.01078173, + "auxiliary_loss_mlp": 0.01028513, + "balance_loss_clip": 1.03220248, + "balance_loss_mlp": 1.01803458, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 2.2947151936408954, + "language_loss": 0.60847878, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62954569, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.594479560852051 + }, + { + "auxiliary_loss_clip": 0.01061687, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.03548539, + "balance_loss_mlp": 1.01894248, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 1.774172163829918, + "language_loss": 0.73015887, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75108504, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 2.762906551361084 + }, + { + "auxiliary_loss_clip": 0.01080379, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.03367805, + "balance_loss_mlp": 1.0258317, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.7234371826567254, + "language_loss": 0.77080518, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79197675, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.610773801803589 + }, + { + "auxiliary_loss_clip": 0.01090799, + "auxiliary_loss_mlp": 0.0103549, + "balance_loss_clip": 1.03346241, + "balance_loss_mlp": 1.02270436, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.9311612547343242, + "language_loss": 0.66041631, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68167919, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 2.768611192703247 + }, + { + "auxiliary_loss_clip": 0.01093329, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.03412771, + "balance_loss_mlp": 1.0168066, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.2909543189905803, + "language_loss": 0.67797923, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.69920319, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.554943799972534 + }, + { + "auxiliary_loss_clip": 0.0107691, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.03659534, + "balance_loss_mlp": 1.0153389, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 2.3295527240918537, + "language_loss": 0.77055514, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79160428, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.6590425968170166 + }, + { + "auxiliary_loss_clip": 0.01049435, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.03492475, + "balance_loss_mlp": 1.01992714, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.8760586299800424, + "language_loss": 0.71919131, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74000847, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 2.770714044570923 + }, + { + "auxiliary_loss_clip": 0.0109277, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.03481436, + "balance_loss_mlp": 1.01947379, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 1.8701599224304652, + "language_loss": 0.73197633, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75322044, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.5359203815460205 + }, + { + "auxiliary_loss_clip": 0.01089314, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.03364468, + "balance_loss_mlp": 1.01928365, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.9985375572639905, + "language_loss": 0.73004127, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75123209, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 2.6775999069213867 + }, + { + "auxiliary_loss_clip": 0.01089959, + "auxiliary_loss_mlp": 0.01030549, + "balance_loss_clip": 1.03344166, + "balance_loss_mlp": 1.01910484, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 2.8074342592966968, + "language_loss": 0.73931849, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76052356, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 4.016689300537109 + }, + { + "auxiliary_loss_clip": 0.01095255, + "auxiliary_loss_mlp": 0.00749397, + "balance_loss_clip": 1.03610492, + "balance_loss_mlp": 1.00029707, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 2.33735625255139, + "language_loss": 0.83945119, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.85789776, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.5344502925872803 + }, + { + "auxiliary_loss_clip": 0.01074539, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.02997017, + "balance_loss_mlp": 1.01726532, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 2.0363406619666686, + "language_loss": 0.71909148, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74012381, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 2.6680045127868652 + }, + { + "auxiliary_loss_clip": 0.01000169, + "auxiliary_loss_mlp": 0.01009418, + "balance_loss_clip": 1.00834405, + "balance_loss_mlp": 1.00847578, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.71371036272415, + "language_loss": 0.55355501, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57365084, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 3.218031883239746 + }, + { + "auxiliary_loss_clip": 0.01084377, + "auxiliary_loss_mlp": 0.0103196, + "balance_loss_clip": 1.03187943, + "balance_loss_mlp": 1.01863205, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 1.9540589715758911, + "language_loss": 0.63961446, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.66077787, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.5223653316497803 + }, + { + "auxiliary_loss_clip": 0.01050426, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.03132629, + "balance_loss_mlp": 1.01920426, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 1.7754890974760003, + "language_loss": 0.79000056, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81083357, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 2.6373114585876465 + }, + { + "auxiliary_loss_clip": 0.01064737, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.03319287, + "balance_loss_mlp": 1.02688444, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.2649882953758707, + "language_loss": 0.69278538, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.7138468, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 2.6343271732330322 + }, + { + "auxiliary_loss_clip": 0.01077351, + "auxiliary_loss_mlp": 0.01034026, + "balance_loss_clip": 1.03493309, + "balance_loss_mlp": 1.02149105, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.6320525904547714, + "language_loss": 0.7500686, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.77118236, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 2.628300666809082 + }, + { + "auxiliary_loss_clip": 0.01060452, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.03507364, + "balance_loss_mlp": 1.02317119, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.7375578861854477, + "language_loss": 0.5752213, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59618771, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 2.7455525398254395 + }, + { + "auxiliary_loss_clip": 0.01063521, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.03210771, + "balance_loss_mlp": 1.0193373, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 2.1716313241220737, + "language_loss": 0.74303323, + "learning_rate": 1.648400251450638e-06, + "loss": 0.76398206, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.6064257621765137 + }, + { + "auxiliary_loss_clip": 0.00998215, + "auxiliary_loss_mlp": 0.01001597, + "balance_loss_clip": 1.00667799, + "balance_loss_mlp": 0.99999917, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6672430176830959, + "language_loss": 0.57573032, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59572852, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.2492029666900635 + }, + { + "auxiliary_loss_clip": 0.01088936, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.03448224, + "balance_loss_mlp": 1.02133799, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.7794932567221957, + "language_loss": 0.539855, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.56108087, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 4.1228554248809814 + }, + { + "auxiliary_loss_clip": 0.01104654, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.03513348, + "balance_loss_mlp": 1.02280402, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.6805460773121144, + "language_loss": 0.79393286, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81532925, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 2.5298314094543457 + }, + { + "auxiliary_loss_clip": 0.01083282, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.03698003, + "balance_loss_mlp": 1.02132106, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 1.874946483000667, + "language_loss": 0.66569722, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68686199, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 4.1267688274383545 + }, + { + "auxiliary_loss_clip": 0.01069302, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.03276312, + "balance_loss_mlp": 1.01777637, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.920099879039549, + "language_loss": 0.71146572, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73246402, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.5971319675445557 + }, + { + "auxiliary_loss_clip": 0.01069785, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.03443694, + "balance_loss_mlp": 1.0184679, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.5597867113454027, + "language_loss": 0.69435155, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71534675, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 2.7271718978881836 + }, + { + "auxiliary_loss_clip": 0.01060135, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.03159952, + "balance_loss_mlp": 1.0166688, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4595528986769175, + "language_loss": 0.71128839, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73216844, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.6689529418945312 + }, + { + "auxiliary_loss_clip": 0.01071938, + "auxiliary_loss_mlp": 0.00749329, + "balance_loss_clip": 1.03424144, + "balance_loss_mlp": 1.00033045, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.0420213126701467, + "language_loss": 0.71996486, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.7381776, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 2.7209744453430176 + }, + { + "auxiliary_loss_clip": 0.01092988, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.03467894, + "balance_loss_mlp": 1.0187794, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6821031272268827, + "language_loss": 0.78158849, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80282772, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 2.5907483100891113 + }, + { + "auxiliary_loss_clip": 0.01081205, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.0353291, + "balance_loss_mlp": 1.01593649, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 2.0407206813944514, + "language_loss": 0.77776879, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.79885602, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 2.622741937637329 + }, + { + "auxiliary_loss_clip": 0.01081151, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.03548336, + "balance_loss_mlp": 1.02502036, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 1.663922984985799, + "language_loss": 0.81339806, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83456928, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.613783836364746 + }, + { + "auxiliary_loss_clip": 0.0110415, + "auxiliary_loss_mlp": 0.00749506, + "balance_loss_clip": 1.03550899, + "balance_loss_mlp": 1.00033069, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 2.05087237092968, + "language_loss": 0.60838127, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62691784, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.6208810806274414 + }, + { + "auxiliary_loss_clip": 0.01090851, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.03440762, + "balance_loss_mlp": 1.01839685, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 2.0263401469472107, + "language_loss": 0.65294439, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67415917, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.6420552730560303 + }, + { + "auxiliary_loss_clip": 0.01013871, + "auxiliary_loss_mlp": 0.01001775, + "balance_loss_clip": 1.01150477, + "balance_loss_mlp": 1.00033247, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6624670181355836, + "language_loss": 0.47965968, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.49981618, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 4.727824449539185 + }, + { + "auxiliary_loss_clip": 0.01067413, + "auxiliary_loss_mlp": 0.00749477, + "balance_loss_clip": 1.03354466, + "balance_loss_mlp": 1.00035429, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.6683602150717682, + "language_loss": 0.85993242, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.87810135, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.6317903995513916 + }, + { + "auxiliary_loss_clip": 0.01067266, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03341174, + "balance_loss_mlp": 1.02104616, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.5399864027919699, + "language_loss": 0.78965545, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81066179, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 2.6713578701019287 + }, + { + "auxiliary_loss_clip": 0.01082292, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.03529298, + "balance_loss_mlp": 1.01767266, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.9089873456815247, + "language_loss": 0.6969611, + "learning_rate": 1.641884454927604e-06, + "loss": 0.71806842, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 2.5938334465026855 + }, + { + "auxiliary_loss_clip": 0.01066332, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.03150392, + "balance_loss_mlp": 1.01829875, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.669054914165204, + "language_loss": 0.76622868, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.7871899, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.5964019298553467 + }, + { + "auxiliary_loss_clip": 0.00997889, + "auxiliary_loss_mlp": 0.00746764, + "balance_loss_clip": 1.00717282, + "balance_loss_mlp": 0.99999803, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7926539999002973, + "language_loss": 0.5741303, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59157681, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 3.127810001373291 + }, + { + "auxiliary_loss_clip": 0.01078004, + "auxiliary_loss_mlp": 0.00749514, + "balance_loss_clip": 1.03526747, + "balance_loss_mlp": 1.00037241, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 2.2935005236349335, + "language_loss": 0.72016764, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73844284, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.5789554119110107 + }, + { + "auxiliary_loss_clip": 0.01105666, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.03583479, + "balance_loss_mlp": 1.01920938, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.6524476127698429, + "language_loss": 0.78110325, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80246758, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 2.5069618225097656 + }, + { + "auxiliary_loss_clip": 0.01107899, + "auxiliary_loss_mlp": 0.01030503, + "balance_loss_clip": 1.03647089, + "balance_loss_mlp": 1.01775956, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 3.1112461569209255, + "language_loss": 0.80431497, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82569909, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.5440475940704346 + }, + { + "auxiliary_loss_clip": 0.0105885, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.03395474, + "balance_loss_mlp": 1.02910829, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 2.5049446488012164, + "language_loss": 0.66436964, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68539214, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 2.653527021408081 + }, + { + "auxiliary_loss_clip": 0.01108133, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.03715754, + "balance_loss_mlp": 1.0217545, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.110674587548138, + "language_loss": 0.69458967, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71601361, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.521637439727783 + }, + { + "auxiliary_loss_clip": 0.01090873, + "auxiliary_loss_mlp": 0.0074958, + "balance_loss_clip": 1.03674936, + "balance_loss_mlp": 1.00043523, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 2.087467357827256, + "language_loss": 0.81541353, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83381808, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.5977907180786133 + }, + { + "auxiliary_loss_clip": 0.01104193, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.02122188, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 2.6370922966960895, + "language_loss": 0.66220123, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68358749, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.4968180656433105 + }, + { + "auxiliary_loss_clip": 0.01070719, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.03351498, + "balance_loss_mlp": 1.02095485, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.6339615196426713, + "language_loss": 0.71929133, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.74033237, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 2.5532476902008057 + }, + { + "auxiliary_loss_clip": 0.01076998, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.03360438, + "balance_loss_mlp": 1.0245378, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.9306249369822435, + "language_loss": 0.76288402, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78402531, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 2.699972152709961 + }, + { + "auxiliary_loss_clip": 0.01084687, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.03460026, + "balance_loss_mlp": 1.02259755, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.602505563702448, + "language_loss": 0.75024116, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77143216, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.587799310684204 + }, + { + "auxiliary_loss_clip": 0.01068744, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.03308344, + "balance_loss_mlp": 1.0147326, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 1.5333843579281559, + "language_loss": 0.82249564, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84344637, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.587643623352051 + }, + { + "auxiliary_loss_clip": 0.01068378, + "auxiliary_loss_mlp": 0.01027629, + "balance_loss_clip": 1.03370643, + "balance_loss_mlp": 1.01631606, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.279910350734444, + "language_loss": 0.86123824, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88219833, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 4.10622239112854 + }, + { + "auxiliary_loss_clip": 0.01057154, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.03228259, + "balance_loss_mlp": 1.0145781, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.7651325047185766, + "language_loss": 0.75499648, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77583855, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 2.6401267051696777 + }, + { + "auxiliary_loss_clip": 0.01101373, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.03389597, + "balance_loss_mlp": 1.02335119, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 4.76693622956767, + "language_loss": 0.81892914, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84028733, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 2.510479688644409 + }, + { + "auxiliary_loss_clip": 0.01060895, + "auxiliary_loss_mlp": 0.00749584, + "balance_loss_clip": 1.0306983, + "balance_loss_mlp": 1.00039244, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6238055411986771, + "language_loss": 0.77193642, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79004121, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 2.5905916690826416 + }, + { + "auxiliary_loss_clip": 0.0108177, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.03312492, + "balance_loss_mlp": 1.02386689, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.5061047499215288, + "language_loss": 0.68599379, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70718306, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 2.609391450881958 + }, + { + "auxiliary_loss_clip": 0.0110466, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.03472662, + "balance_loss_mlp": 1.01936722, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.5524463163366176, + "language_loss": 0.79695296, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81831729, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 2.4919803142547607 + }, + { + "auxiliary_loss_clip": 0.01089215, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.03496099, + "balance_loss_mlp": 1.01813841, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 2.532035159423126, + "language_loss": 0.720209, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74140304, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.587480068206787 + }, + { + "auxiliary_loss_clip": 0.01080854, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.0338614, + "balance_loss_mlp": 1.01761854, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.955715788083434, + "language_loss": 0.69598508, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71708012, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 2.652070999145508 + }, + { + "auxiliary_loss_clip": 0.01080655, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.03383732, + "balance_loss_mlp": 1.02307951, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 2.5660043199472, + "language_loss": 0.6134814, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63463235, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.541795253753662 + }, + { + "auxiliary_loss_clip": 0.01080418, + "auxiliary_loss_mlp": 0.01026189, + "balance_loss_clip": 1.0344739, + "balance_loss_mlp": 1.01511455, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.310507461721018, + "language_loss": 0.75812781, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.77919388, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 2.5440447330474854 + }, + { + "auxiliary_loss_clip": 0.01016045, + "auxiliary_loss_mlp": 0.010003, + "balance_loss_clip": 1.00456309, + "balance_loss_mlp": 0.99912608, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8965243370336534, + "language_loss": 0.668616, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68877947, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.0634377002716064 + }, + { + "auxiliary_loss_clip": 0.01098553, + "auxiliary_loss_mlp": 0.01040549, + "balance_loss_clip": 1.03729236, + "balance_loss_mlp": 1.02809119, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.0385473365377775, + "language_loss": 0.81433117, + "learning_rate": 1.63230955093099e-06, + "loss": 0.83572221, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.605294942855835 + }, + { + "auxiliary_loss_clip": 0.01079619, + "auxiliary_loss_mlp": 0.0103127, + "balance_loss_clip": 1.03090835, + "balance_loss_mlp": 1.01908052, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.5868360380327, + "language_loss": 0.86157191, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88268083, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 4.1307313442230225 + }, + { + "auxiliary_loss_clip": 0.01071545, + "auxiliary_loss_mlp": 0.01029446, + "balance_loss_clip": 1.03324986, + "balance_loss_mlp": 1.01716733, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.7185290371661257, + "language_loss": 0.87568772, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89669764, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 4.047940492630005 + }, + { + "auxiliary_loss_clip": 0.01053431, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.03145468, + "balance_loss_mlp": 1.01781416, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.9382005748807838, + "language_loss": 0.85338891, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87422681, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.703164577484131 + }, + { + "auxiliary_loss_clip": 0.01088026, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.03357399, + "balance_loss_mlp": 1.02090883, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 2.2725180379862424, + "language_loss": 0.7898159, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.81101555, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 2.5483028888702393 + }, + { + "auxiliary_loss_clip": 0.01100847, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.03439116, + "balance_loss_mlp": 1.0197674, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.5237485559534836, + "language_loss": 0.82664037, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.84795541, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 2.5407047271728516 + }, + { + "auxiliary_loss_clip": 0.01084027, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.03548455, + "balance_loss_mlp": 1.02842605, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.8232782125192197, + "language_loss": 0.72289348, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74413919, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 2.5721161365509033 + }, + { + "auxiliary_loss_clip": 0.01101028, + "auxiliary_loss_mlp": 0.0074945, + "balance_loss_clip": 1.03337109, + "balance_loss_mlp": 1.00047994, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.5481490018349797, + "language_loss": 0.77978486, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.79828966, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.575474739074707 + }, + { + "auxiliary_loss_clip": 0.01075891, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.03450155, + "balance_loss_mlp": 1.02205443, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.5453036266876807, + "language_loss": 0.7155304, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73661506, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 2.5417544841766357 + }, + { + "auxiliary_loss_clip": 0.010778, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.03256273, + "balance_loss_mlp": 1.01672757, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 2.342851249525501, + "language_loss": 0.69936204, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72041547, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.5812699794769287 + }, + { + "auxiliary_loss_clip": 0.01091018, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.0341444, + "balance_loss_mlp": 1.01868141, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.5222166531195465, + "language_loss": 0.65486705, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67607641, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 2.6195554733276367 + }, + { + "auxiliary_loss_clip": 0.01076334, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 1.02170813, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.8939904968607004, + "language_loss": 0.72851956, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74961066, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 2.5608034133911133 + }, + { + "auxiliary_loss_clip": 0.01086354, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.03218055, + "balance_loss_mlp": 1.0247457, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.6752012721116263, + "language_loss": 0.80203521, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82325888, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 4.119612455368042 + }, + { + "auxiliary_loss_clip": 0.01087595, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.03221464, + "balance_loss_mlp": 1.02280211, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 2.161425361699987, + "language_loss": 0.72274315, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74396634, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 2.5662450790405273 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.0103711, + "balance_loss_clip": 1.03435588, + "balance_loss_mlp": 1.02534366, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 2.1798580217160537, + "language_loss": 0.85888284, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88026816, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 2.4301021099090576 + }, + { + "auxiliary_loss_clip": 0.01006978, + "auxiliary_loss_mlp": 0.00998001, + "balance_loss_clip": 1.00593781, + "balance_loss_mlp": 0.99689257, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7663385746271087, + "language_loss": 0.56152451, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58157432, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 2.97572922706604 + }, + { + "auxiliary_loss_clip": 0.01082701, + "auxiliary_loss_mlp": 0.01027576, + "balance_loss_clip": 1.03497601, + "balance_loss_mlp": 1.01650763, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 2.085111484726983, + "language_loss": 0.6678912, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68899393, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.6134517192840576 + }, + { + "auxiliary_loss_clip": 0.01084592, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.03244519, + "balance_loss_mlp": 1.02777338, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.8595414470003053, + "language_loss": 0.76019907, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.78146082, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.6542370319366455 + }, + { + "auxiliary_loss_clip": 0.01100329, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.03306103, + "balance_loss_mlp": 1.01855183, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.2861853689501388, + "language_loss": 0.78766924, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80897534, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 2.568265199661255 + }, + { + "auxiliary_loss_clip": 0.01090682, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.03569388, + "balance_loss_mlp": 1.01853228, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.8148568551909101, + "language_loss": 0.85552752, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87673151, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 2.533290386199951 + }, + { + "auxiliary_loss_clip": 0.01082335, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.03474975, + "balance_loss_mlp": 1.02145958, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.6791174220966192, + "language_loss": 0.74995613, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77112144, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 2.647843360900879 + }, + { + "auxiliary_loss_clip": 0.01083724, + "auxiliary_loss_mlp": 0.01032037, + "balance_loss_clip": 1.03413677, + "balance_loss_mlp": 1.01996148, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.5770126371813675, + "language_loss": 0.71202636, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73318398, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.5913000106811523 + }, + { + "auxiliary_loss_clip": 0.01055595, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.02993059, + "balance_loss_mlp": 1.02883363, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 2.3513352507026024, + "language_loss": 0.69888484, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71986961, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 2.5738861560821533 + }, + { + "auxiliary_loss_clip": 0.01103177, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.03572249, + "balance_loss_mlp": 1.02226686, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 2.2174509807279654, + "language_loss": 0.62602293, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64739907, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.573538064956665 + }, + { + "auxiliary_loss_clip": 0.01091172, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.03526998, + "balance_loss_mlp": 1.0227747, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.5649682496034145, + "language_loss": 0.83079052, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85205066, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.542210340499878 + }, + { + "auxiliary_loss_clip": 0.01071199, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.03991795, + "balance_loss_mlp": 1.02235913, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 2.0055918672115762, + "language_loss": 0.72914916, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75020903, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 2.626701831817627 + }, + { + "auxiliary_loss_clip": 0.01085907, + "auxiliary_loss_mlp": 0.00749531, + "balance_loss_clip": 1.03252053, + "balance_loss_mlp": 1.00051808, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.891695813111892, + "language_loss": 0.79907441, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.81742877, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.5517773628234863 + }, + { + "auxiliary_loss_clip": 0.0108395, + "auxiliary_loss_mlp": 0.01034183, + "balance_loss_clip": 1.03528273, + "balance_loss_mlp": 1.02243531, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.2327518644499005, + "language_loss": 0.64749551, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66867685, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.600170612335205 + }, + { + "auxiliary_loss_clip": 0.01080858, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.03328753, + "balance_loss_mlp": 1.01905656, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 3.0522682030957493, + "language_loss": 0.82623184, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.84733921, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 2.5583841800689697 + }, + { + "auxiliary_loss_clip": 0.01061403, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.0330565, + "balance_loss_mlp": 1.0191642, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.2021791792397876, + "language_loss": 0.7351349, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75607389, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 4.025869369506836 + }, + { + "auxiliary_loss_clip": 0.01047294, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.03013921, + "balance_loss_mlp": 1.01813745, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 2.2197136082559537, + "language_loss": 0.76073039, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78151178, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 2.855740785598755 + }, + { + "auxiliary_loss_clip": 0.01084077, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.03741884, + "balance_loss_mlp": 1.02110362, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 2.5178184568003035, + "language_loss": 0.56466269, + "learning_rate": 1.620448797546459e-06, + "loss": 0.585832, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 2.668184995651245 + }, + { + "auxiliary_loss_clip": 0.01072809, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.03168917, + "balance_loss_mlp": 1.02089512, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.6906053546000175, + "language_loss": 0.76083839, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78190517, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.563704013824463 + }, + { + "auxiliary_loss_clip": 0.01088268, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.03286791, + "balance_loss_mlp": 1.02144837, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 2.04268747430954, + "language_loss": 0.74206275, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76328558, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 2.5157151222229004 + }, + { + "auxiliary_loss_clip": 0.01068047, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.03085291, + "balance_loss_mlp": 1.02110887, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.2863116874309686, + "language_loss": 0.69356072, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71457362, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.624570369720459 + }, + { + "auxiliary_loss_clip": 0.01053162, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.03612983, + "balance_loss_mlp": 1.02089274, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.6796305789262496, + "language_loss": 0.79453814, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81539416, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 2.6744136810302734 + }, + { + "auxiliary_loss_clip": 0.01074794, + "auxiliary_loss_mlp": 0.01042911, + "balance_loss_clip": 1.0339396, + "balance_loss_mlp": 1.02923203, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 2.348361057572014, + "language_loss": 0.67301512, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.69419211, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 2.5695760250091553 + }, + { + "auxiliary_loss_clip": 0.0106921, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.03615654, + "balance_loss_mlp": 1.02316642, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 3.31727465287636, + "language_loss": 0.72183985, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.7428863, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.6116936206817627 + }, + { + "auxiliary_loss_clip": 0.01089241, + "auxiliary_loss_mlp": 0.01035655, + "balance_loss_clip": 1.03706431, + "balance_loss_mlp": 1.02392483, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.6991553370977492, + "language_loss": 0.79672104, + "learning_rate": 1.617772461696843e-06, + "loss": 0.81797004, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.561088800430298 + }, + { + "auxiliary_loss_clip": 0.01094609, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.03373086, + "balance_loss_mlp": 1.02191126, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.0240578631889816, + "language_loss": 0.83332521, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85460925, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.53121280670166 + }, + { + "auxiliary_loss_clip": 0.01096746, + "auxiliary_loss_mlp": 0.0074967, + "balance_loss_clip": 1.03497243, + "balance_loss_mlp": 1.00059378, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.4165874277665087, + "language_loss": 0.70909643, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72756064, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.618483781814575 + }, + { + "auxiliary_loss_clip": 0.01082115, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.03466439, + "balance_loss_mlp": 1.01563621, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.3365229259528215, + "language_loss": 0.73006129, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.75116152, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 4.045390605926514 + }, + { + "auxiliary_loss_clip": 0.0109357, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.03554022, + "balance_loss_mlp": 1.01946151, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 2.71026698806586, + "language_loss": 0.74056751, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76181757, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 2.6609747409820557 + }, + { + "auxiliary_loss_clip": 0.01091366, + "auxiliary_loss_mlp": 0.01039594, + "balance_loss_clip": 1.03353941, + "balance_loss_mlp": 1.02761364, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.6772234640907164, + "language_loss": 0.67830276, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.69961238, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 4.054778099060059 + }, + { + "auxiliary_loss_clip": 0.01074207, + "auxiliary_loss_mlp": 0.010546, + "balance_loss_clip": 1.03411412, + "balance_loss_mlp": 1.03854299, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.063397193246, + "language_loss": 0.71479619, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73608422, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.5646839141845703 + }, + { + "auxiliary_loss_clip": 0.01080964, + "auxiliary_loss_mlp": 0.00749225, + "balance_loss_clip": 1.03478932, + "balance_loss_mlp": 1.00057769, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 2.941970415309337, + "language_loss": 0.79465318, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81295508, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.658094644546509 + }, + { + "auxiliary_loss_clip": 0.01037679, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.03326511, + "balance_loss_mlp": 1.01800585, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 2.566238237244049, + "language_loss": 0.64125645, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66193503, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 2.755561590194702 + }, + { + "auxiliary_loss_clip": 0.01099105, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.03675878, + "balance_loss_mlp": 1.02292037, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.725367787453383, + "language_loss": 0.71179736, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.7331509, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 2.578169584274292 + }, + { + "auxiliary_loss_clip": 0.01050044, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.03289461, + "balance_loss_mlp": 1.02307141, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.5174204312294355, + "language_loss": 0.83941734, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86026156, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.721834182739258 + }, + { + "auxiliary_loss_clip": 0.01055795, + "auxiliary_loss_mlp": 0.01037328, + "balance_loss_clip": 1.03541744, + "balance_loss_mlp": 1.02462077, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 1.7671257823883406, + "language_loss": 0.57255936, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.5934906, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.6668100357055664 + }, + { + "auxiliary_loss_clip": 0.01071936, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.03134263, + "balance_loss_mlp": 1.02504373, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.6249281421945614, + "language_loss": 0.75796968, + "learning_rate": 1.613186112465078e-06, + "loss": 0.77906835, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 2.629512071609497 + }, + { + "auxiliary_loss_clip": 0.00986625, + "auxiliary_loss_mlp": 0.0100942, + "balance_loss_clip": 1.00591862, + "balance_loss_mlp": 1.00843096, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7392082149674787, + "language_loss": 0.60759878, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.6275593, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.321805477142334 + }, + { + "auxiliary_loss_clip": 0.01073971, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.03309631, + "balance_loss_mlp": 1.02021217, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.6724519973961824, + "language_loss": 0.75281793, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77387458, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 4.274180173873901 + }, + { + "auxiliary_loss_clip": 0.01091808, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.034935, + "balance_loss_mlp": 1.02017975, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.5114316244409791, + "language_loss": 0.744012, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76524687, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.5508933067321777 + }, + { + "auxiliary_loss_clip": 0.01104134, + "auxiliary_loss_mlp": 0.01027327, + "balance_loss_clip": 1.03504395, + "balance_loss_mlp": 1.01569247, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.6218824001870977, + "language_loss": 0.71059191, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73190653, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.5728564262390137 + }, + { + "auxiliary_loss_clip": 0.01095697, + "auxiliary_loss_mlp": 0.01038737, + "balance_loss_clip": 1.03561711, + "balance_loss_mlp": 1.02607155, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.0952048958939384, + "language_loss": 0.55353171, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57487601, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.556256055831909 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01032962, + "balance_loss_clip": 1.03346896, + "balance_loss_mlp": 1.02178609, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.8393524499248806, + "language_loss": 0.64568102, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66701633, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 2.477344274520874 + }, + { + "auxiliary_loss_clip": 0.0109345, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.0342648, + "balance_loss_mlp": 1.01737297, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.494947949098361, + "language_loss": 0.67090255, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.692132, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 2.7974908351898193 + }, + { + "auxiliary_loss_clip": 0.0108423, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.0375526, + "balance_loss_mlp": 1.01753306, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 1.7092694948726412, + "language_loss": 0.71746486, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.7386052, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 2.6083641052246094 + }, + { + "auxiliary_loss_clip": 0.01099249, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.03605449, + "balance_loss_mlp": 1.02048051, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.7436397812154198, + "language_loss": 0.76274872, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78405386, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 2.7161660194396973 + }, + { + "auxiliary_loss_clip": 0.01046044, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.030617, + "balance_loss_mlp": 1.02230465, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.2704990312854445, + "language_loss": 0.67009181, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.69091374, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 2.890756130218506 + }, + { + "auxiliary_loss_clip": 0.01082302, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.03676212, + "balance_loss_mlp": 1.01857686, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.6013391349491837, + "language_loss": 0.79835999, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81948388, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 2.647073984146118 + }, + { + "auxiliary_loss_clip": 0.01081052, + "auxiliary_loss_mlp": 0.01026274, + "balance_loss_clip": 1.03475773, + "balance_loss_mlp": 1.01515198, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 3.2325770873840813, + "language_loss": 0.6960597, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71713293, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.5600342750549316 + }, + { + "auxiliary_loss_clip": 0.01094088, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.03435826, + "balance_loss_mlp": 1.0196476, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 2.042209382414428, + "language_loss": 0.66896635, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.69021893, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.5174014568328857 + }, + { + "auxiliary_loss_clip": 0.01072287, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.03333616, + "balance_loss_mlp": 1.01698875, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.743670277997353, + "language_loss": 0.72999972, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.75100619, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 2.7980830669403076 + }, + { + "auxiliary_loss_clip": 0.01085524, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.0353682, + "balance_loss_mlp": 1.01765049, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.150146383457817, + "language_loss": 0.64301366, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66417778, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.61307430267334 + }, + { + "auxiliary_loss_clip": 0.01073384, + "auxiliary_loss_mlp": 0.01034997, + "balance_loss_clip": 1.03213751, + "balance_loss_mlp": 1.02219439, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.7403940525793147, + "language_loss": 0.85417211, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87525594, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.575913906097412 + }, + { + "auxiliary_loss_clip": 0.0111058, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.03811884, + "balance_loss_mlp": 1.02389503, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.163903149562164, + "language_loss": 0.67350721, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69498378, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 2.4624948501586914 + }, + { + "auxiliary_loss_clip": 0.01007456, + "auxiliary_loss_mlp": 0.0100072, + "balance_loss_clip": 1.00533319, + "balance_loss_mlp": 0.99969465, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6429684116142947, + "language_loss": 0.57246268, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59254444, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 3.270653247833252 + }, + { + "auxiliary_loss_clip": 0.01073741, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.03421986, + "balance_loss_mlp": 1.01701379, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 5.612014619515498, + "language_loss": 0.82535768, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84638631, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 4.135624885559082 + }, + { + "auxiliary_loss_clip": 0.01025175, + "auxiliary_loss_mlp": 0.01001657, + "balance_loss_clip": 1.00446379, + "balance_loss_mlp": 1.00070906, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6233179445417791, + "language_loss": 0.49578321, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51605153, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 3.134129524230957 + }, + { + "auxiliary_loss_clip": 0.01077391, + "auxiliary_loss_mlp": 0.01024028, + "balance_loss_clip": 1.0339663, + "balance_loss_mlp": 1.01289952, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 2.1709977921832913, + "language_loss": 0.84753591, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86855006, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.5811448097229004 + }, + { + "auxiliary_loss_clip": 0.01090613, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.03538632, + "balance_loss_mlp": 1.01949894, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.6760253876528162, + "language_loss": 0.79783487, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.81906241, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 2.5450103282928467 + }, + { + "auxiliary_loss_clip": 0.01071776, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.03379488, + "balance_loss_mlp": 1.02235579, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.4600093984873121, + "language_loss": 0.65856671, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.67963898, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.684965133666992 + }, + { + "auxiliary_loss_clip": 0.01075215, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.03389478, + "balance_loss_mlp": 1.02402568, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 2.28395703565967, + "language_loss": 0.78948438, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.81060946, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 2.643980026245117 + }, + { + "auxiliary_loss_clip": 0.01098573, + "auxiliary_loss_mlp": 0.01025292, + "balance_loss_clip": 1.03289866, + "balance_loss_mlp": 1.01428294, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.9025438523916502, + "language_loss": 0.79579604, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81703472, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 2.5285494327545166 + }, + { + "auxiliary_loss_clip": 0.01040256, + "auxiliary_loss_mlp": 0.00749521, + "balance_loss_clip": 1.03225625, + "balance_loss_mlp": 1.00051737, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.8219452623728676, + "language_loss": 0.6304785, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.64837629, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 2.7735235691070557 + }, + { + "auxiliary_loss_clip": 0.01106213, + "auxiliary_loss_mlp": 0.0074964, + "balance_loss_clip": 1.03759074, + "balance_loss_mlp": 1.00056362, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.9347253424375688, + "language_loss": 0.7787444, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.79730296, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.821230411529541 + }, + { + "auxiliary_loss_clip": 0.00982753, + "auxiliary_loss_mlp": 0.01024794, + "balance_loss_clip": 1.01077485, + "balance_loss_mlp": 1.02348852, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7386853157378434, + "language_loss": 0.59680939, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61688495, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.4695026874542236 + }, + { + "auxiliary_loss_clip": 0.01092916, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.03298938, + "balance_loss_mlp": 1.02986884, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 1.6448643068886462, + "language_loss": 0.70833844, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.72969383, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 2.9968912601470947 + }, + { + "auxiliary_loss_clip": 0.01070054, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.0335238, + "balance_loss_mlp": 1.01938927, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.765554247590286, + "language_loss": 0.71267438, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73367971, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 2.576272964477539 + }, + { + "auxiliary_loss_clip": 0.01102615, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.03479981, + "balance_loss_mlp": 1.02048922, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.14891875294294, + "language_loss": 0.70351911, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.72486949, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 4.034661293029785 + }, + { + "auxiliary_loss_clip": 0.01080156, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.03582871, + "balance_loss_mlp": 1.0246979, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 1.9144334248318242, + "language_loss": 0.67420971, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69539875, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 4.235365152359009 + }, + { + "auxiliary_loss_clip": 0.01080682, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.03473401, + "balance_loss_mlp": 1.02021086, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 2.0394154273895375, + "language_loss": 0.81423229, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83535552, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.661792516708374 + }, + { + "auxiliary_loss_clip": 0.01051365, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.02776408, + "balance_loss_mlp": 1.02289248, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.5767297232290656, + "language_loss": 0.72761893, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74850202, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.626009225845337 + }, + { + "auxiliary_loss_clip": 0.01100222, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.03466666, + "balance_loss_mlp": 1.01756871, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 32.94926533612042, + "language_loss": 0.77927721, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80056745, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 2.518742799758911 + }, + { + "auxiliary_loss_clip": 0.01096511, + "auxiliary_loss_mlp": 0.0074979, + "balance_loss_clip": 1.03689456, + "balance_loss_mlp": 1.00060761, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.602295379321797, + "language_loss": 0.72340447, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74186748, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.6005470752716064 + }, + { + "auxiliary_loss_clip": 0.0105065, + "auxiliary_loss_mlp": 0.01035723, + "balance_loss_clip": 1.03887677, + "balance_loss_mlp": 1.02408206, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.7609799532395696, + "language_loss": 0.68560088, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70646465, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 2.805480480194092 + }, + { + "auxiliary_loss_clip": 0.0106914, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.03165531, + "balance_loss_mlp": 1.02153099, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4101850012641537, + "language_loss": 0.73238879, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75341654, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 2.6318726539611816 + }, + { + "auxiliary_loss_clip": 0.01092092, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.03622246, + "balance_loss_mlp": 1.01915526, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.6799398076668646, + "language_loss": 0.76329935, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78453183, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.5290751457214355 + }, + { + "auxiliary_loss_clip": 0.0107283, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.0361017, + "balance_loss_mlp": 1.01969182, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.970028395935301, + "language_loss": 0.83358842, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85463858, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 2.67307448387146 + }, + { + "auxiliary_loss_clip": 0.0107965, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.03703499, + "balance_loss_mlp": 1.01867056, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.7253270362368756, + "language_loss": 0.77661002, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.79773873, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 4.151566982269287 + }, + { + "auxiliary_loss_clip": 0.01100633, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.03443754, + "balance_loss_mlp": 1.02362132, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.5836202860719601, + "language_loss": 0.73758841, + "learning_rate": 1.597150687927619e-06, + "loss": 0.75894213, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.5055925846099854 + }, + { + "auxiliary_loss_clip": 0.01059399, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.03348637, + "balance_loss_mlp": 1.0186801, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.6092579526890116, + "language_loss": 0.69016421, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71106648, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 2.533452272415161 + }, + { + "auxiliary_loss_clip": 0.01068524, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.03259134, + "balance_loss_mlp": 1.0175308, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 2.2536397179950476, + "language_loss": 0.75918716, + "learning_rate": 1.596387759940665e-06, + "loss": 0.7801671, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.768342971801758 + }, + { + "auxiliary_loss_clip": 0.01072746, + "auxiliary_loss_mlp": 0.01029206, + "balance_loss_clip": 1.0340898, + "balance_loss_mlp": 1.01788688, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.9412876449384462, + "language_loss": 0.76792032, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.78893983, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 2.6212339401245117 + }, + { + "auxiliary_loss_clip": 0.01064882, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.03151715, + "balance_loss_mlp": 1.01882648, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.365795488441689, + "language_loss": 0.68717349, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70813549, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.5978758335113525 + }, + { + "auxiliary_loss_clip": 0.01085657, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.03065336, + "balance_loss_mlp": 1.01670182, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.8070551015541103, + "language_loss": 0.8328892, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85403323, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.553903579711914 + }, + { + "auxiliary_loss_clip": 0.01103729, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.0365746, + "balance_loss_mlp": 1.01933002, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.8841990159746538, + "language_loss": 0.79681337, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81816316, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 2.502448558807373 + }, + { + "auxiliary_loss_clip": 0.01091447, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.03429592, + "balance_loss_mlp": 1.02213931, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.7617389193794828, + "language_loss": 0.77804935, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79929203, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 2.5500125885009766 + }, + { + "auxiliary_loss_clip": 0.01069755, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.03239512, + "balance_loss_mlp": 1.02102947, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.7048874467155923, + "language_loss": 0.8121618, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83318889, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.6053028106689453 + }, + { + "auxiliary_loss_clip": 0.01086864, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.03189826, + "balance_loss_mlp": 1.01894426, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.6180586746643872, + "language_loss": 0.67286682, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69404328, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.6413376331329346 + }, + { + "auxiliary_loss_clip": 0.01091433, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.03529596, + "balance_loss_mlp": 1.01827729, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.943373332707562, + "language_loss": 0.77662534, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79783642, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 2.582488775253296 + }, + { + "auxiliary_loss_clip": 0.01079994, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.03477287, + "balance_loss_mlp": 1.01928473, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.546584906468229, + "language_loss": 0.74938917, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77050626, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.638972043991089 + }, + { + "auxiliary_loss_clip": 0.01099345, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.03407538, + "balance_loss_mlp": 1.01715589, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.8395584752487057, + "language_loss": 0.81238824, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.83366358, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.484741687774658 + }, + { + "auxiliary_loss_clip": 0.01082636, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.03417706, + "balance_loss_mlp": 1.020841, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 1.8439073707805151, + "language_loss": 0.72488725, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74604166, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 2.6021597385406494 + }, + { + "auxiliary_loss_clip": 0.01079992, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.03343391, + "balance_loss_mlp": 1.01851106, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.5151524763960107, + "language_loss": 0.7724936, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79359084, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 2.635849714279175 + }, + { + "auxiliary_loss_clip": 0.0103624, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.02891159, + "balance_loss_mlp": 1.02395058, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.5334647052998416, + "language_loss": 0.709481, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.7302174, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 2.8867263793945312 + }, + { + "auxiliary_loss_clip": 0.01006775, + "auxiliary_loss_mlp": 0.0100178, + "balance_loss_clip": 1.00626469, + "balance_loss_mlp": 1.00068307, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.770056000176157, + "language_loss": 0.55947924, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57956481, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.1771931648254395 + }, + { + "auxiliary_loss_clip": 0.01070683, + "auxiliary_loss_mlp": 0.01039509, + "balance_loss_clip": 1.03663898, + "balance_loss_mlp": 1.02658725, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 1.9299585453766193, + "language_loss": 0.71015882, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73126078, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 4.237186431884766 + }, + { + "auxiliary_loss_clip": 0.01069157, + "auxiliary_loss_mlp": 0.01037497, + "balance_loss_clip": 1.03453755, + "balance_loss_mlp": 1.02471149, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 1.9632740942010452, + "language_loss": 0.82021821, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84128475, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 2.7280259132385254 + }, + { + "auxiliary_loss_clip": 0.01100411, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.03405535, + "balance_loss_mlp": 1.02299035, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.9454895105610122, + "language_loss": 0.699714, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72107339, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.5652332305908203 + }, + { + "auxiliary_loss_clip": 0.01078815, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.0343051, + "balance_loss_mlp": 1.02446067, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.534545115415573, + "language_loss": 0.71665037, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.73779464, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.6800320148468018 + }, + { + "auxiliary_loss_clip": 0.01089001, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.03225493, + "balance_loss_mlp": 1.01701522, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.6205679116844869, + "language_loss": 0.84070039, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86187565, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 2.5866904258728027 + }, + { + "auxiliary_loss_clip": 0.01076853, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.03051782, + "balance_loss_mlp": 1.01685238, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.5193560031058355, + "language_loss": 0.72380555, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74486148, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 2.6267356872558594 + }, + { + "auxiliary_loss_clip": 0.01080665, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.03588653, + "balance_loss_mlp": 1.02708149, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 2.6818465425744358, + "language_loss": 0.74390888, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76511002, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.590811014175415 + }, + { + "auxiliary_loss_clip": 0.01067268, + "auxiliary_loss_mlp": 0.00749486, + "balance_loss_clip": 1.03284192, + "balance_loss_mlp": 1.00061941, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.94357325283351, + "language_loss": 0.79264706, + "learning_rate": 1.587999618060523e-06, + "loss": 0.81081462, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 2.702418327331543 + }, + { + "auxiliary_loss_clip": 0.01101682, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.03434229, + "balance_loss_mlp": 1.01900578, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.6848039318909491, + "language_loss": 0.75033987, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.7716648, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.6143453121185303 + }, + { + "auxiliary_loss_clip": 0.01073006, + "auxiliary_loss_mlp": 0.01028657, + "balance_loss_clip": 1.03360224, + "balance_loss_mlp": 1.01657498, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 5.724625391211806, + "language_loss": 0.79723406, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81825066, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 2.6019299030303955 + }, + { + "auxiliary_loss_clip": 0.01071011, + "auxiliary_loss_mlp": 0.01036449, + "balance_loss_clip": 1.03419375, + "balance_loss_mlp": 1.02375293, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.7355301408840842, + "language_loss": 0.77506518, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79613984, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.626342296600342 + }, + { + "auxiliary_loss_clip": 0.01085036, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.03454447, + "balance_loss_mlp": 1.02972531, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.099420548790242, + "language_loss": 0.63363039, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65489781, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.5373284816741943 + }, + { + "auxiliary_loss_clip": 0.01071894, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.03335989, + "balance_loss_mlp": 1.02447832, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.5004635369406631, + "language_loss": 0.77333158, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79441202, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 4.1106390953063965 + }, + { + "auxiliary_loss_clip": 0.01070409, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.03011537, + "balance_loss_mlp": 1.0188756, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.8153203270206224, + "language_loss": 0.68475658, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70575535, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 4.069039821624756 + }, + { + "auxiliary_loss_clip": 0.01061655, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.0335362, + "balance_loss_mlp": 1.0209682, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.305279542158163, + "language_loss": 0.72319436, + "learning_rate": 1.585332242234043e-06, + "loss": 0.7441411, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 2.6168160438537598 + }, + { + "auxiliary_loss_clip": 0.01092751, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.03719759, + "balance_loss_mlp": 1.02217495, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 1.60094005188246, + "language_loss": 0.72480482, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74606335, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.5242061614990234 + }, + { + "auxiliary_loss_clip": 0.01078953, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.03340578, + "balance_loss_mlp": 1.02274704, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 1.686058296089626, + "language_loss": 0.69775277, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71888816, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 2.5221664905548096 + }, + { + "auxiliary_loss_clip": 0.01078015, + "auxiliary_loss_mlp": 0.01040484, + "balance_loss_clip": 1.03516948, + "balance_loss_mlp": 1.02721608, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 3.6899981704701657, + "language_loss": 0.77600485, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79718983, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 2.559539318084717 + }, + { + "auxiliary_loss_clip": 0.01102989, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.0363344, + "balance_loss_mlp": 1.02200747, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8579306946089864, + "language_loss": 0.73967397, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.76103944, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.5116336345672607 + }, + { + "auxiliary_loss_clip": 0.01084631, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.03599119, + "balance_loss_mlp": 1.01926613, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.7488786826082248, + "language_loss": 0.7339617, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75512373, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 2.641819477081299 + }, + { + "auxiliary_loss_clip": 0.01103968, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.03521812, + "balance_loss_mlp": 1.02067137, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.7766953674119377, + "language_loss": 0.66753548, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.68890077, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.567211151123047 + }, + { + "auxiliary_loss_clip": 0.010945, + "auxiliary_loss_mlp": 0.0103573, + "balance_loss_clip": 1.03443241, + "balance_loss_mlp": 1.02292657, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.234620891270435, + "language_loss": 0.85389745, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87519968, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.6031055450439453 + }, + { + "auxiliary_loss_clip": 0.01103998, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.03621554, + "balance_loss_mlp": 1.02089548, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 2.2092459241548923, + "language_loss": 0.75640994, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77777094, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 4.06545352935791 + }, + { + "auxiliary_loss_clip": 0.0108337, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.03571761, + "balance_loss_mlp": 1.02125382, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 2.099375985915705, + "language_loss": 0.59135187, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61252463, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.7261710166931152 + }, + { + "auxiliary_loss_clip": 0.01059385, + "auxiliary_loss_mlp": 0.01048925, + "balance_loss_clip": 1.03467011, + "balance_loss_mlp": 1.03394628, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.5411261075809575, + "language_loss": 0.84306484, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.8641479, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.6351499557495117 + }, + { + "auxiliary_loss_clip": 0.01017488, + "auxiliary_loss_mlp": 0.01001961, + "balance_loss_clip": 1.00654113, + "balance_loss_mlp": 1.00091803, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8423780057644918, + "language_loss": 0.63032913, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65052366, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 3.138667106628418 + }, + { + "auxiliary_loss_clip": 0.01057932, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.02946758, + "balance_loss_mlp": 1.0205822, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.7784240980345087, + "language_loss": 0.82067138, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84158266, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.6385586261749268 + }, + { + "auxiliary_loss_clip": 0.01069293, + "auxiliary_loss_mlp": 0.01036473, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.02409935, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 15.774265900590391, + "language_loss": 0.77343822, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79449594, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 2.568096876144409 + }, + { + "auxiliary_loss_clip": 0.01086559, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.03701484, + "balance_loss_mlp": 1.02282882, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 1.8214204318505498, + "language_loss": 0.74195099, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76316875, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.5473597049713135 + }, + { + "auxiliary_loss_clip": 0.01074918, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.03325629, + "balance_loss_mlp": 1.01641214, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.1513201260501678, + "language_loss": 0.76876169, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78979778, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 2.6042139530181885 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.03622174, + "balance_loss_mlp": 1.01888371, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 2.154293643867522, + "language_loss": 0.74720389, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76856226, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.465479612350464 + }, + { + "auxiliary_loss_clip": 0.01040298, + "auxiliary_loss_mlp": 0.01034823, + "balance_loss_clip": 1.035254, + "balance_loss_mlp": 1.02373016, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.6555098462993851, + "language_loss": 0.70573312, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.7264843, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.733750820159912 + }, + { + "auxiliary_loss_clip": 0.01107009, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.03512096, + "balance_loss_mlp": 1.02325833, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 2.0683339389599364, + "language_loss": 0.69442356, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71584702, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 2.5206003189086914 + }, + { + "auxiliary_loss_clip": 0.0108867, + "auxiliary_loss_mlp": 0.01031534, + "balance_loss_clip": 1.03685355, + "balance_loss_mlp": 1.02012587, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.7547408433854248, + "language_loss": 0.71683323, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73803532, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.5324008464813232 + }, + { + "auxiliary_loss_clip": 0.01095237, + "auxiliary_loss_mlp": 0.01038705, + "balance_loss_clip": 1.03610396, + "balance_loss_mlp": 1.02577066, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 2.237647253976277, + "language_loss": 0.71251833, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73385775, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.648930072784424 + }, + { + "auxiliary_loss_clip": 0.01016905, + "auxiliary_loss_mlp": 0.01002227, + "balance_loss_clip": 1.00581694, + "balance_loss_mlp": 1.00110042, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6535525937551321, + "language_loss": 0.53585207, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55604339, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 3.166501522064209 + }, + { + "auxiliary_loss_clip": 0.01093986, + "auxiliary_loss_mlp": 0.0104089, + "balance_loss_clip": 1.03457046, + "balance_loss_mlp": 1.02879, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.5515729641020486, + "language_loss": 0.62131572, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64266449, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 2.653038501739502 + }, + { + "auxiliary_loss_clip": 0.0108944, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.03189754, + "balance_loss_mlp": 1.0201056, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.570627522976624, + "language_loss": 0.65473652, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.6759578, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.7526473999023438 + }, + { + "auxiliary_loss_clip": 0.01045507, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.03012204, + "balance_loss_mlp": 1.01583517, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.5399445502255351, + "language_loss": 0.74524403, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76596612, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.6076319217681885 + }, + { + "auxiliary_loss_clip": 0.01025589, + "auxiliary_loss_mlp": 0.01001288, + "balance_loss_clip": 1.00466728, + "balance_loss_mlp": 1.00019169, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8787559717352526, + "language_loss": 0.58437651, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60464525, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.100019693374634 + }, + { + "auxiliary_loss_clip": 0.01084719, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.03664064, + "balance_loss_mlp": 1.02194095, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.2502353713483596, + "language_loss": 0.81941044, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84059465, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 4.250750780105591 + }, + { + "auxiliary_loss_clip": 0.01069766, + "auxiliary_loss_mlp": 0.00749583, + "balance_loss_clip": 1.02960443, + "balance_loss_mlp": 1.00055289, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 11.331606685689948, + "language_loss": 0.81785673, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83605027, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.658531427383423 + }, + { + "auxiliary_loss_clip": 0.01087969, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.03768253, + "balance_loss_mlp": 1.0227232, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.7526764926535232, + "language_loss": 0.81073862, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.8319841, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 2.670508623123169 + }, + { + "auxiliary_loss_clip": 0.01087706, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.03557849, + "balance_loss_mlp": 1.02243328, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.7611557958376378, + "language_loss": 0.79676926, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.81798053, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.507420539855957 + }, + { + "auxiliary_loss_clip": 0.01097678, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.02121234, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.7414298763092353, + "language_loss": 0.7908324, + "learning_rate": 1.573909419957653e-06, + "loss": 0.8121506, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 2.64963436126709 + }, + { + "auxiliary_loss_clip": 0.01081653, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.03489399, + "balance_loss_mlp": 1.02210188, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 2.1181807066384075, + "language_loss": 0.64564556, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66679227, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.7862656116485596 + }, + { + "auxiliary_loss_clip": 0.01051197, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.03205562, + "balance_loss_mlp": 1.02322769, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.5354165800752682, + "language_loss": 0.73371482, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75458169, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 2.662574529647827 + }, + { + "auxiliary_loss_clip": 0.01075183, + "auxiliary_loss_mlp": 0.01039617, + "balance_loss_clip": 1.03589344, + "balance_loss_mlp": 1.02804756, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 1.9057877978902222, + "language_loss": 0.78921574, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81036377, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 2.6035609245300293 + }, + { + "auxiliary_loss_clip": 0.01061592, + "auxiliary_loss_mlp": 0.01034943, + "balance_loss_clip": 1.034621, + "balance_loss_mlp": 1.02190757, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 2.7789342684742406, + "language_loss": 0.61429161, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.63525695, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.6897146701812744 + }, + { + "auxiliary_loss_clip": 0.01046409, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.03243351, + "balance_loss_mlp": 1.02358699, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.7583998570326294, + "language_loss": 0.81337023, + "learning_rate": 1.572007019492342e-06, + "loss": 0.8342011, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.695371627807617 + }, + { + "auxiliary_loss_clip": 0.01071976, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.03683162, + "balance_loss_mlp": 1.02389622, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 2.755029823172936, + "language_loss": 0.87815988, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.89924574, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.6678404808044434 + }, + { + "auxiliary_loss_clip": 0.01102501, + "auxiliary_loss_mlp": 0.00749244, + "balance_loss_clip": 1.03513074, + "balance_loss_mlp": 1.00053263, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.5131361734116757, + "language_loss": 0.79108596, + "learning_rate": 1.571246172811984e-06, + "loss": 0.80960333, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.5393528938293457 + }, + { + "auxiliary_loss_clip": 0.01088244, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.03457594, + "balance_loss_mlp": 1.02004099, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.318027662542295, + "language_loss": 0.70199692, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72320235, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 4.168844938278198 + }, + { + "auxiliary_loss_clip": 0.010422, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.03603745, + "balance_loss_mlp": 1.01972365, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.467701451671293, + "language_loss": 0.63762283, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65835768, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 4.246052265167236 + }, + { + "auxiliary_loss_clip": 0.01012546, + "auxiliary_loss_mlp": 0.01005376, + "balance_loss_clip": 1.01681805, + "balance_loss_mlp": 1.00407648, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8375215709284402, + "language_loss": 0.54254723, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.5627265, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 3.22977614402771 + }, + { + "auxiliary_loss_clip": 0.01005069, + "auxiliary_loss_mlp": 0.0100654, + "balance_loss_clip": 1.00437641, + "balance_loss_mlp": 1.00534153, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7408663156806323, + "language_loss": 0.56180775, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58192384, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 3.0064892768859863 + }, + { + "auxiliary_loss_clip": 0.01100988, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.03424597, + "balance_loss_mlp": 1.01858854, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.7378055259095706, + "language_loss": 0.65589672, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67719585, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.5290396213531494 + }, + { + "auxiliary_loss_clip": 0.01079796, + "auxiliary_loss_mlp": 0.01026832, + "balance_loss_clip": 1.03392172, + "balance_loss_mlp": 1.0157516, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.9659691952962457, + "language_loss": 0.83469391, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85576022, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 2.5781922340393066 + }, + { + "auxiliary_loss_clip": 0.01104583, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.03703463, + "balance_loss_mlp": 1.01951623, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.8635397401716733, + "language_loss": 0.75580049, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77715814, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.479526996612549 + }, + { + "auxiliary_loss_clip": 0.01031376, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.02939391, + "balance_loss_mlp": 1.01968813, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.386470833667857, + "language_loss": 0.75089467, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77153873, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.756103992462158 + }, + { + "auxiliary_loss_clip": 0.01080018, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.03580761, + "balance_loss_mlp": 1.01861262, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.6679841506770514, + "language_loss": 0.73722994, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.75833917, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 2.589337110519409 + }, + { + "auxiliary_loss_clip": 0.01075561, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.03381538, + "balance_loss_mlp": 1.02024579, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.0791857907197944, + "language_loss": 0.78738451, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80846256, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.632605791091919 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.03575671, + "balance_loss_mlp": 1.02504921, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.7658344891589743, + "language_loss": 0.75283468, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77423441, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 2.5396859645843506 + }, + { + "auxiliary_loss_clip": 0.01018764, + "auxiliary_loss_mlp": 0.01003477, + "balance_loss_clip": 1.00785875, + "balance_loss_mlp": 1.00245142, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8172041356656798, + "language_loss": 0.57402962, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59425199, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 4.4395318031311035 + }, + { + "auxiliary_loss_clip": 0.01061355, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.03201973, + "balance_loss_mlp": 1.02157474, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 2.00792515794964, + "language_loss": 0.70302641, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72399813, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.789255142211914 + }, + { + "auxiliary_loss_clip": 0.01095923, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.03738749, + "balance_loss_mlp": 1.01821327, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 3.542636967567192, + "language_loss": 0.65363491, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67488497, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 2.7311301231384277 + }, + { + "auxiliary_loss_clip": 0.01082467, + "auxiliary_loss_mlp": 0.00749322, + "balance_loss_clip": 1.03690851, + "balance_loss_mlp": 1.0004797, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.5887686502860683, + "language_loss": 0.73040479, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.74872267, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.6746487617492676 + }, + { + "auxiliary_loss_clip": 0.01078114, + "auxiliary_loss_mlp": 0.01034171, + "balance_loss_clip": 1.0330013, + "balance_loss_mlp": 1.02069449, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 2.009094701136309, + "language_loss": 0.75596154, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.77708435, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.632880926132202 + }, + { + "auxiliary_loss_clip": 0.01091044, + "auxiliary_loss_mlp": 0.01030102, + "balance_loss_clip": 1.03279495, + "balance_loss_mlp": 1.01842499, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.8371418796414578, + "language_loss": 0.80436492, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82557642, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.709686279296875 + }, + { + "auxiliary_loss_clip": 0.01016081, + "auxiliary_loss_mlp": 0.01004632, + "balance_loss_clip": 1.00506544, + "balance_loss_mlp": 1.00352883, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7843264777056426, + "language_loss": 0.56928599, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58949304, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 3.1039645671844482 + }, + { + "auxiliary_loss_clip": 0.01090076, + "auxiliary_loss_mlp": 0.00749383, + "balance_loss_clip": 1.03328037, + "balance_loss_mlp": 1.00053847, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 1.8914718020977557, + "language_loss": 0.79170442, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81009901, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 2.5968716144561768 + }, + { + "auxiliary_loss_clip": 0.01068048, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.03278363, + "balance_loss_mlp": 1.028584, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.4030835818095857, + "language_loss": 0.75911462, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78020251, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.6013565063476562 + }, + { + "auxiliary_loss_clip": 0.01014869, + "auxiliary_loss_mlp": 0.0099949, + "balance_loss_clip": 1.00409305, + "balance_loss_mlp": 0.99842894, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7663725900826145, + "language_loss": 0.54915094, + "learning_rate": 1.563261231127095e-06, + "loss": 0.56929457, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.238027572631836 + }, + { + "auxiliary_loss_clip": 0.01068, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.03701663, + "balance_loss_mlp": 1.01738548, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 2.109890329455644, + "language_loss": 0.77170295, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.79266638, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 2.643059730529785 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.03462088, + "balance_loss_mlp": 1.0188911, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.6799694264849183, + "language_loss": 0.77903903, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.8003872, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 2.587351083755493 + }, + { + "auxiliary_loss_clip": 0.01059578, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.03390694, + "balance_loss_mlp": 1.02444637, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.697107791762907, + "language_loss": 0.83521122, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.8561722, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.799340009689331 + }, + { + "auxiliary_loss_clip": 0.01072571, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.03029346, + "balance_loss_mlp": 1.01964951, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.4899356820390297, + "language_loss": 0.66886997, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68992817, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.7289974689483643 + }, + { + "auxiliary_loss_clip": 0.01090319, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.03329802, + "balance_loss_mlp": 1.02064836, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5258405078222823, + "language_loss": 0.71205896, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73329002, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.7262914180755615 + }, + { + "auxiliary_loss_clip": 0.01078845, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.03397155, + "balance_loss_mlp": 1.02250803, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 2.911478486727687, + "language_loss": 0.85482383, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87595087, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 2.6918177604675293 + }, + { + "auxiliary_loss_clip": 0.01085884, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.03345537, + "balance_loss_mlp": 1.01981187, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.4824851735976265, + "language_loss": 0.77858961, + "learning_rate": 1.560601200301392e-06, + "loss": 0.7997545, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 2.5880024433135986 + }, + { + "auxiliary_loss_clip": 0.01107629, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.03803897, + "balance_loss_mlp": 1.0168376, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.7623160474710415, + "language_loss": 0.71228969, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73365414, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 4.112656354904175 + }, + { + "auxiliary_loss_clip": 0.01080885, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.03644347, + "balance_loss_mlp": 1.02177429, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 2.5568799100962565, + "language_loss": 0.81221181, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83335042, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.59975004196167 + }, + { + "auxiliary_loss_clip": 0.0105019, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.02969599, + "balance_loss_mlp": 1.01738739, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 1.6784101285081603, + "language_loss": 0.80514872, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82594025, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.6462929248809814 + }, + { + "auxiliary_loss_clip": 0.01087232, + "auxiliary_loss_mlp": 0.01034301, + "balance_loss_clip": 1.03330684, + "balance_loss_mlp": 1.02215385, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.0902638162596903, + "language_loss": 0.74995434, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77116966, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.897075653076172 + }, + { + "auxiliary_loss_clip": 0.01067292, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.03188586, + "balance_loss_mlp": 1.0178225, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.7674667233925176, + "language_loss": 0.81143099, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83239579, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 2.7171945571899414 + }, + { + "auxiliary_loss_clip": 0.01086702, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.03712928, + "balance_loss_mlp": 1.01898384, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.869449278027365, + "language_loss": 0.78345108, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80462271, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.619471549987793 + }, + { + "auxiliary_loss_clip": 0.01005513, + "auxiliary_loss_mlp": 0.01001213, + "balance_loss_clip": 1.0045929, + "balance_loss_mlp": 1.00008607, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7636640062378193, + "language_loss": 0.56640196, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58646917, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.2081308364868164 + }, + { + "auxiliary_loss_clip": 0.01070026, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.03480482, + "balance_loss_mlp": 1.01891494, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.418184788595448, + "language_loss": 0.65227938, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67327386, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.735165596008301 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.03675985, + "balance_loss_mlp": 1.02145243, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.754525469409652, + "language_loss": 0.78529596, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.8067292, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.5593960285186768 + }, + { + "auxiliary_loss_clip": 0.01059083, + "auxiliary_loss_mlp": 0.00749454, + "balance_loss_clip": 1.02995098, + "balance_loss_mlp": 1.00056696, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.535954078654435, + "language_loss": 0.73400712, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75209254, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.6605663299560547 + }, + { + "auxiliary_loss_clip": 0.01083618, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.0343343, + "balance_loss_mlp": 1.01574588, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 2.482726771594796, + "language_loss": 0.69433594, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71545506, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 2.642666816711426 + }, + { + "auxiliary_loss_clip": 0.01103126, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.03363276, + "balance_loss_mlp": 1.01754987, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.732220940761933, + "language_loss": 0.80429345, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82561988, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 2.5295510292053223 + }, + { + "auxiliary_loss_clip": 0.01077873, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.03133345, + "balance_loss_mlp": 1.0202775, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 2.264695598831732, + "language_loss": 0.72677779, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74787903, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 4.111287355422974 + }, + { + "auxiliary_loss_clip": 0.01072768, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.03202343, + "balance_loss_mlp": 1.01710868, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.670446364533597, + "language_loss": 0.74871039, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76972026, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 2.602132797241211 + }, + { + "auxiliary_loss_clip": 0.01091785, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.03418684, + "balance_loss_mlp": 1.02285683, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 2.053576096357344, + "language_loss": 0.8016274, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82289505, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 4.118633031845093 + }, + { + "auxiliary_loss_clip": 0.0107905, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.03342366, + "balance_loss_mlp": 1.02056372, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.5361377677246664, + "language_loss": 0.67406857, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69518822, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 2.564443826675415 + }, + { + "auxiliary_loss_clip": 0.01103623, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.03468919, + "balance_loss_mlp": 1.02318716, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.0583329549109948, + "language_loss": 0.75542349, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.77680707, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 2.5808398723602295 + }, + { + "auxiliary_loss_clip": 0.01063696, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.03368556, + "balance_loss_mlp": 1.02640438, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.5839722951397628, + "language_loss": 0.82625616, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.84727144, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.6335558891296387 + }, + { + "auxiliary_loss_clip": 0.01025559, + "auxiliary_loss_mlp": 0.01000384, + "balance_loss_clip": 1.00492692, + "balance_loss_mlp": 0.99935913, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9292124902696123, + "language_loss": 0.71320629, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73346573, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 3.0804128646850586 + }, + { + "auxiliary_loss_clip": 0.01080935, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.0316143, + "balance_loss_mlp": 1.02090907, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.3823512023568365, + "language_loss": 0.89394188, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91507876, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.473512887954712 + }, + { + "auxiliary_loss_clip": 0.01080259, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.03638959, + "balance_loss_mlp": 1.02137327, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.3845161885875583, + "language_loss": 0.6836884, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70482099, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.5327587127685547 + }, + { + "auxiliary_loss_clip": 0.01094927, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.0360359, + "balance_loss_mlp": 1.0220747, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 2.5788177835469046, + "language_loss": 0.86214852, + "learning_rate": 1.552246441587197e-06, + "loss": 0.883434, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 2.482515811920166 + }, + { + "auxiliary_loss_clip": 0.01079273, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.03636813, + "balance_loss_mlp": 1.02680755, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.5527718394036527, + "language_loss": 0.82933706, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.8505165, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 2.564664125442505 + }, + { + "auxiliary_loss_clip": 0.01041165, + "auxiliary_loss_mlp": 0.00749529, + "balance_loss_clip": 1.03525448, + "balance_loss_mlp": 1.00059164, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.73231860514011, + "language_loss": 0.6682477, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.68615466, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 4.214448928833008 + }, + { + "auxiliary_loss_clip": 0.01060984, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.03126168, + "balance_loss_mlp": 1.03120327, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.74236883745527, + "language_loss": 0.81330985, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.83435917, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.588512897491455 + }, + { + "auxiliary_loss_clip": 0.01086777, + "auxiliary_loss_mlp": 0.01035874, + "balance_loss_clip": 1.03353024, + "balance_loss_mlp": 1.02511501, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.8412760100036398, + "language_loss": 0.77940887, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80063546, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.6189777851104736 + }, + { + "auxiliary_loss_clip": 0.01083467, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.03241265, + "balance_loss_mlp": 1.02066481, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 1.8006955060887795, + "language_loss": 0.70577902, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72695452, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.600930690765381 + }, + { + "auxiliary_loss_clip": 0.01109737, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.03821278, + "balance_loss_mlp": 1.02285075, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.83703358994378, + "language_loss": 0.78382075, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80527836, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.512732744216919 + }, + { + "auxiliary_loss_clip": 0.01084946, + "auxiliary_loss_mlp": 0.01045346, + "balance_loss_clip": 1.03315544, + "balance_loss_mlp": 1.03104138, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 1.898896897804314, + "language_loss": 0.70111072, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72241366, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 2.601209878921509 + }, + { + "auxiliary_loss_clip": 0.01045421, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.03129113, + "balance_loss_mlp": 1.02411234, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.8793882260058166, + "language_loss": 0.52983284, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.55067742, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 2.7458488941192627 + }, + { + "auxiliary_loss_clip": 0.01085808, + "auxiliary_loss_mlp": 0.01031515, + "balance_loss_clip": 1.03452229, + "balance_loss_mlp": 1.01844978, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 2.1623249768358113, + "language_loss": 0.87092346, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.8920967, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 2.56095814704895 + }, + { + "auxiliary_loss_clip": 0.01070937, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.03151321, + "balance_loss_mlp": 1.02628589, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.684125612079969, + "language_loss": 0.72230887, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74339724, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 2.6096928119659424 + }, + { + "auxiliary_loss_clip": 0.01095588, + "auxiliary_loss_mlp": 0.01037188, + "balance_loss_clip": 1.03662419, + "balance_loss_mlp": 1.02430689, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.3335357185192778, + "language_loss": 0.74002516, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.7613529, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 2.5764567852020264 + }, + { + "auxiliary_loss_clip": 0.01065953, + "auxiliary_loss_mlp": 0.01035352, + "balance_loss_clip": 1.03290594, + "balance_loss_mlp": 1.02191114, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.9620588911833308, + "language_loss": 0.70351875, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72453177, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 2.8571815490722656 + }, + { + "auxiliary_loss_clip": 0.01060942, + "auxiliary_loss_mlp": 0.0103745, + "balance_loss_clip": 1.0321424, + "balance_loss_mlp": 1.02456951, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.6996222924199988, + "language_loss": 0.82570714, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84669107, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.6440439224243164 + }, + { + "auxiliary_loss_clip": 0.01108632, + "auxiliary_loss_mlp": 0.00749626, + "balance_loss_clip": 1.03691626, + "balance_loss_mlp": 1.00058019, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 2.2393214805152515, + "language_loss": 0.68549293, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70407552, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.4932782649993896 + }, + { + "auxiliary_loss_clip": 0.01104544, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.03428352, + "balance_loss_mlp": 1.0179987, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 4.988392825304163, + "language_loss": 0.58614588, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.6074965, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.5035274028778076 + }, + { + "auxiliary_loss_clip": 0.01078216, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.03350163, + "balance_loss_mlp": 1.01833236, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 1.8476780174595009, + "language_loss": 0.75218874, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77327716, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 2.5376391410827637 + }, + { + "auxiliary_loss_clip": 0.01068336, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.0330528, + "balance_loss_mlp": 1.01974273, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 2.2900926601497447, + "language_loss": 0.75848001, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77947962, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 2.6225810050964355 + }, + { + "auxiliary_loss_clip": 0.01081484, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.03493464, + "balance_loss_mlp": 1.01920485, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.5721677873642075, + "language_loss": 0.75073862, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77186483, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 4.0769383907318115 + }, + { + "auxiliary_loss_clip": 0.01080096, + "auxiliary_loss_mlp": 0.01029097, + "balance_loss_clip": 1.0353179, + "balance_loss_mlp": 1.0180347, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 2.879859493012116, + "language_loss": 0.80908948, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83018148, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.6946074962615967 + }, + { + "auxiliary_loss_clip": 0.01076387, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.03421056, + "balance_loss_mlp": 1.01649594, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.8289065670226383, + "language_loss": 0.71804285, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73909467, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.7162654399871826 + }, + { + "auxiliary_loss_clip": 0.01009498, + "auxiliary_loss_mlp": 0.01011594, + "balance_loss_clip": 1.00878513, + "balance_loss_mlp": 1.01028252, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7956909985048528, + "language_loss": 0.53312713, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55333805, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 3.2773654460906982 + }, + { + "auxiliary_loss_clip": 0.01077378, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.03482592, + "balance_loss_mlp": 1.01965487, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.57161998946627, + "language_loss": 0.73072624, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75182176, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 2.641672372817993 + }, + { + "auxiliary_loss_clip": 0.01071463, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.03142428, + "balance_loss_mlp": 1.02341223, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 1.8127514018163091, + "language_loss": 0.81392616, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83502048, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.7409791946411133 + }, + { + "auxiliary_loss_clip": 0.01094879, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.03562665, + "balance_loss_mlp": 1.01949131, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.8604022340598594, + "language_loss": 0.72111684, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74238515, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.5244674682617188 + }, + { + "auxiliary_loss_clip": 0.010839, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.03752065, + "balance_loss_mlp": 1.01795435, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.3004811474395233, + "language_loss": 0.75165868, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77280146, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.508805990219116 + }, + { + "auxiliary_loss_clip": 0.01071276, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.03676891, + "balance_loss_mlp": 1.01907194, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.5672413807383465, + "language_loss": 0.70623183, + "learning_rate": 1.542383242598344e-06, + "loss": 0.72726244, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.5990641117095947 + }, + { + "auxiliary_loss_clip": 0.01107428, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.03632402, + "balance_loss_mlp": 1.02023935, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 3.2627107694568, + "language_loss": 0.74667847, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.7680949, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.526273488998413 + }, + { + "auxiliary_loss_clip": 0.01091423, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.03459191, + "balance_loss_mlp": 1.0198226, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.8630611319826418, + "language_loss": 0.7715773, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79281473, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 2.530494451522827 + }, + { + "auxiliary_loss_clip": 0.0110081, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.03534198, + "balance_loss_mlp": 1.01611519, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.81687355415777, + "language_loss": 0.71053374, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.7318145, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 2.4385647773742676 + }, + { + "auxiliary_loss_clip": 0.01074966, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.03059602, + "balance_loss_mlp": 1.01803839, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 1.6234821814879872, + "language_loss": 0.72070611, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74176419, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 2.5694572925567627 + }, + { + "auxiliary_loss_clip": 0.00996966, + "auxiliary_loss_mlp": 0.01000695, + "balance_loss_clip": 1.00573945, + "balance_loss_mlp": 0.99967593, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7395593472769124, + "language_loss": 0.56977284, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58974946, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 3.1590194702148438 + }, + { + "auxiliary_loss_clip": 0.01101548, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.03443933, + "balance_loss_mlp": 1.01941586, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.6386188970406768, + "language_loss": 0.76270747, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78403002, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 4.384374380111694 + }, + { + "auxiliary_loss_clip": 0.01007025, + "auxiliary_loss_mlp": 0.01000367, + "balance_loss_clip": 1.00646389, + "balance_loss_mlp": 0.99918091, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8596338066369034, + "language_loss": 0.60484463, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62491858, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 4.507591485977173 + }, + { + "auxiliary_loss_clip": 0.01109306, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.03675079, + "balance_loss_mlp": 1.01912522, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 1.9449335322045191, + "language_loss": 0.71683109, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73824298, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.5774478912353516 + }, + { + "auxiliary_loss_clip": 0.01078671, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.03334415, + "balance_loss_mlp": 1.02352715, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5873696940569888, + "language_loss": 0.72927284, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.7504096, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 2.699848175048828 + }, + { + "auxiliary_loss_clip": 0.01090056, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.03483832, + "balance_loss_mlp": 1.01826501, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 2.5367716517255228, + "language_loss": 0.72665036, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74785489, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.560587167739868 + }, + { + "auxiliary_loss_clip": 0.0108883, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.03667188, + "balance_loss_mlp": 1.02199435, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.9344942031455536, + "language_loss": 0.74748117, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.76872528, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 2.575462818145752 + }, + { + "auxiliary_loss_clip": 0.01063441, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.0335778, + "balance_loss_mlp": 1.01955128, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.941861895862359, + "language_loss": 0.72241199, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74336445, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 3.0143861770629883 + }, + { + "auxiliary_loss_clip": 0.01084509, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.03081465, + "balance_loss_mlp": 1.02086782, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.6281995397761038, + "language_loss": 0.80259001, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82375783, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 2.472167491912842 + }, + { + "auxiliary_loss_clip": 0.01077332, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.03717566, + "balance_loss_mlp": 1.02258217, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.656547917227051, + "language_loss": 0.79282868, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81394523, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 2.579803943634033 + }, + { + "auxiliary_loss_clip": 0.01070249, + "auxiliary_loss_mlp": 0.01040274, + "balance_loss_clip": 1.03410816, + "balance_loss_mlp": 1.02757204, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.7279921944580088, + "language_loss": 0.83711958, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.85822481, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 4.03794002532959 + }, + { + "auxiliary_loss_clip": 0.01095056, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.03540134, + "balance_loss_mlp": 1.02109444, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.8024600452168629, + "language_loss": 0.69838244, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71966404, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.59094500541687 + }, + { + "auxiliary_loss_clip": 0.01085663, + "auxiliary_loss_mlp": 0.00749697, + "balance_loss_clip": 1.03348374, + "balance_loss_mlp": 1.00063252, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 3.350484168890879, + "language_loss": 0.63991868, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65827233, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.642700433731079 + }, + { + "auxiliary_loss_clip": 0.01024594, + "auxiliary_loss_mlp": 0.00746926, + "balance_loss_clip": 1.00383759, + "balance_loss_mlp": 1.00023389, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7145910906351279, + "language_loss": 0.53897297, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55668813, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.0862767696380615 + }, + { + "auxiliary_loss_clip": 0.01066817, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.03146911, + "balance_loss_mlp": 1.02282691, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.8278460953406168, + "language_loss": 0.70656574, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.72757578, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.7725064754486084 + }, + { + "auxiliary_loss_clip": 0.01057141, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.0337894, + "balance_loss_mlp": 1.02460742, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 1.9007673543079917, + "language_loss": 0.67159992, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69253874, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.7499196529388428 + }, + { + "auxiliary_loss_clip": 0.01052727, + "auxiliary_loss_mlp": 0.01037981, + "balance_loss_clip": 1.02961445, + "balance_loss_mlp": 1.02346146, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.4652425113837468, + "language_loss": 0.66273713, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68364418, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 2.7214484214782715 + }, + { + "auxiliary_loss_clip": 0.01107719, + "auxiliary_loss_mlp": 0.01044916, + "balance_loss_clip": 1.03745639, + "balance_loss_mlp": 1.03144491, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.7264287181358924, + "language_loss": 0.74542081, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76694715, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 2.581685781478882 + }, + { + "auxiliary_loss_clip": 0.01071648, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.03573346, + "balance_loss_mlp": 1.02832961, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.560356324638521, + "language_loss": 0.53149176, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.5526222, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.675058126449585 + }, + { + "auxiliary_loss_clip": 0.01095871, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.03649116, + "balance_loss_mlp": 1.02164495, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.2686210405469796, + "language_loss": 0.64929068, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.6705963, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 2.7592194080352783 + }, + { + "auxiliary_loss_clip": 0.01088667, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.03359056, + "balance_loss_mlp": 1.02255726, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.646395066345968, + "language_loss": 0.73444903, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75568157, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.598586320877075 + }, + { + "auxiliary_loss_clip": 0.01101995, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.03456724, + "balance_loss_mlp": 1.02133155, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 2.044877952747093, + "language_loss": 0.74497664, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76632726, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.5277578830718994 + }, + { + "auxiliary_loss_clip": 0.01070198, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.03625691, + "balance_loss_mlp": 1.02317548, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.4669152079542331, + "language_loss": 0.73695141, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.75799918, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.692430019378662 + }, + { + "auxiliary_loss_clip": 0.01052552, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.02909911, + "balance_loss_mlp": 1.02349138, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.7413338086363936, + "language_loss": 0.70034039, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72123623, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.6009373664855957 + }, + { + "auxiliary_loss_clip": 0.01105271, + "auxiliary_loss_mlp": 0.00749703, + "balance_loss_clip": 1.03436923, + "balance_loss_mlp": 1.00069904, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 2.141029919583706, + "language_loss": 0.66957229, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.68812203, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 2.4982986450195312 + }, + { + "auxiliary_loss_clip": 0.01075002, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.03626907, + "balance_loss_mlp": 1.02417445, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 1.9712467542191403, + "language_loss": 0.72285211, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.7439692, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 2.5982890129089355 + }, + { + "auxiliary_loss_clip": 0.0107303, + "auxiliary_loss_mlp": 0.00749578, + "balance_loss_clip": 1.03197539, + "balance_loss_mlp": 1.00059891, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.370855777385194, + "language_loss": 0.70600516, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72423124, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 2.5777182579040527 + }, + { + "auxiliary_loss_clip": 0.0108586, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.03135347, + "balance_loss_mlp": 1.02558827, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 2.1080288442684423, + "language_loss": 0.7051658, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72641408, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 4.00823450088501 + }, + { + "auxiliary_loss_clip": 0.01068169, + "auxiliary_loss_mlp": 0.01034635, + "balance_loss_clip": 1.03710675, + "balance_loss_mlp": 1.02158153, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 1.8368302229759248, + "language_loss": 0.69231379, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71334183, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 2.6743977069854736 + }, + { + "auxiliary_loss_clip": 0.01059914, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.03289747, + "balance_loss_mlp": 1.02221334, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 1.8479377219206499, + "language_loss": 0.69667649, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.7176193, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.74788236618042 + }, + { + "auxiliary_loss_clip": 0.01091361, + "auxiliary_loss_mlp": 0.01026484, + "balance_loss_clip": 1.0347116, + "balance_loss_mlp": 1.0150758, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.094150221967384, + "language_loss": 0.77321857, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.794397, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.55692720413208 + }, + { + "auxiliary_loss_clip": 0.01087247, + "auxiliary_loss_mlp": 0.01032499, + "balance_loss_clip": 1.03565407, + "balance_loss_mlp": 1.02046442, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.5036062529541763, + "language_loss": 0.79245377, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81365126, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 2.709022283554077 + }, + { + "auxiliary_loss_clip": 0.01064865, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.03346682, + "balance_loss_mlp": 1.02084017, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.929297134432352, + "language_loss": 0.66266811, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68363762, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.6804184913635254 + }, + { + "auxiliary_loss_clip": 0.01072747, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.03442609, + "balance_loss_mlp": 1.02603042, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.447600717934269, + "language_loss": 0.79977012, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82089657, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.7415432929992676 + }, + { + "auxiliary_loss_clip": 0.01070584, + "auxiliary_loss_mlp": 0.00749415, + "balance_loss_clip": 1.03352618, + "balance_loss_mlp": 1.00056529, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.5950818782556488, + "language_loss": 0.70619559, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72439563, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 2.6235709190368652 + }, + { + "auxiliary_loss_clip": 0.01067131, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.03495884, + "balance_loss_mlp": 1.02143979, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.7852386651881114, + "language_loss": 0.83341718, + "learning_rate": 1.527232084570895e-06, + "loss": 0.8544243, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.6532604694366455 + }, + { + "auxiliary_loss_clip": 0.01087757, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_clip": 1.03452301, + "balance_loss_mlp": 1.03045571, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.569020865776474, + "language_loss": 0.76294762, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78427124, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.580735921859741 + }, + { + "auxiliary_loss_clip": 0.01036413, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.02629066, + "balance_loss_mlp": 1.02912974, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 2.019678051902234, + "language_loss": 0.69246364, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71326673, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 2.7001590728759766 + }, + { + "auxiliary_loss_clip": 0.01100217, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.03445923, + "balance_loss_mlp": 1.02020621, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 2.071545213803296, + "language_loss": 0.60368621, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62501377, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 2.5455567836761475 + }, + { + "auxiliary_loss_clip": 0.01070636, + "auxiliary_loss_mlp": 0.01037624, + "balance_loss_clip": 1.03374946, + "balance_loss_mlp": 1.02466047, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.9976181382551559, + "language_loss": 0.65007591, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67115849, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.623481273651123 + }, + { + "auxiliary_loss_clip": 0.01062026, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.03188097, + "balance_loss_mlp": 1.02728915, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.5310899770067499, + "language_loss": 0.7419281, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76293373, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 4.169341325759888 + }, + { + "auxiliary_loss_clip": 0.01074631, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.03287828, + "balance_loss_mlp": 1.01902211, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.5694884945991356, + "language_loss": 0.83062518, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85167807, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 2.6666183471679688 + }, + { + "auxiliary_loss_clip": 0.01072783, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.03051984, + "balance_loss_mlp": 1.01582909, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 4.049829550219861, + "language_loss": 0.79285067, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81386137, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 4.031061410903931 + }, + { + "auxiliary_loss_clip": 0.01100796, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.0348146, + "balance_loss_mlp": 1.02358627, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.5416244893147795, + "language_loss": 0.73854184, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.75989711, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 2.4851346015930176 + }, + { + "auxiliary_loss_clip": 0.01064937, + "auxiliary_loss_mlp": 0.01031762, + "balance_loss_clip": 1.03127122, + "balance_loss_mlp": 1.01839864, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 2.3264363523097327, + "language_loss": 0.76576138, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78672832, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.627639055252075 + }, + { + "auxiliary_loss_clip": 0.0104861, + "auxiliary_loss_mlp": 0.01047319, + "balance_loss_clip": 1.02876401, + "balance_loss_mlp": 1.03312719, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 2.5140620445103385, + "language_loss": 0.78498, + "learning_rate": 1.523448741022722e-06, + "loss": 0.80593926, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.712562084197998 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.03410733, + "balance_loss_mlp": 1.01937389, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 2.0491039385250533, + "language_loss": 0.66079849, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68178821, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.6617698669433594 + }, + { + "auxiliary_loss_clip": 0.01091182, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.03513408, + "balance_loss_mlp": 1.01865542, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.6707513572513362, + "language_loss": 0.77680755, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.79802465, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 2.5879340171813965 + }, + { + "auxiliary_loss_clip": 0.01092409, + "auxiliary_loss_mlp": 0.01035746, + "balance_loss_clip": 1.0346539, + "balance_loss_mlp": 1.0240221, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.4571642123921515, + "language_loss": 0.73396289, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75524443, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.6214663982391357 + }, + { + "auxiliary_loss_clip": 0.01074102, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.0344944, + "balance_loss_mlp": 1.01836991, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.4861421302686748, + "language_loss": 0.74558109, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76662576, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.6619396209716797 + }, + { + "auxiliary_loss_clip": 0.01097521, + "auxiliary_loss_mlp": 0.00749787, + "balance_loss_clip": 1.03491414, + "balance_loss_mlp": 1.00059319, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 2.349815174260536, + "language_loss": 0.77968585, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.79815888, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 4.147475242614746 + }, + { + "auxiliary_loss_clip": 0.01102594, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.03462124, + "balance_loss_mlp": 1.01683915, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 1.93015262066969, + "language_loss": 0.77030051, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79161793, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.598991870880127 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.03806782, + "balance_loss_mlp": 1.0202024, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 1.6673303967302617, + "language_loss": 0.74243307, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76375026, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.501950740814209 + }, + { + "auxiliary_loss_clip": 0.01050269, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.03259766, + "balance_loss_mlp": 1.01817381, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 1.9461347638274278, + "language_loss": 0.7147963, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.7356168, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.647481918334961 + }, + { + "auxiliary_loss_clip": 0.01084409, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.03550076, + "balance_loss_mlp": 1.02099299, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 1.878852357849727, + "language_loss": 0.82041699, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84159976, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 2.5330560207366943 + }, + { + "auxiliary_loss_clip": 0.01093427, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.03571355, + "balance_loss_mlp": 1.01867568, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.7609573779576495, + "language_loss": 0.80863404, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.82987398, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 2.5460174083709717 + }, + { + "auxiliary_loss_clip": 0.01095153, + "auxiliary_loss_mlp": 0.0103055, + "balance_loss_clip": 1.03575802, + "balance_loss_mlp": 1.01709127, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 2.030126385733282, + "language_loss": 0.76680094, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78805798, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 2.574716806411743 + }, + { + "auxiliary_loss_clip": 0.01067084, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.03374314, + "balance_loss_mlp": 1.01752865, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 5.441975622408726, + "language_loss": 0.70285016, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72380048, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 2.55772066116333 + }, + { + "auxiliary_loss_clip": 0.01075748, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.03444552, + "balance_loss_mlp": 1.0205214, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.50557934676239, + "language_loss": 0.7229569, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74403924, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 2.592979907989502 + }, + { + "auxiliary_loss_clip": 0.01075551, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.03485751, + "balance_loss_mlp": 1.01854038, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 2.8412012191762206, + "language_loss": 0.78469354, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80575514, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.6188197135925293 + }, + { + "auxiliary_loss_clip": 0.01079046, + "auxiliary_loss_mlp": 0.00749736, + "balance_loss_clip": 1.03666222, + "balance_loss_mlp": 1.00060165, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 2.8632845141944787, + "language_loss": 0.75708354, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.77537137, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.657193183898926 + }, + { + "auxiliary_loss_clip": 0.01104916, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.03796566, + "balance_loss_mlp": 1.02173305, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.013984178245552, + "language_loss": 0.80963731, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83102769, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.5207695960998535 + }, + { + "auxiliary_loss_clip": 0.01046212, + "auxiliary_loss_mlp": 0.01040611, + "balance_loss_clip": 1.03196049, + "balance_loss_mlp": 1.02715266, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.6959238638492822, + "language_loss": 0.76779181, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78866005, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 2.688955307006836 + }, + { + "auxiliary_loss_clip": 0.01067047, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.03457499, + "balance_loss_mlp": 1.01856565, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 2.2500045411845666, + "language_loss": 0.66554236, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68651354, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.692289113998413 + }, + { + "auxiliary_loss_clip": 0.01104661, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.03662515, + "balance_loss_mlp": 1.02103543, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.6099208685037958, + "language_loss": 0.78005528, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80143023, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 2.51191782951355 + }, + { + "auxiliary_loss_clip": 0.00999608, + "auxiliary_loss_mlp": 0.01008623, + "balance_loss_clip": 1.01173723, + "balance_loss_mlp": 1.00727022, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9292748537084411, + "language_loss": 0.65143275, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67151499, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 3.1538631916046143 + }, + { + "auxiliary_loss_clip": 0.01061978, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.0335319, + "balance_loss_mlp": 1.02577364, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.822647388828137, + "language_loss": 0.61663324, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63763082, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 4.105116844177246 + }, + { + "auxiliary_loss_clip": 0.01104703, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.03560305, + "balance_loss_mlp": 1.0252564, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.7404833578693752, + "language_loss": 0.82538807, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84681118, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.564215660095215 + }, + { + "auxiliary_loss_clip": 0.01082288, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.03539133, + "balance_loss_mlp": 1.01943064, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 9.420730391517466, + "language_loss": 0.72755367, + "learning_rate": 1.514753932336165e-06, + "loss": 0.74869466, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.5621330738067627 + }, + { + "auxiliary_loss_clip": 0.01076611, + "auxiliary_loss_mlp": 0.00749848, + "balance_loss_clip": 1.03361499, + "balance_loss_mlp": 1.00056124, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.636000718104043, + "language_loss": 0.82734489, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84560955, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 2.6222310066223145 + }, + { + "auxiliary_loss_clip": 0.01088314, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.0344274, + "balance_loss_mlp": 1.01821399, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.75751675208982, + "language_loss": 0.76163507, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78280449, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 2.616313934326172 + }, + { + "auxiliary_loss_clip": 0.01079505, + "auxiliary_loss_mlp": 0.01025269, + "balance_loss_clip": 1.03550005, + "balance_loss_mlp": 1.01414073, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.573527367754564, + "language_loss": 0.72074687, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74179459, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.599677324295044 + }, + { + "auxiliary_loss_clip": 0.01058193, + "auxiliary_loss_mlp": 0.01027396, + "balance_loss_clip": 1.03289199, + "balance_loss_mlp": 1.01640534, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.6615293988381625, + "language_loss": 0.79615259, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.8170085, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 2.662562370300293 + }, + { + "auxiliary_loss_clip": 0.01048974, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.03392744, + "balance_loss_mlp": 1.02333295, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 3.3940424340732562, + "language_loss": 0.88444167, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90528733, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 2.6311450004577637 + }, + { + "auxiliary_loss_clip": 0.0101106, + "auxiliary_loss_mlp": 0.00999161, + "balance_loss_clip": 1.01005268, + "balance_loss_mlp": 0.9981184, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7547016897800967, + "language_loss": 0.57862598, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59872818, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 3.118765354156494 + }, + { + "auxiliary_loss_clip": 0.01098938, + "auxiliary_loss_mlp": 0.00749619, + "balance_loss_clip": 1.0367713, + "balance_loss_mlp": 1.00049114, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.2120782299753903, + "language_loss": 0.75835925, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.77684486, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 2.632960081100464 + }, + { + "auxiliary_loss_clip": 0.01072733, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.03482485, + "balance_loss_mlp": 1.0156703, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.8747361194671577, + "language_loss": 0.77610546, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79710615, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.5961966514587402 + }, + { + "auxiliary_loss_clip": 0.0109101, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.03506851, + "balance_loss_mlp": 1.01495457, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.719244677305196, + "language_loss": 0.83555907, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85673523, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.531480550765991 + }, + { + "auxiliary_loss_clip": 0.0109025, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.03335178, + "balance_loss_mlp": 1.01895285, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 6.689207025015653, + "language_loss": 0.74450433, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76571882, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.5729053020477295 + }, + { + "auxiliary_loss_clip": 0.01103077, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.03520274, + "balance_loss_mlp": 1.02452779, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 3.1718935171699045, + "language_loss": 0.7814312, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80282426, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 2.554725408554077 + }, + { + "auxiliary_loss_clip": 0.01076697, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_clip": 1.03205872, + "balance_loss_mlp": 1.01900065, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.032527135566422, + "language_loss": 0.73765194, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.75873077, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 4.1454808712005615 + }, + { + "auxiliary_loss_clip": 0.0106359, + "auxiliary_loss_mlp": 0.01036068, + "balance_loss_clip": 1.03184247, + "balance_loss_mlp": 1.02291298, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.2042139992480054, + "language_loss": 0.82059085, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84158742, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 4.058758497238159 + }, + { + "auxiliary_loss_clip": 0.01058037, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.03133297, + "balance_loss_mlp": 1.01975989, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.740936411681874, + "language_loss": 0.79653764, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81744206, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 2.648857593536377 + }, + { + "auxiliary_loss_clip": 0.01058455, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.03545403, + "balance_loss_mlp": 1.02674341, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 5.984693138852644, + "language_loss": 0.69751877, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71849328, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.6152281761169434 + }, + { + "auxiliary_loss_clip": 0.01077775, + "auxiliary_loss_mlp": 0.01038579, + "balance_loss_clip": 1.03439033, + "balance_loss_mlp": 1.0264498, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 1.784084861928392, + "language_loss": 0.65871119, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.67987478, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.553196668624878 + }, + { + "auxiliary_loss_clip": 0.01075492, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.03385532, + "balance_loss_mlp": 1.01802754, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.6432694731489754, + "language_loss": 0.81537652, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83643293, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 2.671995162963867 + }, + { + "auxiliary_loss_clip": 0.0107448, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.03354526, + "balance_loss_mlp": 1.01879585, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 4.702091494977631, + "language_loss": 0.69260132, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71364343, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.5754709243774414 + }, + { + "auxiliary_loss_clip": 0.01076974, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.03301859, + "balance_loss_mlp": 1.01916027, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 2.1126295939755395, + "language_loss": 0.82593215, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.8470124, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.587643623352051 + }, + { + "auxiliary_loss_clip": 0.01070842, + "auxiliary_loss_mlp": 0.010285, + "balance_loss_clip": 1.03042078, + "balance_loss_mlp": 1.01526141, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.2477500201685134, + "language_loss": 0.80946398, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.83045745, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.5480523109436035 + }, + { + "auxiliary_loss_clip": 0.01053325, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03496468, + "balance_loss_mlp": 1.01571488, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.9446944594841267, + "language_loss": 0.74237049, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76318514, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.6280200481414795 + }, + { + "auxiliary_loss_clip": 0.01059951, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.03080392, + "balance_loss_mlp": 1.02124095, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.8129180917286334, + "language_loss": 0.63607633, + "learning_rate": 1.506446264718213e-06, + "loss": 0.6570338, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 4.2861833572387695 + }, + { + "auxiliary_loss_clip": 0.01060778, + "auxiliary_loss_mlp": 0.0074917, + "balance_loss_clip": 1.03199172, + "balance_loss_mlp": 1.00045478, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 2.008822185666887, + "language_loss": 0.76274139, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78084087, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.695917844772339 + }, + { + "auxiliary_loss_clip": 0.0107433, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.03242052, + "balance_loss_mlp": 1.01751459, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.6372343932690938, + "language_loss": 0.62680215, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64784396, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.5844662189483643 + }, + { + "auxiliary_loss_clip": 0.01093096, + "auxiliary_loss_mlp": 0.01036807, + "balance_loss_clip": 1.03552866, + "balance_loss_mlp": 1.02493334, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.5137465074555054, + "language_loss": 0.76054579, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.78184485, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.5672426223754883 + }, + { + "auxiliary_loss_clip": 0.01077647, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.03399456, + "balance_loss_mlp": 1.02154756, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 3.7349975278877037, + "language_loss": 0.75459373, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77570647, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 2.6143603324890137 + }, + { + "auxiliary_loss_clip": 0.01061487, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.03276682, + "balance_loss_mlp": 1.01844704, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.7042386892629853, + "language_loss": 0.75838649, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77930778, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 2.647763729095459 + }, + { + "auxiliary_loss_clip": 0.01081229, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.03592694, + "balance_loss_mlp": 1.0165298, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 2.0585936113323773, + "language_loss": 0.70421726, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72530723, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.6312503814697266 + }, + { + "auxiliary_loss_clip": 0.01087769, + "auxiliary_loss_mlp": 0.0074979, + "balance_loss_clip": 1.03750825, + "balance_loss_mlp": 1.0005579, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 1.7045318746384874, + "language_loss": 0.8057462, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82412183, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 2.5903353691101074 + }, + { + "auxiliary_loss_clip": 0.01076145, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.03322411, + "balance_loss_mlp": 1.01777196, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.525900986256631, + "language_loss": 0.67238677, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.6934303, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.670238494873047 + }, + { + "auxiliary_loss_clip": 0.01063718, + "auxiliary_loss_mlp": 0.01026886, + "balance_loss_clip": 1.03353584, + "balance_loss_mlp": 1.015275, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.8876580405235919, + "language_loss": 0.88660848, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90751457, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.713144540786743 + }, + { + "auxiliary_loss_clip": 0.01082354, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.03418326, + "balance_loss_mlp": 1.01689756, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.800248397187262, + "language_loss": 0.86609137, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88719493, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 2.527653455734253 + }, + { + "auxiliary_loss_clip": 0.01090815, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.0341053, + "balance_loss_mlp": 1.02215528, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 1.8920460556267122, + "language_loss": 0.77874935, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79999197, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 2.54489803314209 + }, + { + "auxiliary_loss_clip": 0.01052503, + "auxiliary_loss_mlp": 0.01046625, + "balance_loss_clip": 1.03033531, + "balance_loss_mlp": 1.03277326, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 2.0571981948160545, + "language_loss": 0.6443845, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66537583, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 2.660223960876465 + }, + { + "auxiliary_loss_clip": 0.01089154, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.03463411, + "balance_loss_mlp": 1.01790416, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 2.208746367227933, + "language_loss": 0.76870996, + "learning_rate": 1.501541436426501e-06, + "loss": 0.78989339, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 2.58736252784729 + }, + { + "auxiliary_loss_clip": 0.01063811, + "auxiliary_loss_mlp": 0.00749683, + "balance_loss_clip": 1.03738284, + "balance_loss_mlp": 1.00045061, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 4.601090607844745, + "language_loss": 0.74583787, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.76397276, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.6818771362304688 + }, + { + "auxiliary_loss_clip": 0.01064474, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.03396153, + "balance_loss_mlp": 1.0227766, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.9638307053153625, + "language_loss": 0.76332021, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78429997, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 2.647920608520508 + }, + { + "auxiliary_loss_clip": 0.0106556, + "auxiliary_loss_mlp": 0.01026604, + "balance_loss_clip": 1.0319525, + "balance_loss_mlp": 1.01633978, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.511698105948102, + "language_loss": 0.70780718, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72872889, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 4.201192617416382 + }, + { + "auxiliary_loss_clip": 0.01040899, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.02909231, + "balance_loss_mlp": 1.01961148, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 2.024431252279999, + "language_loss": 0.778597, + "learning_rate": 1.500032899685832e-06, + "loss": 0.79931951, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.7094664573669434 + }, + { + "auxiliary_loss_clip": 0.01074845, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.03380609, + "balance_loss_mlp": 1.02802801, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.7625721780459545, + "language_loss": 0.70797867, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72912657, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.627932548522949 + }, + { + "auxiliary_loss_clip": 0.0107324, + "auxiliary_loss_mlp": 0.0103479, + "balance_loss_clip": 1.03337455, + "balance_loss_mlp": 1.02230287, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.4600803654453107, + "language_loss": 0.67432868, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.695409, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 2.6381139755249023 + }, + { + "auxiliary_loss_clip": 0.01082705, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.0334022, + "balance_loss_mlp": 1.02066016, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 2.3383123783627213, + "language_loss": 0.78166443, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80281806, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 2.5706307888031006 + }, + { + "auxiliary_loss_clip": 0.01075944, + "auxiliary_loss_mlp": 0.01023481, + "balance_loss_clip": 1.03454423, + "balance_loss_mlp": 1.01266921, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 1.9213738900307833, + "language_loss": 0.71966481, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74065912, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.6610474586486816 + }, + { + "auxiliary_loss_clip": 0.01080446, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03561866, + "balance_loss_mlp": 1.02041364, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.5017358811772465, + "language_loss": 0.66498291, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68611312, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 2.54329776763916 + }, + { + "auxiliary_loss_clip": 0.01044013, + "auxiliary_loss_mlp": 0.00749543, + "balance_loss_clip": 1.03087807, + "balance_loss_mlp": 1.00037372, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 2.0351477772571203, + "language_loss": 0.75268477, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77062035, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 2.7133750915527344 + }, + { + "auxiliary_loss_clip": 0.01048793, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.03434038, + "balance_loss_mlp": 1.02619052, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.7868407120084449, + "language_loss": 0.7435317, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76440418, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 3.028484582901001 + }, + { + "auxiliary_loss_clip": 0.01054412, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.03170002, + "balance_loss_mlp": 1.01527238, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.2661166519146114, + "language_loss": 0.71672672, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.7375403, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.656709671020508 + }, + { + "auxiliary_loss_clip": 0.01068858, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.03425765, + "balance_loss_mlp": 1.0202744, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 2.114812939422978, + "language_loss": 0.7447899, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76580536, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 2.612736225128174 + }, + { + "auxiliary_loss_clip": 0.01096738, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.036286, + "balance_loss_mlp": 1.02080286, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.261811336629711, + "language_loss": 0.78824288, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.80954826, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.539902925491333 + }, + { + "auxiliary_loss_clip": 0.01090646, + "auxiliary_loss_mlp": 0.01031865, + "balance_loss_clip": 1.03353047, + "balance_loss_mlp": 1.01956856, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.4789622364304866, + "language_loss": 0.84787095, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.8690961, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.567511558532715 + }, + { + "auxiliary_loss_clip": 0.0100702, + "auxiliary_loss_mlp": 0.01000114, + "balance_loss_clip": 1.00562048, + "balance_loss_mlp": 0.99910021, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.7083665968730507, + "language_loss": 0.5999279, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.61999929, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 4.710608720779419 + }, + { + "auxiliary_loss_clip": 0.0107606, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.03107464, + "balance_loss_mlp": 1.01875639, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 1.8867555482983542, + "language_loss": 0.77486002, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.7959435, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 2.5597152709960938 + }, + { + "auxiliary_loss_clip": 0.01082353, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.03018653, + "balance_loss_mlp": 1.01653254, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.5987778458046595, + "language_loss": 0.7605738, + "learning_rate": 1.494755415907243e-06, + "loss": 0.78167105, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 4.17846941947937 + }, + { + "auxiliary_loss_clip": 0.01091681, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.03264487, + "balance_loss_mlp": 1.01586294, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.4780175932243624, + "language_loss": 0.81412065, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83531839, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 2.5153331756591797 + }, + { + "auxiliary_loss_clip": 0.0107267, + "auxiliary_loss_mlp": 0.00749564, + "balance_loss_clip": 1.03270078, + "balance_loss_mlp": 1.00044632, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.9861977758101412, + "language_loss": 0.70521933, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.72344166, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 2.81198787689209 + }, + { + "auxiliary_loss_clip": 0.01090457, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.03415227, + "balance_loss_mlp": 1.02506804, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.498571750611269, + "language_loss": 0.57817137, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59944189, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 2.583348274230957 + }, + { + "auxiliary_loss_clip": 0.01092373, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.03441906, + "balance_loss_mlp": 1.02312851, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.9146549642633846, + "language_loss": 0.77363336, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.7949096, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.6692750453948975 + }, + { + "auxiliary_loss_clip": 0.01089419, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.03291583, + "balance_loss_mlp": 1.01535606, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.280920161698972, + "language_loss": 0.82419372, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84536034, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.520875930786133 + }, + { + "auxiliary_loss_clip": 0.01092229, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.03495753, + "balance_loss_mlp": 1.02217698, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.670266024136156, + "language_loss": 0.79369378, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81495041, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.5379979610443115 + }, + { + "auxiliary_loss_clip": 0.01076188, + "auxiliary_loss_mlp": 0.00749732, + "balance_loss_clip": 1.03798568, + "balance_loss_mlp": 1.00051379, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.8238167788300157, + "language_loss": 0.74471474, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.7629739, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.6509780883789062 + }, + { + "auxiliary_loss_clip": 0.01105685, + "auxiliary_loss_mlp": 0.01031865, + "balance_loss_clip": 1.03819406, + "balance_loss_mlp": 1.02003932, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 2.3813482779680237, + "language_loss": 0.66415799, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68553346, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.649097204208374 + }, + { + "auxiliary_loss_clip": 0.01079744, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.03454185, + "balance_loss_mlp": 1.02571559, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 4.037310570497678, + "language_loss": 0.77010107, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79127699, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 4.130960464477539 + }, + { + "auxiliary_loss_clip": 0.01019487, + "auxiliary_loss_mlp": 0.01002032, + "balance_loss_clip": 1.01103151, + "balance_loss_mlp": 1.00084555, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8485800101678622, + "language_loss": 0.64553165, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66574681, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 2.9838650226593018 + }, + { + "auxiliary_loss_clip": 0.01086854, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.03351712, + "balance_loss_mlp": 1.01786685, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.7107809995342704, + "language_loss": 0.69582129, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71698928, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.5916237831115723 + }, + { + "auxiliary_loss_clip": 0.01072211, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.03265429, + "balance_loss_mlp": 1.01694894, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.542110078053323, + "language_loss": 0.79366386, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81468177, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 2.627213954925537 + }, + { + "auxiliary_loss_clip": 0.01067509, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.03292775, + "balance_loss_mlp": 1.0204941, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 2.7571069295333888, + "language_loss": 0.70502436, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72601855, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 2.677565574645996 + }, + { + "auxiliary_loss_clip": 0.01073164, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.03683281, + "balance_loss_mlp": 1.01903057, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 2.057732502498831, + "language_loss": 0.6915251, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71257383, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 2.6476564407348633 + }, + { + "auxiliary_loss_clip": 0.01087888, + "auxiliary_loss_mlp": 0.01037395, + "balance_loss_clip": 1.03311396, + "balance_loss_mlp": 1.02587986, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.9263199905795205, + "language_loss": 0.53798461, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55923748, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.657181739807129 + }, + { + "auxiliary_loss_clip": 0.00999217, + "auxiliary_loss_mlp": 0.01005223, + "balance_loss_clip": 1.01030207, + "balance_loss_mlp": 1.00407302, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6599424917405834, + "language_loss": 0.54569709, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56574148, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.250948667526245 + }, + { + "auxiliary_loss_clip": 0.01066162, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.02002287, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.9749032403883249, + "language_loss": 0.75239933, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.77337468, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 2.6718595027923584 + }, + { + "auxiliary_loss_clip": 0.01069557, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.03432322, + "balance_loss_mlp": 1.01918328, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 2.0645846559997953, + "language_loss": 0.77630836, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79731137, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.602170467376709 + }, + { + "auxiliary_loss_clip": 0.01046594, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.03056788, + "balance_loss_mlp": 1.01580608, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.612288551922638, + "language_loss": 0.7904439, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81119657, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 2.671616792678833 + }, + { + "auxiliary_loss_clip": 0.01092178, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03493762, + "balance_loss_mlp": 1.01892352, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 3.1027771632599745, + "language_loss": 0.83447778, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.85570514, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.591376781463623 + }, + { + "auxiliary_loss_clip": 0.01075701, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.03500772, + "balance_loss_mlp": 1.02106023, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 2.5284517048384507, + "language_loss": 0.70584881, + "learning_rate": 1.486846243389939e-06, + "loss": 0.72693235, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 2.602773666381836 + }, + { + "auxiliary_loss_clip": 0.01086822, + "auxiliary_loss_mlp": 0.01043505, + "balance_loss_clip": 1.03197014, + "balance_loss_mlp": 1.02836502, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.120583672396771, + "language_loss": 0.63901907, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66032237, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.6240994930267334 + }, + { + "auxiliary_loss_clip": 0.01102552, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.03708148, + "balance_loss_mlp": 1.01667583, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.719333332510216, + "language_loss": 0.72087264, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74217105, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 2.5682082176208496 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.03563762, + "balance_loss_mlp": 1.02122951, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.68792164281385, + "language_loss": 0.84384495, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86517763, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.566836357116699 + }, + { + "auxiliary_loss_clip": 0.00977249, + "auxiliary_loss_mlp": 0.01006983, + "balance_loss_clip": 1.00797701, + "balance_loss_mlp": 1.00593364, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8139396154260515, + "language_loss": 0.5822438, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60208613, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 4.620957612991333 + }, + { + "auxiliary_loss_clip": 0.01042225, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.03211033, + "balance_loss_mlp": 1.01765227, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 1.6408151146808108, + "language_loss": 0.76888752, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.78961122, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 2.870126724243164 + }, + { + "auxiliary_loss_clip": 0.01067819, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.03692269, + "balance_loss_mlp": 1.02066398, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.700672024379209, + "language_loss": 0.77790201, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79889703, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 2.8004939556121826 + }, + { + "auxiliary_loss_clip": 0.01088031, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.03448522, + "balance_loss_mlp": 1.02243304, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.4607991183549554, + "language_loss": 0.7279219, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74914265, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 2.702571153640747 + }, + { + "auxiliary_loss_clip": 0.0109045, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.0332737, + "balance_loss_mlp": 1.01501656, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.7974633982772077, + "language_loss": 0.6971873, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71836543, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 2.6009891033172607 + }, + { + "auxiliary_loss_clip": 0.01090882, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.03468895, + "balance_loss_mlp": 1.01866484, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.6592585013595493, + "language_loss": 0.75023222, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77144539, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 2.6006593704223633 + }, + { + "auxiliary_loss_clip": 0.01073438, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.03256154, + "balance_loss_mlp": 1.01944244, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.5647543954108152, + "language_loss": 0.66801411, + "learning_rate": 1.483082978767595e-06, + "loss": 0.68905824, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.715961217880249 + }, + { + "auxiliary_loss_clip": 0.0101976, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.02647543, + "balance_loss_mlp": 1.01675391, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 1.8525531164610072, + "language_loss": 0.76375067, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78422999, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.6811728477478027 + }, + { + "auxiliary_loss_clip": 0.01025137, + "auxiliary_loss_mlp": 0.01002182, + "balance_loss_clip": 1.00451696, + "balance_loss_mlp": 1.00119865, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9354832413884049, + "language_loss": 0.73423374, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75450695, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 3.1412105560302734 + }, + { + "auxiliary_loss_clip": 0.01077931, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.03380179, + "balance_loss_mlp": 1.01973653, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.7279644452357446, + "language_loss": 0.69398504, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71508515, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.619795560836792 + }, + { + "auxiliary_loss_clip": 0.0109696, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.03574097, + "balance_loss_mlp": 1.02022839, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 3.0004466650131634, + "language_loss": 0.66146874, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.68276787, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.53548264503479 + }, + { + "auxiliary_loss_clip": 0.01069821, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.03344512, + "balance_loss_mlp": 1.0225575, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.6842754964086977, + "language_loss": 0.73227143, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.7533164, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.698957920074463 + }, + { + "auxiliary_loss_clip": 0.01067585, + "auxiliary_loss_mlp": 0.00749451, + "balance_loss_clip": 1.03237462, + "balance_loss_mlp": 1.00049877, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 1.9947698788016583, + "language_loss": 0.80187738, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.82004768, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 2.7789580821990967 + }, + { + "auxiliary_loss_clip": 0.01062827, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.03186166, + "balance_loss_mlp": 1.01702499, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.8526644016477396, + "language_loss": 0.67511845, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69602907, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 2.5609850883483887 + }, + { + "auxiliary_loss_clip": 0.01072707, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.03229845, + "balance_loss_mlp": 1.01972604, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 2.007314459265642, + "language_loss": 0.79040986, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81144333, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 4.144482135772705 + }, + { + "auxiliary_loss_clip": 0.01075985, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.03135943, + "balance_loss_mlp": 1.01798904, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 2.0843365669646197, + "language_loss": 0.83026862, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.85132533, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 2.5898685455322266 + }, + { + "auxiliary_loss_clip": 0.01071461, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.03275967, + "balance_loss_mlp": 1.02264428, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.7954666986553371, + "language_loss": 0.77362752, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79468805, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 3.9991512298583984 + }, + { + "auxiliary_loss_clip": 0.01092033, + "auxiliary_loss_mlp": 0.01035506, + "balance_loss_clip": 1.03577793, + "balance_loss_mlp": 1.02372861, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.7318157257176878, + "language_loss": 0.7902441, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81151944, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 2.638498306274414 + }, + { + "auxiliary_loss_clip": 0.01077044, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.03371811, + "balance_loss_mlp": 1.0238241, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.7971343730846505, + "language_loss": 0.77742577, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79855764, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.569995164871216 + }, + { + "auxiliary_loss_clip": 0.01087376, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.02428401, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.84213770201085, + "language_loss": 0.82693624, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84817863, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.5234785079956055 + }, + { + "auxiliary_loss_clip": 0.01084336, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.03253007, + "balance_loss_mlp": 1.01765966, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 2.267580716074375, + "language_loss": 0.80863333, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82976979, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.561568260192871 + }, + { + "auxiliary_loss_clip": 0.01089747, + "auxiliary_loss_mlp": 0.00749308, + "balance_loss_clip": 1.03288102, + "balance_loss_mlp": 1.0004425, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.8496078759936851, + "language_loss": 0.76639783, + "learning_rate": 1.477441761580111e-06, + "loss": 0.78478843, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 2.5678725242614746 + }, + { + "auxiliary_loss_clip": 0.01088063, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.03636312, + "balance_loss_mlp": 1.02387142, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.7589613073085884, + "language_loss": 0.75906372, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78031284, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.6152806282043457 + }, + { + "auxiliary_loss_clip": 0.01080724, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.03170228, + "balance_loss_mlp": 1.02190864, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 1.8120508499488341, + "language_loss": 0.66443044, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68558842, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.6417269706726074 + }, + { + "auxiliary_loss_clip": 0.0107109, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.03524482, + "balance_loss_mlp": 1.0242219, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.1106477276232107, + "language_loss": 0.71234572, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.7334224, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 4.1020801067352295 + }, + { + "auxiliary_loss_clip": 0.01053418, + "auxiliary_loss_mlp": 0.00749447, + "balance_loss_clip": 1.03228927, + "balance_loss_mlp": 1.0004549, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.7359327226169583, + "language_loss": 0.70254588, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.7205745, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 2.86922287940979 + }, + { + "auxiliary_loss_clip": 0.010559, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.03213644, + "balance_loss_mlp": 1.01779032, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.7469355904912702, + "language_loss": 0.63824576, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65910578, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 2.807413101196289 + }, + { + "auxiliary_loss_clip": 0.01098205, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.03247285, + "balance_loss_mlp": 1.02055979, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.6802610724170826, + "language_loss": 0.69344163, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71474111, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 2.5829715728759766 + }, + { + "auxiliary_loss_clip": 0.01053674, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.03346729, + "balance_loss_mlp": 1.02342367, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.7867753836051186, + "language_loss": 0.76631665, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78719157, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.7281486988067627 + }, + { + "auxiliary_loss_clip": 0.01078483, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.03582394, + "balance_loss_mlp": 1.01828361, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.8788872075085947, + "language_loss": 0.68892276, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71001709, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.590294122695923 + }, + { + "auxiliary_loss_clip": 0.01016369, + "auxiliary_loss_mlp": 0.01010497, + "balance_loss_clip": 1.00874615, + "balance_loss_mlp": 1.00940061, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.865941426534196, + "language_loss": 0.64304721, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66331589, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 3.064838171005249 + }, + { + "auxiliary_loss_clip": 0.01070093, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.03433657, + "balance_loss_mlp": 1.0174216, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.7985782794098633, + "language_loss": 0.74140817, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76240396, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 2.6151211261749268 + }, + { + "auxiliary_loss_clip": 0.01019785, + "auxiliary_loss_mlp": 0.01013156, + "balance_loss_clip": 1.01440239, + "balance_loss_mlp": 1.01215494, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6712403741633662, + "language_loss": 0.51982421, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54015362, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 3.2484660148620605 + }, + { + "auxiliary_loss_clip": 0.0102694, + "auxiliary_loss_mlp": 0.0099893, + "balance_loss_clip": 1.00622892, + "balance_loss_mlp": 0.9979108, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8303430784423314, + "language_loss": 0.54215157, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.5624103, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 3.017425775527954 + }, + { + "auxiliary_loss_clip": 0.01080554, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.03380084, + "balance_loss_mlp": 1.02167654, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.6853144845013959, + "language_loss": 0.6533891, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.67453271, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.7063751220703125 + }, + { + "auxiliary_loss_clip": 0.01045956, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.03221345, + "balance_loss_mlp": 1.02299047, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.138753129021492, + "language_loss": 0.67422247, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69502336, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.6241888999938965 + }, + { + "auxiliary_loss_clip": 0.01093117, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.03433836, + "balance_loss_mlp": 1.01687312, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 1.9791529460975108, + "language_loss": 0.77227736, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79349703, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": 2.600990056991577 + }, + { + "auxiliary_loss_clip": 0.01088015, + "auxiliary_loss_mlp": 0.01027617, + "balance_loss_clip": 1.03233051, + "balance_loss_mlp": 1.01597059, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.5705446243807035, + "language_loss": 0.76215851, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78331482, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 2.5950257778167725 + }, + { + "auxiliary_loss_clip": 0.01053, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.0330199, + "balance_loss_mlp": 1.01700759, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.8916219622967656, + "language_loss": 0.68885374, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70969224, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 2.6766104698181152 + }, + { + "auxiliary_loss_clip": 0.01075946, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.03519869, + "balance_loss_mlp": 1.02138591, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3819514126038601, + "language_loss": 0.69815868, + "learning_rate": 1.470678190375664e-06, + "loss": 0.7192409, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 2.7025256156921387 + }, + { + "auxiliary_loss_clip": 0.0107787, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.03241134, + "balance_loss_mlp": 1.018188, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 1.714838746790828, + "language_loss": 0.77809501, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79917002, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 4.058542728424072 + }, + { + "auxiliary_loss_clip": 0.01056695, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.0341444, + "balance_loss_mlp": 1.02386761, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 2.194263065768673, + "language_loss": 0.75527394, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.77620173, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 2.660365581512451 + }, + { + "auxiliary_loss_clip": 0.01031054, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.03077435, + "balance_loss_mlp": 1.01975429, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 2.0743781588418346, + "language_loss": 0.62077981, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64139569, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 2.8326146602630615 + }, + { + "auxiliary_loss_clip": 0.0108235, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.03566837, + "balance_loss_mlp": 1.0250535, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 2.067027804413929, + "language_loss": 0.72432214, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74551892, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 2.714686632156372 + }, + { + "auxiliary_loss_clip": 0.01051469, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.03385496, + "balance_loss_mlp": 1.02161098, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 2.7954571602584073, + "language_loss": 0.67199868, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.6928488, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.7132627964019775 + }, + { + "auxiliary_loss_clip": 0.01085805, + "auxiliary_loss_mlp": 0.01041864, + "balance_loss_clip": 1.03267419, + "balance_loss_mlp": 1.02826822, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 1.991553250774995, + "language_loss": 0.88482177, + "learning_rate": 1.468425107717461e-06, + "loss": 0.90609848, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.6443798542022705 + }, + { + "auxiliary_loss_clip": 0.01098119, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.03426826, + "balance_loss_mlp": 1.02019072, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.8152740605828528, + "language_loss": 0.72219789, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74348384, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 2.5942981243133545 + }, + { + "auxiliary_loss_clip": 0.01079814, + "auxiliary_loss_mlp": 0.01030907, + "balance_loss_clip": 1.03445423, + "balance_loss_mlp": 1.01849127, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.8000725188740667, + "language_loss": 0.89548349, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91659075, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.5964505672454834 + }, + { + "auxiliary_loss_clip": 0.0109011, + "auxiliary_loss_mlp": 0.01028762, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.01822972, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 1.795809695295117, + "language_loss": 0.69957793, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72076666, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.5534417629241943 + }, + { + "auxiliary_loss_clip": 0.01091702, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.03513527, + "balance_loss_mlp": 1.01583767, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.5983059553303816, + "language_loss": 0.78270245, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.803895, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 2.53092622756958 + }, + { + "auxiliary_loss_clip": 0.01082671, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.03560543, + "balance_loss_mlp": 1.02545595, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.475282263466486, + "language_loss": 0.73984742, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76105464, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 2.6009175777435303 + }, + { + "auxiliary_loss_clip": 0.01082535, + "auxiliary_loss_mlp": 0.0074972, + "balance_loss_clip": 1.0342679, + "balance_loss_mlp": 1.00050044, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.0747247623517744, + "language_loss": 0.78878552, + "learning_rate": 1.466172750724613e-06, + "loss": 0.80710804, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.616663694381714 + }, + { + "auxiliary_loss_clip": 0.01073995, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03748488, + "balance_loss_mlp": 1.02009475, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.4923312989127115, + "language_loss": 0.69326794, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71432221, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 2.684309959411621 + }, + { + "auxiliary_loss_clip": 0.01078535, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.03311145, + "balance_loss_mlp": 1.01925576, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.9275714475624668, + "language_loss": 0.73099351, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75208461, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 4.110365152359009 + }, + { + "auxiliary_loss_clip": 0.01103222, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.03499269, + "balance_loss_mlp": 1.01993966, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.8144610985072458, + "language_loss": 0.68722796, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70857728, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 2.564426898956299 + }, + { + "auxiliary_loss_clip": 0.01105846, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03719139, + "balance_loss_mlp": 1.01760793, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.145719744845372, + "language_loss": 0.72964585, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.751001, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 4.106278419494629 + }, + { + "auxiliary_loss_clip": 0.01065635, + "auxiliary_loss_mlp": 0.01026528, + "balance_loss_clip": 1.03423429, + "balance_loss_mlp": 1.01550758, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 2.510846404018826, + "language_loss": 0.84871101, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86963266, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.680952787399292 + }, + { + "auxiliary_loss_clip": 0.01072082, + "auxiliary_loss_mlp": 0.00749728, + "balance_loss_clip": 1.03444278, + "balance_loss_mlp": 1.0004946, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 2.001122661816053, + "language_loss": 0.66651821, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68473631, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.892732620239258 + }, + { + "auxiliary_loss_clip": 0.0109199, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.03506207, + "balance_loss_mlp": 1.01896286, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6926203042879733, + "language_loss": 0.83763826, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85886651, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.5798189640045166 + }, + { + "auxiliary_loss_clip": 0.01084226, + "auxiliary_loss_mlp": 0.01026482, + "balance_loss_clip": 1.03611052, + "balance_loss_mlp": 1.01489484, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.6498898662961172, + "language_loss": 0.79553115, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81663823, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 2.6617259979248047 + }, + { + "auxiliary_loss_clip": 0.01103043, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.03621268, + "balance_loss_mlp": 1.01647687, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.6114627720512338, + "language_loss": 0.67170393, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69302219, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.5503628253936768 + }, + { + "auxiliary_loss_clip": 0.01083782, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.03262258, + "balance_loss_mlp": 1.02573681, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.3114350697220698, + "language_loss": 0.74424851, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76547778, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 2.6008565425872803 + }, + { + "auxiliary_loss_clip": 0.01091025, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.03476286, + "balance_loss_mlp": 1.01851916, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 2.072439328420836, + "language_loss": 0.67901134, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70021749, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 2.6832756996154785 + }, + { + "auxiliary_loss_clip": 0.01068281, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.03518045, + "balance_loss_mlp": 1.01759422, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 2.522491614451299, + "language_loss": 0.76826304, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.78923965, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.6515250205993652 + }, + { + "auxiliary_loss_clip": 0.01086309, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.03475833, + "balance_loss_mlp": 1.01735735, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 6.126110505955411, + "language_loss": 0.77230346, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79345167, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 2.551698923110962 + }, + { + "auxiliary_loss_clip": 0.01067694, + "auxiliary_loss_mlp": 0.01025897, + "balance_loss_clip": 1.0354526, + "balance_loss_mlp": 1.01504254, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.604227437097222, + "language_loss": 0.73359108, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75452697, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 4.12999415397644 + }, + { + "auxiliary_loss_clip": 0.01096991, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.03766227, + "balance_loss_mlp": 1.01961315, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 1.951265442459132, + "language_loss": 0.68573749, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70702732, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 2.6486339569091797 + }, + { + "auxiliary_loss_clip": 0.01088831, + "auxiliary_loss_mlp": 0.0103461, + "balance_loss_clip": 1.03299236, + "balance_loss_mlp": 1.02207506, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.4814178633212853, + "language_loss": 0.79444581, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81568021, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.535896062850952 + }, + { + "auxiliary_loss_clip": 0.01084907, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.03349221, + "balance_loss_mlp": 1.01872742, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 1.8380895205829766, + "language_loss": 0.81562126, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83677983, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 2.6784605979919434 + }, + { + "auxiliary_loss_clip": 0.01045518, + "auxiliary_loss_mlp": 0.01035665, + "balance_loss_clip": 1.03060031, + "balance_loss_mlp": 1.02160406, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 4.177474094985073, + "language_loss": 0.61975205, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64056385, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 2.6915829181671143 + }, + { + "auxiliary_loss_clip": 0.01098676, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.03460324, + "balance_loss_mlp": 1.02032804, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.5409891324822582, + "language_loss": 0.78921056, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.8105157, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 2.537853956222534 + }, + { + "auxiliary_loss_clip": 0.01063055, + "auxiliary_loss_mlp": 0.01038446, + "balance_loss_clip": 1.033741, + "balance_loss_mlp": 1.02536845, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.2236409524342458, + "language_loss": 0.75802267, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.77903765, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.7431135177612305 + }, + { + "auxiliary_loss_clip": 0.01066487, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.03275776, + "balance_loss_mlp": 1.01854467, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.3407418603442043, + "language_loss": 0.6541819, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67515576, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.6719818115234375 + }, + { + "auxiliary_loss_clip": 0.01089239, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03300786, + "balance_loss_mlp": 1.01939201, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.6526009835110944, + "language_loss": 0.7444424, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76564616, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.614194393157959 + }, + { + "auxiliary_loss_clip": 0.01103095, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.03547096, + "balance_loss_mlp": 1.01639628, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.7556430217596544, + "language_loss": 0.77371508, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79503524, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.6499993801116943 + }, + { + "auxiliary_loss_clip": 0.0108172, + "auxiliary_loss_mlp": 0.01039453, + "balance_loss_clip": 1.03364265, + "balance_loss_mlp": 1.02687073, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.6670028823769454, + "language_loss": 0.74871171, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76992345, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 2.7179391384124756 + }, + { + "auxiliary_loss_clip": 0.01062408, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.03062153, + "balance_loss_mlp": 1.02003133, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.6202718096944386, + "language_loss": 0.68524182, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70618588, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 2.7134714126586914 + }, + { + "auxiliary_loss_clip": 0.01109721, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.03911304, + "balance_loss_mlp": 1.01996136, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.0221553348133194, + "language_loss": 0.8092171, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83063889, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 2.534625291824341 + }, + { + "auxiliary_loss_clip": 0.01099013, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.03491318, + "balance_loss_mlp": 1.01771712, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 2.602427986973599, + "language_loss": 0.6928401, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.71412367, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.5554299354553223 + }, + { + "auxiliary_loss_clip": 0.01087278, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.03352761, + "balance_loss_mlp": 1.01716638, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 2.3271442682710717, + "language_loss": 0.68251389, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70368874, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 2.5843372344970703 + }, + { + "auxiliary_loss_clip": 0.01090866, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.03550696, + "balance_loss_mlp": 1.02417147, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 2.2485408636584996, + "language_loss": 0.78697497, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80823159, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 4.162737607955933 + }, + { + "auxiliary_loss_clip": 0.01045156, + "auxiliary_loss_mlp": 0.01041649, + "balance_loss_clip": 1.03070259, + "balance_loss_mlp": 1.0269506, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.4802551051063946, + "language_loss": 0.72635424, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7472223, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 2.6309914588928223 + }, + { + "auxiliary_loss_clip": 0.01061144, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.03256392, + "balance_loss_mlp": 1.01935887, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.070952201882873, + "language_loss": 0.77756512, + "learning_rate": 1.454547250154447e-06, + "loss": 0.7984938, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.6347858905792236 + }, + { + "auxiliary_loss_clip": 0.01093143, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.03624213, + "balance_loss_mlp": 1.02478838, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.5707020511398062, + "language_loss": 0.83590025, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85719776, + "num_input_tokens_seen": 215077790, + "step": 9985, + "time_per_iteration": 2.634648084640503 + }, + { + "auxiliary_loss_clip": 0.01091019, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.03563333, + "balance_loss_mlp": 1.0259099, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.7310440319090998, + "language_loss": 0.71169037, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73297, + "num_input_tokens_seen": 215097650, + "step": 9986, + "time_per_iteration": 2.685675621032715 + }, + { + "auxiliary_loss_clip": 0.01107886, + "auxiliary_loss_mlp": 0.00749649, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.0004518, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 1.51650343154843, + "language_loss": 0.71742582, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73600125, + "num_input_tokens_seen": 215118235, + "step": 9987, + "time_per_iteration": 2.7392332553863525 + }, + { + "auxiliary_loss_clip": 0.01082381, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.03615713, + "balance_loss_mlp": 1.02064621, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.7015043774545842, + "language_loss": 0.84920996, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.87035513, + "num_input_tokens_seen": 215136755, + "step": 9988, + "time_per_iteration": 2.671679973602295 + }, + { + "auxiliary_loss_clip": 0.01092268, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.03520942, + "balance_loss_mlp": 1.02354002, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.7029956975722373, + "language_loss": 0.65215969, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67343891, + "num_input_tokens_seen": 215155225, + "step": 9989, + "time_per_iteration": 2.659902572631836 + }, + { + "auxiliary_loss_clip": 0.01090016, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.03462505, + "balance_loss_mlp": 1.01900911, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.5480702081696478, + "language_loss": 0.8082267, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82942641, + "num_input_tokens_seen": 215174815, + "step": 9990, + "time_per_iteration": 2.7098886966705322 + }, + { + "auxiliary_loss_clip": 0.01062458, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.03421474, + "balance_loss_mlp": 1.02002501, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 2.4582442004003067, + "language_loss": 0.82738459, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84833211, + "num_input_tokens_seen": 215192045, + "step": 9991, + "time_per_iteration": 2.6995432376861572 + }, + { + "auxiliary_loss_clip": 0.01046117, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.02991867, + "balance_loss_mlp": 1.02693236, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 1.841593816746002, + "language_loss": 0.82643259, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84729922, + "num_input_tokens_seen": 215209885, + "step": 9992, + "time_per_iteration": 2.677741050720215 + }, + { + "auxiliary_loss_clip": 0.01080207, + "auxiliary_loss_mlp": 0.00749611, + "balance_loss_clip": 1.03497434, + "balance_loss_mlp": 1.00048065, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 1.99287516586522, + "language_loss": 0.66098315, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.67928135, + "num_input_tokens_seen": 215228150, + "step": 9993, + "time_per_iteration": 2.6856424808502197 + }, + { + "auxiliary_loss_clip": 0.01062081, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.03200114, + "balance_loss_mlp": 1.02137995, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.5189158475842, + "language_loss": 0.8120029, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83296001, + "num_input_tokens_seen": 215243755, + "step": 9994, + "time_per_iteration": 2.7885637283325195 + }, + { + "auxiliary_loss_clip": 0.01051186, + "auxiliary_loss_mlp": 0.01026508, + "balance_loss_clip": 1.02965403, + "balance_loss_mlp": 1.01532054, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.7864919326230964, + "language_loss": 0.7236197, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74439663, + "num_input_tokens_seen": 215262130, + "step": 9995, + "time_per_iteration": 4.398872375488281 + }, + { + "auxiliary_loss_clip": 0.01078787, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.03207982, + "balance_loss_mlp": 1.02077866, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 2.1359638629974573, + "language_loss": 0.81217253, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83328736, + "num_input_tokens_seen": 215281785, + "step": 9996, + "time_per_iteration": 4.145599603652954 + }, + { + "auxiliary_loss_clip": 0.0103578, + "auxiliary_loss_mlp": 0.01044235, + "balance_loss_clip": 1.02923441, + "balance_loss_mlp": 1.03011489, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 2.2277536150528148, + "language_loss": 0.78306967, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80386984, + "num_input_tokens_seen": 215297550, + "step": 9997, + "time_per_iteration": 2.7700915336608887 + }, + { + "auxiliary_loss_clip": 0.01092439, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.03493524, + "balance_loss_mlp": 1.01924491, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 1.697104909127525, + "language_loss": 0.73075664, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75199926, + "num_input_tokens_seen": 215316360, + "step": 9998, + "time_per_iteration": 2.5399959087371826 + }, + { + "auxiliary_loss_clip": 0.01067446, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.03230596, + "balance_loss_mlp": 1.01908183, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.4933444201799564, + "language_loss": 0.7231735, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74415743, + "num_input_tokens_seen": 215336405, + "step": 9999, + "time_per_iteration": 2.710336208343506 + }, + { + "auxiliary_loss_clip": 0.01052693, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.03152764, + "balance_loss_mlp": 1.02610624, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.4590345898887602, + "language_loss": 0.78214574, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80305588, + "num_input_tokens_seen": 215356590, + "step": 10000, + "time_per_iteration": 2.7703933715820312 + }, + { + "auxiliary_loss_clip": 0.01108528, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.0385797, + "balance_loss_mlp": 1.02205825, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 2.0536407887109362, + "language_loss": 0.77370769, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79514301, + "num_input_tokens_seen": 215374295, + "step": 10001, + "time_per_iteration": 2.5745887756347656 + }, + { + "auxiliary_loss_clip": 0.01094793, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.03575969, + "balance_loss_mlp": 1.01596713, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.6223950874008963, + "language_loss": 0.58902872, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.6102643, + "num_input_tokens_seen": 215394535, + "step": 10002, + "time_per_iteration": 2.7021875381469727 + }, + { + "auxiliary_loss_clip": 0.01085063, + "auxiliary_loss_mlp": 0.01038399, + "balance_loss_clip": 1.03731132, + "balance_loss_mlp": 1.02470779, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 2.712459011897056, + "language_loss": 0.7792412, + "learning_rate": 1.447431741055314e-06, + "loss": 0.80047584, + "num_input_tokens_seen": 215414355, + "step": 10003, + "time_per_iteration": 2.636690378189087 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.03750038, + "balance_loss_mlp": 1.0198884, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.4641695341227803, + "language_loss": 0.77095723, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79234111, + "num_input_tokens_seen": 215428280, + "step": 10004, + "time_per_iteration": 2.5266854763031006 + }, + { + "auxiliary_loss_clip": 0.01091481, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.03563452, + "balance_loss_mlp": 1.01886582, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.5954257683247342, + "language_loss": 0.72279596, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.7440176, + "num_input_tokens_seen": 215448970, + "step": 10005, + "time_per_iteration": 2.70756196975708 + }, + { + "auxiliary_loss_clip": 0.01100922, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_clip": 1.03680015, + "balance_loss_mlp": 1.01664758, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.1502620925554394, + "language_loss": 0.75215727, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.77344859, + "num_input_tokens_seen": 215465260, + "step": 10006, + "time_per_iteration": 4.075509786605835 + }, + { + "auxiliary_loss_clip": 0.01076565, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.03277993, + "balance_loss_mlp": 1.02514386, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 1.867781856121465, + "language_loss": 0.73789507, + "learning_rate": 1.445934699732685e-06, + "loss": 0.75903279, + "num_input_tokens_seen": 215482725, + "step": 10007, + "time_per_iteration": 2.560137987136841 + }, + { + "auxiliary_loss_clip": 0.0107847, + "auxiliary_loss_mlp": 0.01029129, + "balance_loss_clip": 1.03344047, + "balance_loss_mlp": 1.01783991, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 1.7457552656075592, + "language_loss": 0.69832128, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.71939725, + "num_input_tokens_seen": 215500420, + "step": 10008, + "time_per_iteration": 2.608480453491211 + }, + { + "auxiliary_loss_clip": 0.01090234, + "auxiliary_loss_mlp": 0.01028884, + "balance_loss_clip": 1.0344764, + "balance_loss_mlp": 1.01747561, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.5259470018895724, + "language_loss": 0.76373088, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.784922, + "num_input_tokens_seen": 215522260, + "step": 10009, + "time_per_iteration": 2.664808750152588 + }, + { + "auxiliary_loss_clip": 0.0107433, + "auxiliary_loss_mlp": 0.00749439, + "balance_loss_clip": 1.03364015, + "balance_loss_mlp": 1.00041342, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.080708368076372, + "language_loss": 0.74133033, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.75956804, + "num_input_tokens_seen": 215541715, + "step": 10010, + "time_per_iteration": 2.8561947345733643 + }, + { + "auxiliary_loss_clip": 0.01015479, + "auxiliary_loss_mlp": 0.01000482, + "balance_loss_clip": 1.00450885, + "balance_loss_mlp": 0.99954653, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.8292698267684077, + "language_loss": 0.55106032, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57121992, + "num_input_tokens_seen": 215603020, + "step": 10011, + "time_per_iteration": 3.218629837036133 + }, + { + "auxiliary_loss_clip": 0.01091554, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.03410244, + "balance_loss_mlp": 1.02219474, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.5711235513511046, + "language_loss": 0.62045813, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64170837, + "num_input_tokens_seen": 215625115, + "step": 10012, + "time_per_iteration": 2.6908013820648193 + }, + { + "auxiliary_loss_clip": 0.01059151, + "auxiliary_loss_mlp": 0.0102557, + "balance_loss_clip": 1.03308821, + "balance_loss_mlp": 1.01466227, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.9105714631511872, + "language_loss": 0.75250429, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77335149, + "num_input_tokens_seen": 215643730, + "step": 10013, + "time_per_iteration": 2.6411795616149902 + }, + { + "auxiliary_loss_clip": 0.01098064, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.03562307, + "balance_loss_mlp": 1.0189594, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.6830069917273531, + "language_loss": 0.81460834, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83588636, + "num_input_tokens_seen": 215664425, + "step": 10014, + "time_per_iteration": 2.5543885231018066 + }, + { + "auxiliary_loss_clip": 0.01073047, + "auxiliary_loss_mlp": 0.01027425, + "balance_loss_clip": 1.03117144, + "balance_loss_mlp": 1.0162847, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.8831121502125434, + "language_loss": 0.72670329, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74770796, + "num_input_tokens_seen": 215684280, + "step": 10015, + "time_per_iteration": 2.7508437633514404 + }, + { + "auxiliary_loss_clip": 0.01007809, + "auxiliary_loss_mlp": 0.01002919, + "balance_loss_clip": 1.00723314, + "balance_loss_mlp": 1.00184011, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8271049362859743, + "language_loss": 0.54781806, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56792533, + "num_input_tokens_seen": 215739780, + "step": 10016, + "time_per_iteration": 3.168778896331787 + }, + { + "auxiliary_loss_clip": 0.01081491, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.03644729, + "balance_loss_mlp": 1.01710224, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.5698897477098368, + "language_loss": 0.83123213, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85233831, + "num_input_tokens_seen": 215757885, + "step": 10017, + "time_per_iteration": 2.733111619949341 + }, + { + "auxiliary_loss_clip": 0.01077121, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.03530931, + "balance_loss_mlp": 1.01925325, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.7905335481611675, + "language_loss": 0.83625126, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85733157, + "num_input_tokens_seen": 215776415, + "step": 10018, + "time_per_iteration": 2.7039361000061035 + }, + { + "auxiliary_loss_clip": 0.01074822, + "auxiliary_loss_mlp": 0.01038544, + "balance_loss_clip": 1.03161931, + "balance_loss_mlp": 1.02579999, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 4.222160234026761, + "language_loss": 0.78119695, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80233061, + "num_input_tokens_seen": 215794865, + "step": 10019, + "time_per_iteration": 2.707791328430176 + }, + { + "auxiliary_loss_clip": 0.0106244, + "auxiliary_loss_mlp": 0.00749312, + "balance_loss_clip": 1.03499866, + "balance_loss_mlp": 1.00038743, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.4707180853555604, + "language_loss": 0.73772597, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75584352, + "num_input_tokens_seen": 215816840, + "step": 10020, + "time_per_iteration": 2.7527825832366943 + }, + { + "auxiliary_loss_clip": 0.01081248, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.03359127, + "balance_loss_mlp": 1.02457619, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.531044829705172, + "language_loss": 0.64196408, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66314554, + "num_input_tokens_seen": 215836100, + "step": 10021, + "time_per_iteration": 2.693695068359375 + }, + { + "auxiliary_loss_clip": 0.01090371, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.0346843, + "balance_loss_mlp": 1.02292824, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.7121508765719033, + "language_loss": 0.80422533, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82547629, + "num_input_tokens_seen": 215858480, + "step": 10022, + "time_per_iteration": 4.233901023864746 + }, + { + "auxiliary_loss_clip": 0.01087958, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.03563714, + "balance_loss_mlp": 1.02262676, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.7310680833949272, + "language_loss": 0.66256106, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68378246, + "num_input_tokens_seen": 215879950, + "step": 10023, + "time_per_iteration": 2.7293283939361572 + }, + { + "auxiliary_loss_clip": 0.01091409, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.03360534, + "balance_loss_mlp": 1.0199945, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 2.0236750027943287, + "language_loss": 0.74524641, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76648241, + "num_input_tokens_seen": 215899830, + "step": 10024, + "time_per_iteration": 2.7250020503997803 + }, + { + "auxiliary_loss_clip": 0.01092163, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03546262, + "balance_loss_mlp": 1.01973259, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.8392947498881884, + "language_loss": 0.7267164, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.7479589, + "num_input_tokens_seen": 215920440, + "step": 10025, + "time_per_iteration": 2.6229748725891113 + }, + { + "auxiliary_loss_clip": 0.01105243, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.03503716, + "balance_loss_mlp": 1.02044773, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 3.0386648703715182, + "language_loss": 0.66750371, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.68888783, + "num_input_tokens_seen": 215940535, + "step": 10026, + "time_per_iteration": 2.537515163421631 + }, + { + "auxiliary_loss_clip": 0.01094669, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.03181589, + "balance_loss_mlp": 1.02556491, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.7217265838678335, + "language_loss": 0.79913032, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82043982, + "num_input_tokens_seen": 215958045, + "step": 10027, + "time_per_iteration": 2.556999921798706 + }, + { + "auxiliary_loss_clip": 0.01066848, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.03350747, + "balance_loss_mlp": 1.02153206, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 1.9975408711243192, + "language_loss": 0.70562899, + "learning_rate": 1.438080769071171e-06, + "loss": 0.72663617, + "num_input_tokens_seen": 215977330, + "step": 10028, + "time_per_iteration": 2.7087912559509277 + }, + { + "auxiliary_loss_clip": 0.01068321, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.03694963, + "balance_loss_mlp": 1.02180743, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 1.7303709019723037, + "language_loss": 0.84071457, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86173916, + "num_input_tokens_seen": 215997865, + "step": 10029, + "time_per_iteration": 2.842752456665039 + }, + { + "auxiliary_loss_clip": 0.01079384, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.03435469, + "balance_loss_mlp": 1.02372122, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.6746480820462701, + "language_loss": 0.79762399, + "learning_rate": 1.437333263694373e-06, + "loss": 0.81877196, + "num_input_tokens_seen": 216016230, + "step": 10030, + "time_per_iteration": 2.577847480773926 + }, + { + "auxiliary_loss_clip": 0.01027857, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.02864909, + "balance_loss_mlp": 1.02219665, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.6672936657639021, + "language_loss": 0.71245927, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73308933, + "num_input_tokens_seen": 216035785, + "step": 10031, + "time_per_iteration": 2.7987380027770996 + }, + { + "auxiliary_loss_clip": 0.01056369, + "auxiliary_loss_mlp": 0.0103373, + "balance_loss_clip": 1.03113008, + "balance_loss_mlp": 1.02061737, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.4970929460259716, + "language_loss": 0.72949469, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75039566, + "num_input_tokens_seen": 216059555, + "step": 10032, + "time_per_iteration": 2.771355628967285 + }, + { + "auxiliary_loss_clip": 0.01083937, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.03674746, + "balance_loss_mlp": 1.02344275, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.740648489888318, + "language_loss": 0.68802667, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70922321, + "num_input_tokens_seen": 216077235, + "step": 10033, + "time_per_iteration": 2.6955442428588867 + }, + { + "auxiliary_loss_clip": 0.01072183, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.03363729, + "balance_loss_mlp": 1.02131367, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 2.2655668969676133, + "language_loss": 0.75938159, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78044301, + "num_input_tokens_seen": 216094985, + "step": 10034, + "time_per_iteration": 2.6915125846862793 + }, + { + "auxiliary_loss_clip": 0.01081519, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.03552151, + "balance_loss_mlp": 1.0183785, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 2.302139794187241, + "language_loss": 0.74826461, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76938856, + "num_input_tokens_seen": 216115905, + "step": 10035, + "time_per_iteration": 4.253343343734741 + }, + { + "auxiliary_loss_clip": 0.01069049, + "auxiliary_loss_mlp": 0.01025579, + "balance_loss_clip": 1.03290272, + "balance_loss_mlp": 1.01466501, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.6335189792876368, + "language_loss": 0.86427373, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88521993, + "num_input_tokens_seen": 216132420, + "step": 10036, + "time_per_iteration": 4.082060098648071 + }, + { + "auxiliary_loss_clip": 0.01066203, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.03494191, + "balance_loss_mlp": 1.01733279, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 2.626255111733358, + "language_loss": 0.70017052, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72112596, + "num_input_tokens_seen": 216149800, + "step": 10037, + "time_per_iteration": 2.6507022380828857 + }, + { + "auxiliary_loss_clip": 0.01082651, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.03352022, + "balance_loss_mlp": 1.02250266, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.7911994167945111, + "language_loss": 0.85149831, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87268674, + "num_input_tokens_seen": 216168200, + "step": 10038, + "time_per_iteration": 2.640873432159424 + }, + { + "auxiliary_loss_clip": 0.0108227, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.03457689, + "balance_loss_mlp": 1.01980186, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 2.166890955322376, + "language_loss": 0.75838804, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.77952969, + "num_input_tokens_seen": 216187105, + "step": 10039, + "time_per_iteration": 2.706141948699951 + }, + { + "auxiliary_loss_clip": 0.01088585, + "auxiliary_loss_mlp": 0.01028097, + "balance_loss_clip": 1.03352618, + "balance_loss_mlp": 1.01712966, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.647061111915081, + "language_loss": 0.71047521, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73164201, + "num_input_tokens_seen": 216205440, + "step": 10040, + "time_per_iteration": 2.6053433418273926 + }, + { + "auxiliary_loss_clip": 0.01095662, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.03744936, + "balance_loss_mlp": 1.01987767, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 1.736145651370677, + "language_loss": 0.78154337, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80283678, + "num_input_tokens_seen": 216223130, + "step": 10041, + "time_per_iteration": 2.563021421432495 + }, + { + "auxiliary_loss_clip": 0.01082993, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.03730392, + "balance_loss_mlp": 1.01700902, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.7089338088789294, + "language_loss": 0.75873178, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77984506, + "num_input_tokens_seen": 216240260, + "step": 10042, + "time_per_iteration": 2.6542811393737793 + }, + { + "auxiliary_loss_clip": 0.01051999, + "auxiliary_loss_mlp": 0.01027285, + "balance_loss_clip": 1.03155637, + "balance_loss_mlp": 1.01556063, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 1.8495038695463355, + "language_loss": 0.84461945, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86541229, + "num_input_tokens_seen": 216258510, + "step": 10043, + "time_per_iteration": 2.7979049682617188 + }, + { + "auxiliary_loss_clip": 0.01064383, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.03186154, + "balance_loss_mlp": 1.02152872, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.7993257193137537, + "language_loss": 0.69579017, + "learning_rate": 1.432103122078974e-06, + "loss": 0.71677589, + "num_input_tokens_seen": 216277550, + "step": 10044, + "time_per_iteration": 2.803043842315674 + }, + { + "auxiliary_loss_clip": 0.01091869, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.03680134, + "balance_loss_mlp": 1.01784348, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 1.6680429753333512, + "language_loss": 0.77473146, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.79595691, + "num_input_tokens_seen": 216296690, + "step": 10045, + "time_per_iteration": 2.728431463241577 + }, + { + "auxiliary_loss_clip": 0.01056644, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.03985953, + "balance_loss_mlp": 1.01937115, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.6460971372155047, + "language_loss": 0.77115178, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79203135, + "num_input_tokens_seen": 216316110, + "step": 10046, + "time_per_iteration": 4.247121810913086 + }, + { + "auxiliary_loss_clip": 0.01041349, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.02897549, + "balance_loss_mlp": 1.02120519, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.6210023913486524, + "language_loss": 0.87261784, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89336687, + "num_input_tokens_seen": 216333855, + "step": 10047, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.01090959, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.03643084, + "balance_loss_mlp": 1.01608896, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.4476954172215977, + "language_loss": 0.7550168, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77619374, + "num_input_tokens_seen": 216354890, + "step": 10048, + "time_per_iteration": 2.6558427810668945 + }, + { + "auxiliary_loss_clip": 0.01090544, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03523481, + "balance_loss_mlp": 1.02145493, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 1.9640112917181247, + "language_loss": 0.66490269, + "learning_rate": 1.430236235239386e-06, + "loss": 0.6861608, + "num_input_tokens_seen": 216376055, + "step": 10049, + "time_per_iteration": 2.6129915714263916 + }, + { + "auxiliary_loss_clip": 0.01070822, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.03154981, + "balance_loss_mlp": 1.02409232, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.434097366365812, + "language_loss": 0.66511315, + "learning_rate": 1.429862922631336e-06, + "loss": 0.6861943, + "num_input_tokens_seen": 216396295, + "step": 10050, + "time_per_iteration": 2.6132094860076904 + }, + { + "auxiliary_loss_clip": 0.01067265, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.03296483, + "balance_loss_mlp": 1.02177489, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 2.2004742493130776, + "language_loss": 0.69400585, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.71501184, + "num_input_tokens_seen": 216416605, + "step": 10051, + "time_per_iteration": 2.779066801071167 + }, + { + "auxiliary_loss_clip": 0.01086683, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.03186941, + "balance_loss_mlp": 1.01902843, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 5.651185197587219, + "language_loss": 0.64441752, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66558838, + "num_input_tokens_seen": 216435130, + "step": 10052, + "time_per_iteration": 2.544360637664795 + }, + { + "auxiliary_loss_clip": 0.01077076, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.0324645, + "balance_loss_mlp": 1.01616526, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.6177388619434423, + "language_loss": 0.68893778, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.70998985, + "num_input_tokens_seen": 216455640, + "step": 10053, + "time_per_iteration": 2.705889940261841 + }, + { + "auxiliary_loss_clip": 0.0100504, + "auxiliary_loss_mlp": 0.01002398, + "balance_loss_clip": 1.00403166, + "balance_loss_mlp": 1.00139081, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7538546865227457, + "language_loss": 0.60456067, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62463504, + "num_input_tokens_seen": 216518130, + "step": 10054, + "time_per_iteration": 3.2579002380371094 + }, + { + "auxiliary_loss_clip": 0.01043141, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.03165317, + "balance_loss_mlp": 1.01692486, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.7545659709360952, + "language_loss": 0.85550255, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87622261, + "num_input_tokens_seen": 216536845, + "step": 10055, + "time_per_iteration": 2.6730644702911377 + }, + { + "auxiliary_loss_clip": 0.01078288, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_clip": 1.03555369, + "balance_loss_mlp": 1.02882457, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.3066450401665306, + "language_loss": 0.73392552, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75512648, + "num_input_tokens_seen": 216551860, + "step": 10056, + "time_per_iteration": 2.680682420730591 + }, + { + "auxiliary_loss_clip": 0.0105683, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.03277504, + "balance_loss_mlp": 1.02113724, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.508790051305232, + "language_loss": 0.8041895, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82508498, + "num_input_tokens_seen": 216574775, + "step": 10057, + "time_per_iteration": 2.715822219848633 + }, + { + "auxiliary_loss_clip": 0.01098001, + "auxiliary_loss_mlp": 0.00749263, + "balance_loss_clip": 1.03422832, + "balance_loss_mlp": 1.00042832, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.426760632676412, + "language_loss": 0.75705087, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77552348, + "num_input_tokens_seen": 216590100, + "step": 10058, + "time_per_iteration": 2.4940407276153564 + }, + { + "auxiliary_loss_clip": 0.01086043, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.0326283, + "balance_loss_mlp": 1.01672721, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 1.8857278709011234, + "language_loss": 0.70727098, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.72841215, + "num_input_tokens_seen": 216610145, + "step": 10059, + "time_per_iteration": 2.602663993835449 + }, + { + "auxiliary_loss_clip": 0.01076281, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.03237844, + "balance_loss_mlp": 1.01724172, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 2.0904785072558107, + "language_loss": 0.75965607, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.7807048, + "num_input_tokens_seen": 216630625, + "step": 10060, + "time_per_iteration": 2.5870227813720703 + }, + { + "auxiliary_loss_clip": 0.01087056, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.03286958, + "balance_loss_mlp": 1.01883626, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 1.9290837170529176, + "language_loss": 0.73488486, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75605595, + "num_input_tokens_seen": 216649255, + "step": 10061, + "time_per_iteration": 2.5430960655212402 + }, + { + "auxiliary_loss_clip": 0.01059986, + "auxiliary_loss_mlp": 0.00749499, + "balance_loss_clip": 1.03597653, + "balance_loss_mlp": 1.00045741, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 1.7350657269006244, + "language_loss": 0.67420214, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69229698, + "num_input_tokens_seen": 216668100, + "step": 10062, + "time_per_iteration": 4.334479808807373 + }, + { + "auxiliary_loss_clip": 0.01080059, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.03176188, + "balance_loss_mlp": 1.02408338, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.1273002487933708, + "language_loss": 0.7131092, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73427451, + "num_input_tokens_seen": 216686125, + "step": 10063, + "time_per_iteration": 2.5255603790283203 + }, + { + "auxiliary_loss_clip": 0.01094476, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.03153753, + "balance_loss_mlp": 1.02353382, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.721933637753549, + "language_loss": 0.84638464, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86767697, + "num_input_tokens_seen": 216704265, + "step": 10064, + "time_per_iteration": 2.497816801071167 + }, + { + "auxiliary_loss_clip": 0.01090098, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.03416491, + "balance_loss_mlp": 1.02104926, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.208357571351006, + "language_loss": 0.80002636, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.82125199, + "num_input_tokens_seen": 216721765, + "step": 10065, + "time_per_iteration": 2.528287410736084 + }, + { + "auxiliary_loss_clip": 0.01051058, + "auxiliary_loss_mlp": 0.01036115, + "balance_loss_clip": 1.03316927, + "balance_loss_mlp": 1.02349615, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 1.8568826497487192, + "language_loss": 0.78556579, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80643749, + "num_input_tokens_seen": 216738295, + "step": 10066, + "time_per_iteration": 2.6647820472717285 + }, + { + "auxiliary_loss_clip": 0.01041538, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.01995504, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.6837817754053679, + "language_loss": 0.73520821, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75594026, + "num_input_tokens_seen": 216759875, + "step": 10067, + "time_per_iteration": 2.763331890106201 + }, + { + "auxiliary_loss_clip": 0.0107921, + "auxiliary_loss_mlp": 0.00749293, + "balance_loss_clip": 1.03462327, + "balance_loss_mlp": 1.00043464, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.355016155535404, + "language_loss": 0.69031954, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70860457, + "num_input_tokens_seen": 216780705, + "step": 10068, + "time_per_iteration": 2.715829849243164 + }, + { + "auxiliary_loss_clip": 0.01083438, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.03276622, + "balance_loss_mlp": 1.01807952, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 24.386380969379243, + "language_loss": 0.87039959, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89152801, + "num_input_tokens_seen": 216797625, + "step": 10069, + "time_per_iteration": 2.5622403621673584 + }, + { + "auxiliary_loss_clip": 0.01067523, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.03248048, + "balance_loss_mlp": 1.01555777, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.4402374791531802, + "language_loss": 0.82988822, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85082817, + "num_input_tokens_seen": 216817610, + "step": 10070, + "time_per_iteration": 2.653712511062622 + }, + { + "auxiliary_loss_clip": 0.01084336, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.03616834, + "balance_loss_mlp": 1.01996088, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.669771700309214, + "language_loss": 0.86042464, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88158333, + "num_input_tokens_seen": 216836835, + "step": 10071, + "time_per_iteration": 2.645556688308716 + }, + { + "auxiliary_loss_clip": 0.01092707, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.03568745, + "balance_loss_mlp": 1.02123666, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.5189914225783334, + "language_loss": 0.77140039, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79266173, + "num_input_tokens_seen": 216856760, + "step": 10072, + "time_per_iteration": 2.684025764465332 + }, + { + "auxiliary_loss_clip": 0.01077709, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.03139043, + "balance_loss_mlp": 1.01986098, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.574438824799351, + "language_loss": 0.7460109, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76710981, + "num_input_tokens_seen": 216878795, + "step": 10073, + "time_per_iteration": 2.7042436599731445 + }, + { + "auxiliary_loss_clip": 0.00986104, + "auxiliary_loss_mlp": 0.00998478, + "balance_loss_clip": 1.00492609, + "balance_loss_mlp": 0.99715501, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7578677425629295, + "language_loss": 0.55191046, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.5717563, + "num_input_tokens_seen": 216937800, + "step": 10074, + "time_per_iteration": 3.3289144039154053 + }, + { + "auxiliary_loss_clip": 0.01059329, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_clip": 1.03299224, + "balance_loss_mlp": 1.02812254, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.873154315163698, + "language_loss": 0.81434512, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.83535379, + "num_input_tokens_seen": 216955280, + "step": 10075, + "time_per_iteration": 4.310727119445801 + }, + { + "auxiliary_loss_clip": 0.01091288, + "auxiliary_loss_mlp": 0.01023952, + "balance_loss_clip": 1.0338552, + "balance_loss_mlp": 1.01179266, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 1.7517621280413471, + "language_loss": 0.78516597, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.8063184, + "num_input_tokens_seen": 216976950, + "step": 10076, + "time_per_iteration": 4.229968547821045 + }, + { + "auxiliary_loss_clip": 0.01088809, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.03291249, + "balance_loss_mlp": 1.02348268, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.9964531346826875, + "language_loss": 0.72333097, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74457401, + "num_input_tokens_seen": 216996945, + "step": 10077, + "time_per_iteration": 2.6133601665496826 + }, + { + "auxiliary_loss_clip": 0.01102145, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.03521323, + "balance_loss_mlp": 1.01875114, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 1.612190750355956, + "language_loss": 0.55905259, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.58038318, + "num_input_tokens_seen": 217016580, + "step": 10078, + "time_per_iteration": 2.603788137435913 + }, + { + "auxiliary_loss_clip": 0.010557, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.03148341, + "balance_loss_mlp": 1.02010584, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.6774559897014634, + "language_loss": 0.70409799, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72497737, + "num_input_tokens_seen": 217037300, + "step": 10079, + "time_per_iteration": 2.7471344470977783 + }, + { + "auxiliary_loss_clip": 0.01078433, + "auxiliary_loss_mlp": 0.01038666, + "balance_loss_clip": 1.03333855, + "balance_loss_mlp": 1.02700722, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.937549882828982, + "language_loss": 0.62188888, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64305991, + "num_input_tokens_seen": 217055805, + "step": 10080, + "time_per_iteration": 2.608367919921875 + }, + { + "auxiliary_loss_clip": 0.01077013, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.03275144, + "balance_loss_mlp": 1.01699865, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.7842902171410042, + "language_loss": 0.71146905, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73253095, + "num_input_tokens_seen": 217074175, + "step": 10081, + "time_per_iteration": 2.6534721851348877 + }, + { + "auxiliary_loss_clip": 0.01075411, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.03315067, + "balance_loss_mlp": 1.0173012, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.7347560232348063, + "language_loss": 0.69231248, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71335804, + "num_input_tokens_seen": 217095695, + "step": 10082, + "time_per_iteration": 2.6631672382354736 + }, + { + "auxiliary_loss_clip": 0.01102399, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.03715444, + "balance_loss_mlp": 1.01781774, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.4516205595252551, + "language_loss": 0.66013342, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.68144596, + "num_input_tokens_seen": 217116260, + "step": 10083, + "time_per_iteration": 2.597766399383545 + }, + { + "auxiliary_loss_clip": 0.01088787, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.0339663, + "balance_loss_mlp": 1.01474929, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 3.6362070019684727, + "language_loss": 0.7398622, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76101625, + "num_input_tokens_seen": 217134465, + "step": 10084, + "time_per_iteration": 2.5958075523376465 + }, + { + "auxiliary_loss_clip": 0.01075598, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.03259349, + "balance_loss_mlp": 1.02518034, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.42882088494801, + "language_loss": 0.72711509, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74823934, + "num_input_tokens_seen": 217149920, + "step": 10085, + "time_per_iteration": 2.6432583332061768 + }, + { + "auxiliary_loss_clip": 0.01101358, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.03558815, + "balance_loss_mlp": 1.02420807, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.348506919174231, + "language_loss": 0.76230729, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.7836712, + "num_input_tokens_seen": 217168165, + "step": 10086, + "time_per_iteration": 2.613435983657837 + }, + { + "auxiliary_loss_clip": 0.01065225, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.03179979, + "balance_loss_mlp": 1.02239478, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.3753286817961685, + "language_loss": 0.72695291, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74794817, + "num_input_tokens_seen": 217190070, + "step": 10087, + "time_per_iteration": 4.1400532722473145 + }, + { + "auxiliary_loss_clip": 0.01087378, + "auxiliary_loss_mlp": 0.0102832, + "balance_loss_clip": 1.0342108, + "balance_loss_mlp": 1.01829481, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.60158253843309, + "language_loss": 0.84028554, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86144257, + "num_input_tokens_seen": 217209370, + "step": 10088, + "time_per_iteration": 2.613576889038086 + }, + { + "auxiliary_loss_clip": 0.01042669, + "auxiliary_loss_mlp": 0.00749484, + "balance_loss_clip": 1.02854121, + "balance_loss_mlp": 1.00040531, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.44061091767123, + "language_loss": 0.71190023, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.72982168, + "num_input_tokens_seen": 217226990, + "step": 10089, + "time_per_iteration": 2.9774866104125977 + }, + { + "auxiliary_loss_clip": 0.01090784, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.0358566, + "balance_loss_mlp": 1.02184725, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 2.06413411260775, + "language_loss": 0.82836902, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84960115, + "num_input_tokens_seen": 217244585, + "step": 10090, + "time_per_iteration": 2.591062068939209 + }, + { + "auxiliary_loss_clip": 0.01068125, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.03456068, + "balance_loss_mlp": 1.02567244, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.2936101307494106, + "language_loss": 0.76030177, + "learning_rate": 1.4145758826341e-06, + "loss": 0.78136492, + "num_input_tokens_seen": 217263435, + "step": 10091, + "time_per_iteration": 2.6871726512908936 + }, + { + "auxiliary_loss_clip": 0.01097194, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.03296685, + "balance_loss_mlp": 1.02131701, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 1.5605081081785106, + "language_loss": 0.79731435, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81861544, + "num_input_tokens_seen": 217283725, + "step": 10092, + "time_per_iteration": 2.585750102996826 + }, + { + "auxiliary_loss_clip": 0.01080245, + "auxiliary_loss_mlp": 0.01035061, + "balance_loss_clip": 1.03386211, + "balance_loss_mlp": 1.02283657, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.6703095321472827, + "language_loss": 0.76037693, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78152996, + "num_input_tokens_seen": 217301120, + "step": 10093, + "time_per_iteration": 2.7528412342071533 + }, + { + "auxiliary_loss_clip": 0.01070725, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.03252006, + "balance_loss_mlp": 1.02311969, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.9428135200643086, + "language_loss": 0.87409079, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89515388, + "num_input_tokens_seen": 217319585, + "step": 10094, + "time_per_iteration": 2.6336913108825684 + }, + { + "auxiliary_loss_clip": 0.01092394, + "auxiliary_loss_mlp": 0.01027686, + "balance_loss_clip": 1.03595126, + "balance_loss_mlp": 1.01634324, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.7739494570527548, + "language_loss": 0.72003722, + "learning_rate": 1.413086446353919e-06, + "loss": 0.741238, + "num_input_tokens_seen": 217338880, + "step": 10095, + "time_per_iteration": 2.571922779083252 + }, + { + "auxiliary_loss_clip": 0.0107308, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.0313046, + "balance_loss_mlp": 1.01766825, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 2.5518430012883466, + "language_loss": 0.76625502, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78727365, + "num_input_tokens_seen": 217357480, + "step": 10096, + "time_per_iteration": 2.626918315887451 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01038618, + "balance_loss_clip": 1.0359292, + "balance_loss_mlp": 1.02721608, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 1.8129821743884276, + "language_loss": 0.79666162, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.8180809, + "num_input_tokens_seen": 217374575, + "step": 10097, + "time_per_iteration": 2.6335809230804443 + }, + { + "auxiliary_loss_clip": 0.01072433, + "auxiliary_loss_mlp": 0.01028356, + "balance_loss_clip": 1.03407705, + "balance_loss_mlp": 1.01704943, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.5804862778182935, + "language_loss": 0.6744048, + "learning_rate": 1.411969602780478e-06, + "loss": 0.6954127, + "num_input_tokens_seen": 217392950, + "step": 10098, + "time_per_iteration": 2.7266886234283447 + }, + { + "auxiliary_loss_clip": 0.01100603, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.03484356, + "balance_loss_mlp": 1.01715362, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 2.3424995721632222, + "language_loss": 0.80218661, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82347441, + "num_input_tokens_seen": 217412145, + "step": 10099, + "time_per_iteration": 2.580202341079712 + }, + { + "auxiliary_loss_clip": 0.01071278, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.03245735, + "balance_loss_mlp": 1.01980579, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 1.8131790729579262, + "language_loss": 0.70256746, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72360015, + "num_input_tokens_seen": 217432080, + "step": 10100, + "time_per_iteration": 2.676673650741577 + }, + { + "auxiliary_loss_clip": 0.01063894, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.03386045, + "balance_loss_mlp": 1.0208143, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.7739474646519016, + "language_loss": 0.70673132, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72770458, + "num_input_tokens_seen": 217450945, + "step": 10101, + "time_per_iteration": 2.6916379928588867 + }, + { + "auxiliary_loss_clip": 0.01069553, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.03081059, + "balance_loss_mlp": 1.01744747, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.6552677650765266, + "language_loss": 0.6950298, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71601617, + "num_input_tokens_seen": 217473105, + "step": 10102, + "time_per_iteration": 4.223191022872925 + }, + { + "auxiliary_loss_clip": 0.01102525, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.03588438, + "balance_loss_mlp": 1.02192116, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.934033059196831, + "language_loss": 0.73369884, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75505352, + "num_input_tokens_seen": 217491780, + "step": 10103, + "time_per_iteration": 2.6199803352355957 + }, + { + "auxiliary_loss_clip": 0.0106294, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.0369823, + "balance_loss_mlp": 1.01956201, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.6279277134333467, + "language_loss": 0.76604545, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.78699803, + "num_input_tokens_seen": 217510605, + "step": 10104, + "time_per_iteration": 2.653630018234253 + }, + { + "auxiliary_loss_clip": 0.00999776, + "auxiliary_loss_mlp": 0.01006266, + "balance_loss_clip": 1.00854993, + "balance_loss_mlp": 1.005193, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7226585510604416, + "language_loss": 0.5600884, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58014882, + "num_input_tokens_seen": 217574815, + "step": 10105, + "time_per_iteration": 3.2143824100494385 + }, + { + "auxiliary_loss_clip": 0.01018787, + "auxiliary_loss_mlp": 0.01000083, + "balance_loss_clip": 1.00779462, + "balance_loss_mlp": 0.99906939, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7568963143201096, + "language_loss": 0.56854737, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58873606, + "num_input_tokens_seen": 217632375, + "step": 10106, + "time_per_iteration": 3.090531587600708 + }, + { + "auxiliary_loss_clip": 0.01044452, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.02867758, + "balance_loss_mlp": 1.02010798, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.6697947805985782, + "language_loss": 0.68743926, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70819992, + "num_input_tokens_seen": 217653055, + "step": 10107, + "time_per_iteration": 2.736173629760742 + }, + { + "auxiliary_loss_clip": 0.01087562, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.0328604, + "balance_loss_mlp": 1.01593113, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 1.7683106390314267, + "language_loss": 0.81127405, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83242738, + "num_input_tokens_seen": 217671520, + "step": 10108, + "time_per_iteration": 2.5454704761505127 + }, + { + "auxiliary_loss_clip": 0.01072973, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.03259611, + "balance_loss_mlp": 1.02161121, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.7461725551579135, + "language_loss": 0.71161568, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73268712, + "num_input_tokens_seen": 217691880, + "step": 10109, + "time_per_iteration": 2.749410629272461 + }, + { + "auxiliary_loss_clip": 0.01075085, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.03295016, + "balance_loss_mlp": 1.0197711, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.6956050354745296, + "language_loss": 0.80269885, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82375151, + "num_input_tokens_seen": 217710530, + "step": 10110, + "time_per_iteration": 2.6370322704315186 + }, + { + "auxiliary_loss_clip": 0.01070989, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.03144944, + "balance_loss_mlp": 1.01733613, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.5017341291996944, + "language_loss": 0.70167828, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72268307, + "num_input_tokens_seen": 217728650, + "step": 10111, + "time_per_iteration": 2.602710247039795 + }, + { + "auxiliary_loss_clip": 0.01071296, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.03619266, + "balance_loss_mlp": 1.01553404, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.5777996637504303, + "language_loss": 0.64817899, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.66916931, + "num_input_tokens_seen": 217747135, + "step": 10112, + "time_per_iteration": 2.754070281982422 + }, + { + "auxiliary_loss_clip": 0.01016137, + "auxiliary_loss_mlp": 0.01001722, + "balance_loss_clip": 1.00520635, + "balance_loss_mlp": 1.00062561, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6554067396979516, + "language_loss": 0.49557096, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51574957, + "num_input_tokens_seen": 217811860, + "step": 10113, + "time_per_iteration": 3.2223386764526367 + }, + { + "auxiliary_loss_clip": 0.01016261, + "auxiliary_loss_mlp": 0.01001056, + "balance_loss_clip": 1.00517774, + "balance_loss_mlp": 0.99997121, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8394735792088535, + "language_loss": 0.56971443, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58988762, + "num_input_tokens_seen": 217866510, + "step": 10114, + "time_per_iteration": 3.0705881118774414 + }, + { + "auxiliary_loss_clip": 0.01105022, + "auxiliary_loss_mlp": 0.01027495, + "balance_loss_clip": 1.03594112, + "balance_loss_mlp": 1.01453149, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.7239959894829733, + "language_loss": 0.70312661, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72445178, + "num_input_tokens_seen": 217885650, + "step": 10115, + "time_per_iteration": 3.9940099716186523 + }, + { + "auxiliary_loss_clip": 0.01057663, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.03047466, + "balance_loss_mlp": 1.01793325, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 1.9401214467494736, + "language_loss": 0.72461706, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74549687, + "num_input_tokens_seen": 217905300, + "step": 10116, + "time_per_iteration": 4.35906720161438 + }, + { + "auxiliary_loss_clip": 0.0107373, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.03258371, + "balance_loss_mlp": 1.02056599, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.6454544846409829, + "language_loss": 0.54026097, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.56133687, + "num_input_tokens_seen": 217927845, + "step": 10117, + "time_per_iteration": 2.7638700008392334 + }, + { + "auxiliary_loss_clip": 0.01081614, + "auxiliary_loss_mlp": 0.01024398, + "balance_loss_clip": 1.03484249, + "balance_loss_mlp": 1.01326394, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.9759628880027904, + "language_loss": 0.70183778, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72289789, + "num_input_tokens_seen": 217946145, + "step": 10118, + "time_per_iteration": 2.630816698074341 + }, + { + "auxiliary_loss_clip": 0.01035029, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.02999711, + "balance_loss_mlp": 1.01701856, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.453947338310685, + "language_loss": 0.74568236, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.76632154, + "num_input_tokens_seen": 217965190, + "step": 10119, + "time_per_iteration": 2.689152956008911 + }, + { + "auxiliary_loss_clip": 0.01087741, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.03340268, + "balance_loss_mlp": 1.02076185, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 2.141340064628446, + "language_loss": 0.67183191, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69302952, + "num_input_tokens_seen": 217983625, + "step": 10120, + "time_per_iteration": 2.6829633712768555 + }, + { + "auxiliary_loss_clip": 0.01092724, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.03503776, + "balance_loss_mlp": 1.02144504, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.6774492412533186, + "language_loss": 0.7444942, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76575875, + "num_input_tokens_seen": 218006005, + "step": 10121, + "time_per_iteration": 2.6938679218292236 + }, + { + "auxiliary_loss_clip": 0.01090138, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.03394532, + "balance_loss_mlp": 1.0160625, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 2.1625989288016343, + "language_loss": 0.80523753, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82641023, + "num_input_tokens_seen": 218024195, + "step": 10122, + "time_per_iteration": 2.6125712394714355 + }, + { + "auxiliary_loss_clip": 0.01082078, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.03313375, + "balance_loss_mlp": 1.01919425, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.6392147886919328, + "language_loss": 0.55618191, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57731527, + "num_input_tokens_seen": 218047190, + "step": 10123, + "time_per_iteration": 2.6758220195770264 + }, + { + "auxiliary_loss_clip": 0.01091921, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.03670287, + "balance_loss_mlp": 1.02370405, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.864660029219344, + "language_loss": 0.7424171, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76368678, + "num_input_tokens_seen": 218065945, + "step": 10124, + "time_per_iteration": 2.5916409492492676 + }, + { + "auxiliary_loss_clip": 0.01071955, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.03093171, + "balance_loss_mlp": 1.01868725, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 2.8718802386722384, + "language_loss": 0.65188766, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67291266, + "num_input_tokens_seen": 218085285, + "step": 10125, + "time_per_iteration": 2.6428780555725098 + }, + { + "auxiliary_loss_clip": 0.01100293, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.03544605, + "balance_loss_mlp": 1.02168012, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 6.792653655107613, + "language_loss": 0.75934327, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78067875, + "num_input_tokens_seen": 218104735, + "step": 10126, + "time_per_iteration": 2.5813796520233154 + }, + { + "auxiliary_loss_clip": 0.01064503, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.03298283, + "balance_loss_mlp": 1.01597989, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.4547917879421446, + "language_loss": 0.71535593, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73628342, + "num_input_tokens_seen": 218121855, + "step": 10127, + "time_per_iteration": 4.252619504928589 + }, + { + "auxiliary_loss_clip": 0.0110503, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.03598523, + "balance_loss_mlp": 1.02004886, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.171419317277695, + "language_loss": 0.72738999, + "learning_rate": 1.400812267497691e-06, + "loss": 0.74876839, + "num_input_tokens_seen": 218137325, + "step": 10128, + "time_per_iteration": 2.5727005004882812 + }, + { + "auxiliary_loss_clip": 0.01050457, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.03246498, + "balance_loss_mlp": 1.01584506, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.1653899609877523, + "language_loss": 0.73270315, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75347757, + "num_input_tokens_seen": 218155530, + "step": 10129, + "time_per_iteration": 2.797977924346924 + }, + { + "auxiliary_loss_clip": 0.01098827, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.03312624, + "balance_loss_mlp": 1.02132964, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.4367942803015836, + "language_loss": 0.66132194, + "learning_rate": 1.400069168015626e-06, + "loss": 0.6826362, + "num_input_tokens_seen": 218182535, + "step": 10130, + "time_per_iteration": 2.715801477432251 + }, + { + "auxiliary_loss_clip": 0.01071316, + "auxiliary_loss_mlp": 0.01025395, + "balance_loss_clip": 1.03173852, + "balance_loss_mlp": 1.01503015, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 2.055372855116904, + "language_loss": 0.77191317, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.7928803, + "num_input_tokens_seen": 218201740, + "step": 10131, + "time_per_iteration": 2.5412116050720215 + }, + { + "auxiliary_loss_clip": 0.01067235, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.03305638, + "balance_loss_mlp": 1.01958466, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.6495456882391568, + "language_loss": 0.77387607, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79484183, + "num_input_tokens_seen": 218219800, + "step": 10132, + "time_per_iteration": 2.668187379837036 + }, + { + "auxiliary_loss_clip": 0.01097985, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.03523445, + "balance_loss_mlp": 1.0191673, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.9209680708859178, + "language_loss": 0.75520277, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77648115, + "num_input_tokens_seen": 218237585, + "step": 10133, + "time_per_iteration": 2.5623397827148438 + }, + { + "auxiliary_loss_clip": 0.01087301, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.0328567, + "balance_loss_mlp": 1.01700151, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 2.1844063812508883, + "language_loss": 0.63728553, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.65843832, + "num_input_tokens_seen": 218258700, + "step": 10134, + "time_per_iteration": 2.635329484939575 + }, + { + "auxiliary_loss_clip": 0.01073344, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.03085411, + "balance_loss_mlp": 1.01581585, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.9619530490163946, + "language_loss": 0.78931117, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.8103137, + "num_input_tokens_seen": 218275655, + "step": 10135, + "time_per_iteration": 2.8667030334472656 + }, + { + "auxiliary_loss_clip": 0.01076782, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.03358579, + "balance_loss_mlp": 1.01692653, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 2.0650028913031337, + "language_loss": 0.72209948, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74314404, + "num_input_tokens_seen": 218295720, + "step": 10136, + "time_per_iteration": 2.845855474472046 + }, + { + "auxiliary_loss_clip": 0.01102793, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.03549016, + "balance_loss_mlp": 1.01779675, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 1.9966095798661796, + "language_loss": 0.74810684, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76942945, + "num_input_tokens_seen": 218316745, + "step": 10137, + "time_per_iteration": 2.6322743892669678 + }, + { + "auxiliary_loss_clip": 0.01082881, + "auxiliary_loss_mlp": 0.01037193, + "balance_loss_clip": 1.03124809, + "balance_loss_mlp": 1.02409816, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 1.6854423601190114, + "language_loss": 0.80468607, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82588685, + "num_input_tokens_seen": 218335385, + "step": 10138, + "time_per_iteration": 2.5510189533233643 + }, + { + "auxiliary_loss_clip": 0.01070266, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.03175592, + "balance_loss_mlp": 1.01975989, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.7499601785446985, + "language_loss": 0.81346786, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83447266, + "num_input_tokens_seen": 218353320, + "step": 10139, + "time_per_iteration": 2.5932018756866455 + }, + { + "auxiliary_loss_clip": 0.01060629, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.03295302, + "balance_loss_mlp": 1.02210832, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 1.9472362031399115, + "language_loss": 0.83381903, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85477006, + "num_input_tokens_seen": 218365620, + "step": 10140, + "time_per_iteration": 2.6154377460479736 + }, + { + "auxiliary_loss_clip": 0.01090629, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.03330112, + "balance_loss_mlp": 1.01629829, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 3.8677623255714, + "language_loss": 0.7538904, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77507508, + "num_input_tokens_seen": 218383785, + "step": 10141, + "time_per_iteration": 2.6176371574401855 + }, + { + "auxiliary_loss_clip": 0.01068875, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.03132355, + "balance_loss_mlp": 1.01808977, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 2.284738846819882, + "language_loss": 0.76593286, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78692704, + "num_input_tokens_seen": 218399055, + "step": 10142, + "time_per_iteration": 4.158963918685913 + }, + { + "auxiliary_loss_clip": 0.01099649, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.03383541, + "balance_loss_mlp": 1.01980364, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.7049483971890984, + "language_loss": 0.76883018, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.79014647, + "num_input_tokens_seen": 218419120, + "step": 10143, + "time_per_iteration": 2.658655881881714 + }, + { + "auxiliary_loss_clip": 0.01081349, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.03102446, + "balance_loss_mlp": 1.02260935, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.8331767928843474, + "language_loss": 0.75603288, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77720392, + "num_input_tokens_seen": 218435290, + "step": 10144, + "time_per_iteration": 2.5045669078826904 + }, + { + "auxiliary_loss_clip": 0.01070694, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.03201652, + "balance_loss_mlp": 1.0177443, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.87024605802728, + "language_loss": 0.72900981, + "learning_rate": 1.394498830235383e-06, + "loss": 0.75001293, + "num_input_tokens_seen": 218457880, + "step": 10145, + "time_per_iteration": 2.862640380859375 + }, + { + "auxiliary_loss_clip": 0.01073706, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.03091311, + "balance_loss_mlp": 1.02293563, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 2.1591430509001066, + "language_loss": 0.69106495, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71214175, + "num_input_tokens_seen": 218475930, + "step": 10146, + "time_per_iteration": 2.7534866333007812 + }, + { + "auxiliary_loss_clip": 0.01053546, + "auxiliary_loss_mlp": 0.00749242, + "balance_loss_clip": 1.03247547, + "balance_loss_mlp": 1.00039113, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.6892534784166038, + "language_loss": 0.76692986, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78495771, + "num_input_tokens_seen": 218493675, + "step": 10147, + "time_per_iteration": 2.6584136486053467 + }, + { + "auxiliary_loss_clip": 0.0107513, + "auxiliary_loss_mlp": 0.0102587, + "balance_loss_clip": 1.03049612, + "balance_loss_mlp": 1.01466441, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.9083619515908983, + "language_loss": 0.78562325, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80663323, + "num_input_tokens_seen": 218511780, + "step": 10148, + "time_per_iteration": 2.5431180000305176 + }, + { + "auxiliary_loss_clip": 0.01063717, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.02972472, + "balance_loss_mlp": 1.01785481, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 1.8932747859755352, + "language_loss": 0.53746963, + "learning_rate": 1.39301427737093e-06, + "loss": 0.55841947, + "num_input_tokens_seen": 218531850, + "step": 10149, + "time_per_iteration": 2.725456714630127 + }, + { + "auxiliary_loss_clip": 0.01076457, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.0347898, + "balance_loss_mlp": 1.02121818, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.9878642802340651, + "language_loss": 0.80299401, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82407928, + "num_input_tokens_seen": 218551245, + "step": 10150, + "time_per_iteration": 2.605679750442505 + }, + { + "auxiliary_loss_clip": 0.01077551, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.03622484, + "balance_loss_mlp": 1.0247432, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.4908853478828472, + "language_loss": 0.68715143, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.70828927, + "num_input_tokens_seen": 218571365, + "step": 10151, + "time_per_iteration": 2.6344330310821533 + }, + { + "auxiliary_loss_clip": 0.01096999, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.03205812, + "balance_loss_mlp": 1.01722169, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.779132597024574, + "language_loss": 0.71248335, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.73372793, + "num_input_tokens_seen": 218588315, + "step": 10152, + "time_per_iteration": 2.7183303833007812 + }, + { + "auxiliary_loss_clip": 0.0106454, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.03412151, + "balance_loss_mlp": 1.01723135, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 1.7462743464362407, + "language_loss": 0.78065795, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80158782, + "num_input_tokens_seen": 218605940, + "step": 10153, + "time_per_iteration": 2.643651008605957 + }, + { + "auxiliary_loss_clip": 0.01073141, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.03266621, + "balance_loss_mlp": 1.01799786, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.682030776910062, + "language_loss": 0.79515618, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81618702, + "num_input_tokens_seen": 218626100, + "step": 10154, + "time_per_iteration": 4.2531328201293945 + }, + { + "auxiliary_loss_clip": 0.01088861, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.03435898, + "balance_loss_mlp": 1.01853085, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.6361026841382789, + "language_loss": 0.7046293, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.725815, + "num_input_tokens_seen": 218645060, + "step": 10155, + "time_per_iteration": 2.6735646724700928 + }, + { + "auxiliary_loss_clip": 0.01090743, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.03537726, + "balance_loss_mlp": 1.01956046, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.5160590203849869, + "language_loss": 0.71284401, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73407024, + "num_input_tokens_seen": 218667690, + "step": 10156, + "time_per_iteration": 4.23897385597229 + }, + { + "auxiliary_loss_clip": 0.01076303, + "auxiliary_loss_mlp": 0.01027087, + "balance_loss_clip": 1.03453004, + "balance_loss_mlp": 1.01564932, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.6526908785850005, + "language_loss": 0.67103189, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69206583, + "num_input_tokens_seen": 218687505, + "step": 10157, + "time_per_iteration": 2.596877098083496 + }, + { + "auxiliary_loss_clip": 0.01057055, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.0303117, + "balance_loss_mlp": 1.01289606, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 1.7900248477332896, + "language_loss": 0.7233026, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74411929, + "num_input_tokens_seen": 218705315, + "step": 10158, + "time_per_iteration": 2.7380874156951904 + }, + { + "auxiliary_loss_clip": 0.01085247, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.03418887, + "balance_loss_mlp": 1.01990557, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.5238732980359524, + "language_loss": 0.6929028, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71406877, + "num_input_tokens_seen": 218725735, + "step": 10159, + "time_per_iteration": 2.715744972229004 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.03500867, + "balance_loss_mlp": 1.01914728, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.8005840298221703, + "language_loss": 0.78889394, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81022191, + "num_input_tokens_seen": 218743215, + "step": 10160, + "time_per_iteration": 2.5176713466644287 + }, + { + "auxiliary_loss_clip": 0.0101555, + "auxiliary_loss_mlp": 0.00999324, + "balance_loss_clip": 1.00486732, + "balance_loss_mlp": 0.99826318, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8353371212097578, + "language_loss": 0.61509562, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63524437, + "num_input_tokens_seen": 218806440, + "step": 10161, + "time_per_iteration": 3.2697083950042725 + }, + { + "auxiliary_loss_clip": 0.01082937, + "auxiliary_loss_mlp": 0.00749612, + "balance_loss_clip": 1.03584409, + "balance_loss_mlp": 1.00038302, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.8150981483662023, + "language_loss": 0.7613852, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.77971065, + "num_input_tokens_seen": 218825720, + "step": 10162, + "time_per_iteration": 2.5465760231018066 + }, + { + "auxiliary_loss_clip": 0.0109903, + "auxiliary_loss_mlp": 0.01028104, + "balance_loss_clip": 1.03368282, + "balance_loss_mlp": 1.01642203, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 1.602006219529199, + "language_loss": 0.71518052, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73645186, + "num_input_tokens_seen": 218847735, + "step": 10163, + "time_per_iteration": 2.625885486602783 + }, + { + "auxiliary_loss_clip": 0.01096892, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.03322554, + "balance_loss_mlp": 1.01874328, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 2.207470664475024, + "language_loss": 0.59609789, + "learning_rate": 1.387450491396625e-06, + "loss": 0.6173622, + "num_input_tokens_seen": 218866585, + "step": 10164, + "time_per_iteration": 2.5100255012512207 + }, + { + "auxiliary_loss_clip": 0.01082126, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.03313041, + "balance_loss_mlp": 1.02135086, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.6096491339272432, + "language_loss": 0.75882518, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.77996826, + "num_input_tokens_seen": 218885560, + "step": 10165, + "time_per_iteration": 2.5176198482513428 + }, + { + "auxiliary_loss_clip": 0.01077874, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.03522372, + "balance_loss_mlp": 1.01616549, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.4437730377971867, + "language_loss": 0.79299587, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81405133, + "num_input_tokens_seen": 218905055, + "step": 10166, + "time_per_iteration": 2.599735736846924 + }, + { + "auxiliary_loss_clip": 0.01076896, + "auxiliary_loss_mlp": 0.01027513, + "balance_loss_clip": 1.03403163, + "balance_loss_mlp": 1.01593184, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 2.4489276853681057, + "language_loss": 0.67631102, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69735515, + "num_input_tokens_seen": 218924030, + "step": 10167, + "time_per_iteration": 2.6762430667877197 + }, + { + "auxiliary_loss_clip": 0.01098775, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.03482878, + "balance_loss_mlp": 1.02070153, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.6538900924889082, + "language_loss": 0.79367864, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81498051, + "num_input_tokens_seen": 218943750, + "step": 10168, + "time_per_iteration": 4.156186103820801 + }, + { + "auxiliary_loss_clip": 0.01105603, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.03494322, + "balance_loss_mlp": 1.02351248, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 2.5697830628363802, + "language_loss": 0.85831571, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.8797363, + "num_input_tokens_seen": 218957585, + "step": 10169, + "time_per_iteration": 2.4715895652770996 + }, + { + "auxiliary_loss_clip": 0.01096153, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.0324297, + "balance_loss_mlp": 1.01989412, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 1.655088639535648, + "language_loss": 0.78550446, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.8067652, + "num_input_tokens_seen": 218980025, + "step": 10170, + "time_per_iteration": 2.701984167098999 + }, + { + "auxiliary_loss_clip": 0.01081189, + "auxiliary_loss_mlp": 0.01038087, + "balance_loss_clip": 1.03332365, + "balance_loss_mlp": 1.02536178, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 19.167213909939797, + "language_loss": 0.68571138, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.70690411, + "num_input_tokens_seen": 218998200, + "step": 10171, + "time_per_iteration": 2.569450855255127 + }, + { + "auxiliary_loss_clip": 0.01067998, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.03143454, + "balance_loss_mlp": 1.02335727, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.7160108910405665, + "language_loss": 0.7912178, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.8122617, + "num_input_tokens_seen": 219017910, + "step": 10172, + "time_per_iteration": 2.6851861476898193 + }, + { + "auxiliary_loss_clip": 0.01071356, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.03523207, + "balance_loss_mlp": 1.0209775, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.8880675650021284, + "language_loss": 0.67349446, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.6945377, + "num_input_tokens_seen": 219037730, + "step": 10173, + "time_per_iteration": 2.6364805698394775 + }, + { + "auxiliary_loss_clip": 0.01074516, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.03353691, + "balance_loss_mlp": 1.02333438, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.7409716744537072, + "language_loss": 0.55794817, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57905036, + "num_input_tokens_seen": 219056755, + "step": 10174, + "time_per_iteration": 2.5905051231384277 + }, + { + "auxiliary_loss_clip": 0.01081608, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.03541398, + "balance_loss_mlp": 1.02322662, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 1.869014385401811, + "language_loss": 0.66073024, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68189704, + "num_input_tokens_seen": 219076985, + "step": 10175, + "time_per_iteration": 2.6536874771118164 + }, + { + "auxiliary_loss_clip": 0.01085472, + "auxiliary_loss_mlp": 0.0074935, + "balance_loss_clip": 1.03091407, + "balance_loss_mlp": 1.00040078, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.0634142806922964, + "language_loss": 0.82601875, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84436697, + "num_input_tokens_seen": 219096050, + "step": 10176, + "time_per_iteration": 2.594904661178589 + }, + { + "auxiliary_loss_clip": 0.01081639, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.03443873, + "balance_loss_mlp": 1.0220499, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 1.8550190532333577, + "language_loss": 0.77553809, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79669678, + "num_input_tokens_seen": 219112665, + "step": 10177, + "time_per_iteration": 2.6784119606018066 + }, + { + "auxiliary_loss_clip": 0.01082461, + "auxiliary_loss_mlp": 0.00749454, + "balance_loss_clip": 1.03143477, + "balance_loss_mlp": 1.00032723, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 1.9685823548542811, + "language_loss": 0.75539225, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77371144, + "num_input_tokens_seen": 219129120, + "step": 10178, + "time_per_iteration": 2.5450196266174316 + }, + { + "auxiliary_loss_clip": 0.01073026, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.03238368, + "balance_loss_mlp": 1.0264132, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.7385754361811128, + "language_loss": 0.66755474, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.68868566, + "num_input_tokens_seen": 219148950, + "step": 10179, + "time_per_iteration": 2.6360368728637695 + }, + { + "auxiliary_loss_clip": 0.01081875, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.03566873, + "balance_loss_mlp": 1.02340221, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.8886545792167884, + "language_loss": 0.83691818, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.85807717, + "num_input_tokens_seen": 219165585, + "step": 10180, + "time_per_iteration": 2.8073976039886475 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.03561187, + "balance_loss_mlp": 1.01581383, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.5888155938768953, + "language_loss": 0.77512664, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.7964201, + "num_input_tokens_seen": 219183280, + "step": 10181, + "time_per_iteration": 2.5180065631866455 + }, + { + "auxiliary_loss_clip": 0.01101178, + "auxiliary_loss_mlp": 0.01032198, + "balance_loss_clip": 1.03578794, + "balance_loss_mlp": 1.02046776, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 2.3323762113969853, + "language_loss": 0.80911815, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.83045197, + "num_input_tokens_seen": 219197200, + "step": 10182, + "time_per_iteration": 4.013128280639648 + }, + { + "auxiliary_loss_clip": 0.01060067, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.0312494, + "balance_loss_mlp": 1.01921368, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.6861181522473467, + "language_loss": 0.82871711, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.84961534, + "num_input_tokens_seen": 219216825, + "step": 10183, + "time_per_iteration": 2.663482427597046 + }, + { + "auxiliary_loss_clip": 0.01016006, + "auxiliary_loss_mlp": 0.01008196, + "balance_loss_clip": 1.0073936, + "balance_loss_mlp": 1.00698555, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7123801885079141, + "language_loss": 0.62901068, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64925277, + "num_input_tokens_seen": 219283795, + "step": 10184, + "time_per_iteration": 3.232875347137451 + }, + { + "auxiliary_loss_clip": 0.01092801, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.03662765, + "balance_loss_mlp": 1.01872206, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 2.072074787201864, + "language_loss": 0.82101971, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84224367, + "num_input_tokens_seen": 219302385, + "step": 10185, + "time_per_iteration": 2.5229713916778564 + }, + { + "auxiliary_loss_clip": 0.01073444, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.03378749, + "balance_loss_mlp": 1.01929986, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 6.7442821311146535, + "language_loss": 0.74530202, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76635146, + "num_input_tokens_seen": 219319765, + "step": 10186, + "time_per_iteration": 2.584296703338623 + }, + { + "auxiliary_loss_clip": 0.01081823, + "auxiliary_loss_mlp": 0.01028702, + "balance_loss_clip": 1.03187382, + "balance_loss_mlp": 1.01831901, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.7031483685801858, + "language_loss": 0.7839849, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80509007, + "num_input_tokens_seen": 219337440, + "step": 10187, + "time_per_iteration": 2.512921094894409 + }, + { + "auxiliary_loss_clip": 0.0109735, + "auxiliary_loss_mlp": 0.01027533, + "balance_loss_clip": 1.03243661, + "balance_loss_mlp": 1.0160892, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.5966437829633642, + "language_loss": 0.83134389, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85259271, + "num_input_tokens_seen": 219357525, + "step": 10188, + "time_per_iteration": 2.5254528522491455 + }, + { + "auxiliary_loss_clip": 0.01070333, + "auxiliary_loss_mlp": 0.01026263, + "balance_loss_clip": 1.03435087, + "balance_loss_mlp": 1.01443172, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.8202256292587318, + "language_loss": 0.75559545, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77656144, + "num_input_tokens_seen": 219374855, + "step": 10189, + "time_per_iteration": 2.6067802906036377 + }, + { + "auxiliary_loss_clip": 0.01082641, + "auxiliary_loss_mlp": 0.01029929, + "balance_loss_clip": 1.03181159, + "balance_loss_mlp": 1.01807976, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.5305607434545099, + "language_loss": 0.74201286, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76313853, + "num_input_tokens_seen": 219394740, + "step": 10190, + "time_per_iteration": 2.8385727405548096 + }, + { + "auxiliary_loss_clip": 0.01090824, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.02148795, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.5568318173226525, + "language_loss": 0.68375957, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70500612, + "num_input_tokens_seen": 219413755, + "step": 10191, + "time_per_iteration": 2.5573880672454834 + }, + { + "auxiliary_loss_clip": 0.01082951, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.03102851, + "balance_loss_mlp": 1.02469635, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.064585909714341, + "language_loss": 0.73510754, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75630325, + "num_input_tokens_seen": 219433560, + "step": 10192, + "time_per_iteration": 2.5473110675811768 + }, + { + "auxiliary_loss_clip": 0.01060236, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.03373766, + "balance_loss_mlp": 1.01706398, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 2.2635976683761574, + "language_loss": 0.83386827, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85475111, + "num_input_tokens_seen": 219452640, + "step": 10193, + "time_per_iteration": 2.5723962783813477 + }, + { + "auxiliary_loss_clip": 0.01067337, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.03294301, + "balance_loss_mlp": 1.01985013, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.1641147552370725, + "language_loss": 0.70178252, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.72276723, + "num_input_tokens_seen": 219468585, + "step": 10194, + "time_per_iteration": 2.599034547805786 + }, + { + "auxiliary_loss_clip": 0.00995863, + "auxiliary_loss_mlp": 0.01001837, + "balance_loss_clip": 1.00513172, + "balance_loss_mlp": 1.00085342, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8191349004682127, + "language_loss": 0.58729714, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60727412, + "num_input_tokens_seen": 219523015, + "step": 10195, + "time_per_iteration": 4.430786609649658 + }, + { + "auxiliary_loss_clip": 0.01079812, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.03433156, + "balance_loss_mlp": 1.02168751, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 2.5055479913485335, + "language_loss": 0.69644362, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71757764, + "num_input_tokens_seen": 219539980, + "step": 10196, + "time_per_iteration": 2.556283712387085 + }, + { + "auxiliary_loss_clip": 0.01070068, + "auxiliary_loss_mlp": 0.0103615, + "balance_loss_clip": 1.03098726, + "balance_loss_mlp": 1.02452755, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.8166437238997706, + "language_loss": 0.71181911, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73288131, + "num_input_tokens_seen": 219556980, + "step": 10197, + "time_per_iteration": 4.077419996261597 + }, + { + "auxiliary_loss_clip": 0.01085596, + "auxiliary_loss_mlp": 0.01041406, + "balance_loss_clip": 1.03341138, + "balance_loss_mlp": 1.02778029, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 1.8985077506589318, + "language_loss": 0.79137278, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81264281, + "num_input_tokens_seen": 219576410, + "step": 10198, + "time_per_iteration": 2.5688226222991943 + }, + { + "auxiliary_loss_clip": 0.01073947, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.03521454, + "balance_loss_mlp": 1.0187, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 2.0599087577343673, + "language_loss": 0.74283355, + "learning_rate": 1.374488730519181e-06, + "loss": 0.763879, + "num_input_tokens_seen": 219597180, + "step": 10199, + "time_per_iteration": 2.620746612548828 + }, + { + "auxiliary_loss_clip": 0.01077779, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.034127, + "balance_loss_mlp": 1.02456331, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 3.0810071843155162, + "language_loss": 0.62087917, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64202774, + "num_input_tokens_seen": 219617630, + "step": 10200, + "time_per_iteration": 2.585641384124756 + }, + { + "auxiliary_loss_clip": 0.01073705, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.03373218, + "balance_loss_mlp": 1.02066994, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 3.761031331162947, + "language_loss": 0.68667138, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70773035, + "num_input_tokens_seen": 219637025, + "step": 10201, + "time_per_iteration": 2.6074471473693848 + }, + { + "auxiliary_loss_clip": 0.01077272, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.03224468, + "balance_loss_mlp": 1.01880217, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 3.8283868736076627, + "language_loss": 0.8326689, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85374457, + "num_input_tokens_seen": 219656625, + "step": 10202, + "time_per_iteration": 2.609935998916626 + }, + { + "auxiliary_loss_clip": 0.01025405, + "auxiliary_loss_mlp": 0.0100239, + "balance_loss_clip": 1.00455093, + "balance_loss_mlp": 1.00135851, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.8798777936505509, + "language_loss": 0.67093605, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69121397, + "num_input_tokens_seen": 219718090, + "step": 10203, + "time_per_iteration": 3.0899343490600586 + }, + { + "auxiliary_loss_clip": 0.01091934, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03533268, + "balance_loss_mlp": 1.01744556, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 2.3996887082113627, + "language_loss": 0.61261046, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63382053, + "num_input_tokens_seen": 219740100, + "step": 10204, + "time_per_iteration": 2.7336983680725098 + }, + { + "auxiliary_loss_clip": 0.01066709, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.03422403, + "balance_loss_mlp": 1.01825917, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.7647592333193252, + "language_loss": 0.72516012, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.7461229, + "num_input_tokens_seen": 219761225, + "step": 10205, + "time_per_iteration": 2.6883857250213623 + }, + { + "auxiliary_loss_clip": 0.01085902, + "auxiliary_loss_mlp": 0.01021406, + "balance_loss_clip": 1.03362393, + "balance_loss_mlp": 1.00974751, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.7218515102665488, + "language_loss": 0.7596792, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78075224, + "num_input_tokens_seen": 219780085, + "step": 10206, + "time_per_iteration": 2.563898801803589 + }, + { + "auxiliary_loss_clip": 0.0106698, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03724742, + "balance_loss_mlp": 1.01694775, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 2.370379851319701, + "language_loss": 0.75732827, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77828443, + "num_input_tokens_seen": 219797895, + "step": 10207, + "time_per_iteration": 4.217181921005249 + }, + { + "auxiliary_loss_clip": 0.01086176, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.03244209, + "balance_loss_mlp": 1.02191472, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.033058165494071, + "language_loss": 0.82221007, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.8434, + "num_input_tokens_seen": 219811295, + "step": 10208, + "time_per_iteration": 2.540050506591797 + }, + { + "auxiliary_loss_clip": 0.01081772, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.03457665, + "balance_loss_mlp": 1.02268076, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 1.7729346762014118, + "language_loss": 0.72451818, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74569046, + "num_input_tokens_seen": 219832735, + "step": 10209, + "time_per_iteration": 2.7421655654907227 + }, + { + "auxiliary_loss_clip": 0.0109916, + "auxiliary_loss_mlp": 0.01036459, + "balance_loss_clip": 1.03535724, + "balance_loss_mlp": 1.02518821, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.5308383683532418, + "language_loss": 0.74123836, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76259458, + "num_input_tokens_seen": 219852755, + "step": 10210, + "time_per_iteration": 2.5481419563293457 + }, + { + "auxiliary_loss_clip": 0.00999033, + "auxiliary_loss_mlp": 0.01001741, + "balance_loss_clip": 1.00682163, + "balance_loss_mlp": 1.00056696, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8610941571769227, + "language_loss": 0.650442, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67044973, + "num_input_tokens_seen": 219922785, + "step": 10211, + "time_per_iteration": 3.377122402191162 + }, + { + "auxiliary_loss_clip": 0.01076965, + "auxiliary_loss_mlp": 0.00749406, + "balance_loss_clip": 1.03275061, + "balance_loss_mlp": 1.00027347, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 3.6830056141599923, + "language_loss": 0.75517559, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77343929, + "num_input_tokens_seen": 219942215, + "step": 10212, + "time_per_iteration": 2.640474796295166 + }, + { + "auxiliary_loss_clip": 0.01070548, + "auxiliary_loss_mlp": 0.01038859, + "balance_loss_clip": 1.03147995, + "balance_loss_mlp": 1.02596021, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.5226935163134327, + "language_loss": 0.73996943, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76106352, + "num_input_tokens_seen": 219963830, + "step": 10213, + "time_per_iteration": 2.593977928161621 + }, + { + "auxiliary_loss_clip": 0.01083957, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.0354507, + "balance_loss_mlp": 1.02092957, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.5464496341043052, + "language_loss": 0.73012781, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.7513026, + "num_input_tokens_seen": 219983815, + "step": 10214, + "time_per_iteration": 2.6011431217193604 + }, + { + "auxiliary_loss_clip": 0.01101267, + "auxiliary_loss_mlp": 0.01032165, + "balance_loss_clip": 1.0344305, + "balance_loss_mlp": 1.01993966, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.7064793428988894, + "language_loss": 0.74268347, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76401776, + "num_input_tokens_seen": 220003165, + "step": 10215, + "time_per_iteration": 2.55133056640625 + }, + { + "auxiliary_loss_clip": 0.0108658, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.03458285, + "balance_loss_mlp": 1.01902962, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 2.1527666656440743, + "language_loss": 0.7815429, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80271327, + "num_input_tokens_seen": 220021015, + "step": 10216, + "time_per_iteration": 2.5598244667053223 + }, + { + "auxiliary_loss_clip": 0.01099333, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.03519166, + "balance_loss_mlp": 1.01898623, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 1.9552263929518547, + "language_loss": 0.7955603, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.81686848, + "num_input_tokens_seen": 220035780, + "step": 10217, + "time_per_iteration": 2.478135347366333 + }, + { + "auxiliary_loss_clip": 0.01080493, + "auxiliary_loss_mlp": 0.01024786, + "balance_loss_clip": 1.03377151, + "balance_loss_mlp": 1.01318097, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.4584503925540395, + "language_loss": 0.78585529, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80690801, + "num_input_tokens_seen": 220054280, + "step": 10218, + "time_per_iteration": 2.6650354862213135 + }, + { + "auxiliary_loss_clip": 0.01089515, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.03452301, + "balance_loss_mlp": 1.02142978, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.4945463660987366, + "language_loss": 0.81973094, + "learning_rate": 1.367095017101569e-06, + "loss": 0.8409549, + "num_input_tokens_seen": 220074120, + "step": 10219, + "time_per_iteration": 2.6072890758514404 + }, + { + "auxiliary_loss_clip": 0.01085652, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.03209937, + "balance_loss_mlp": 1.01864469, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1.7614484383620341, + "language_loss": 0.67072266, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.6918841, + "num_input_tokens_seen": 220096320, + "step": 10220, + "time_per_iteration": 2.7387564182281494 + }, + { + "auxiliary_loss_clip": 0.0108634, + "auxiliary_loss_mlp": 0.0102541, + "balance_loss_clip": 1.03204322, + "balance_loss_mlp": 1.01438951, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 1.7830990683266672, + "language_loss": 0.71501666, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73613417, + "num_input_tokens_seen": 220114850, + "step": 10221, + "time_per_iteration": 4.026460647583008 + }, + { + "auxiliary_loss_clip": 0.01050152, + "auxiliary_loss_mlp": 0.01025733, + "balance_loss_clip": 1.028409, + "balance_loss_mlp": 1.0150404, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.8861684353060653, + "language_loss": 0.79437637, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81513524, + "num_input_tokens_seen": 220133395, + "step": 10222, + "time_per_iteration": 2.626450300216675 + }, + { + "auxiliary_loss_clip": 0.01072605, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.03289032, + "balance_loss_mlp": 1.0239327, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 1.8855036674244106, + "language_loss": 0.76138151, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78247184, + "num_input_tokens_seen": 220152790, + "step": 10223, + "time_per_iteration": 2.6476752758026123 + }, + { + "auxiliary_loss_clip": 0.0107666, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.03407991, + "balance_loss_mlp": 1.02191401, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 2.257810372591337, + "language_loss": 0.78323346, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.8043313, + "num_input_tokens_seen": 220169535, + "step": 10224, + "time_per_iteration": 2.5353333950042725 + }, + { + "auxiliary_loss_clip": 0.01059647, + "auxiliary_loss_mlp": 0.01027794, + "balance_loss_clip": 1.02835667, + "balance_loss_mlp": 1.01726758, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.226378663219798, + "language_loss": 0.66463232, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68550676, + "num_input_tokens_seen": 220195305, + "step": 10225, + "time_per_iteration": 2.953566074371338 + }, + { + "auxiliary_loss_clip": 0.01087417, + "auxiliary_loss_mlp": 0.00749439, + "balance_loss_clip": 1.03510642, + "balance_loss_mlp": 1.0002774, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 6.228546751835443, + "language_loss": 0.63411945, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65248805, + "num_input_tokens_seen": 220215040, + "step": 10226, + "time_per_iteration": 2.639986515045166 + }, + { + "auxiliary_loss_clip": 0.01073784, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.03308177, + "balance_loss_mlp": 1.02044141, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.7001316341743011, + "language_loss": 0.75463879, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77570188, + "num_input_tokens_seen": 220234205, + "step": 10227, + "time_per_iteration": 2.602581024169922 + }, + { + "auxiliary_loss_clip": 0.01036095, + "auxiliary_loss_mlp": 0.01042281, + "balance_loss_clip": 1.02884042, + "balance_loss_mlp": 1.02751744, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 2.2847095247248403, + "language_loss": 0.6197933, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.64057702, + "num_input_tokens_seen": 220252730, + "step": 10228, + "time_per_iteration": 2.6161246299743652 + }, + { + "auxiliary_loss_clip": 0.0107294, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.03091264, + "balance_loss_mlp": 1.01776779, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3413246414916582, + "language_loss": 0.74230438, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.7633301, + "num_input_tokens_seen": 220273345, + "step": 10229, + "time_per_iteration": 2.629218816757202 + }, + { + "auxiliary_loss_clip": 0.01102276, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.03647208, + "balance_loss_mlp": 1.02584839, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.6962189794919953, + "language_loss": 0.78138459, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80278754, + "num_input_tokens_seen": 220293845, + "step": 10230, + "time_per_iteration": 2.491396188735962 + }, + { + "auxiliary_loss_clip": 0.01069156, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.0316546, + "balance_loss_mlp": 1.01782465, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.5006593687884073, + "language_loss": 0.73030907, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75129569, + "num_input_tokens_seen": 220316070, + "step": 10231, + "time_per_iteration": 2.665304183959961 + }, + { + "auxiliary_loss_clip": 0.01081081, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.03516936, + "balance_loss_mlp": 1.02259743, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.5534363266655238, + "language_loss": 0.6964339, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71758211, + "num_input_tokens_seen": 220335695, + "step": 10232, + "time_per_iteration": 2.6288352012634277 + }, + { + "auxiliary_loss_clip": 0.01082355, + "auxiliary_loss_mlp": 0.00749315, + "balance_loss_clip": 1.03133726, + "balance_loss_mlp": 1.00024641, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.9191330151324395, + "language_loss": 0.91542399, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93374068, + "num_input_tokens_seen": 220353720, + "step": 10233, + "time_per_iteration": 2.5648491382598877 + }, + { + "auxiliary_loss_clip": 0.01068487, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.03360248, + "balance_loss_mlp": 1.02372336, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.7381121300444287, + "language_loss": 0.71206868, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73308986, + "num_input_tokens_seen": 220372515, + "step": 10234, + "time_per_iteration": 2.623143434524536 + }, + { + "auxiliary_loss_clip": 0.01089744, + "auxiliary_loss_mlp": 0.00749554, + "balance_loss_clip": 1.03293967, + "balance_loss_mlp": 1.00039887, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 6.702823218446223, + "language_loss": 0.66772115, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.68611407, + "num_input_tokens_seen": 220393490, + "step": 10235, + "time_per_iteration": 4.157837152481079 + }, + { + "auxiliary_loss_clip": 0.01085467, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.01716852, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 1.7656490633919184, + "language_loss": 0.80981624, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.830962, + "num_input_tokens_seen": 220412855, + "step": 10236, + "time_per_iteration": 2.585118532180786 + }, + { + "auxiliary_loss_clip": 0.01103043, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.03436625, + "balance_loss_mlp": 1.01981294, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 2.2265214092912324, + "language_loss": 0.80563724, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82698089, + "num_input_tokens_seen": 220433440, + "step": 10237, + "time_per_iteration": 4.127067804336548 + }, + { + "auxiliary_loss_clip": 0.01085513, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.03587604, + "balance_loss_mlp": 1.02686596, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.682999679173296, + "language_loss": 0.76015717, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.7813983, + "num_input_tokens_seen": 220453445, + "step": 10238, + "time_per_iteration": 2.6135010719299316 + }, + { + "auxiliary_loss_clip": 0.00979088, + "auxiliary_loss_mlp": 0.01004224, + "balance_loss_clip": 1.00716925, + "balance_loss_mlp": 1.00284123, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7639728255316666, + "language_loss": 0.5767144, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59654754, + "num_input_tokens_seen": 220509730, + "step": 10239, + "time_per_iteration": 3.2347869873046875 + }, + { + "auxiliary_loss_clip": 0.01080451, + "auxiliary_loss_mlp": 0.01035225, + "balance_loss_clip": 1.03233421, + "balance_loss_mlp": 1.02300596, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 3.101156639847627, + "language_loss": 0.77877128, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79992801, + "num_input_tokens_seen": 220527295, + "step": 10240, + "time_per_iteration": 2.4926834106445312 + }, + { + "auxiliary_loss_clip": 0.01104836, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03751135, + "balance_loss_mlp": 1.01910686, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.1533689834993197, + "language_loss": 0.72500372, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.74636239, + "num_input_tokens_seen": 220542730, + "step": 10241, + "time_per_iteration": 2.482542037963867 + }, + { + "auxiliary_loss_clip": 0.01100102, + "auxiliary_loss_mlp": 0.01027694, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.01649451, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.7118125940122941, + "language_loss": 0.71890199, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.74017996, + "num_input_tokens_seen": 220562995, + "step": 10242, + "time_per_iteration": 2.5108330249786377 + }, + { + "auxiliary_loss_clip": 0.01090579, + "auxiliary_loss_mlp": 0.01028964, + "balance_loss_clip": 1.0342536, + "balance_loss_mlp": 1.01757991, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 1.8512411177884274, + "language_loss": 0.71953064, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74072611, + "num_input_tokens_seen": 220581775, + "step": 10243, + "time_per_iteration": 2.5492610931396484 + }, + { + "auxiliary_loss_clip": 0.01017247, + "auxiliary_loss_mlp": 0.01004065, + "balance_loss_clip": 1.00639606, + "balance_loss_mlp": 1.00296783, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.79153186358119, + "language_loss": 0.56830573, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58851886, + "num_input_tokens_seen": 220646395, + "step": 10244, + "time_per_iteration": 3.132941722869873 + }, + { + "auxiliary_loss_clip": 0.01101142, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.0343318, + "balance_loss_mlp": 1.0201354, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.578013494613147, + "language_loss": 0.63679695, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.658131, + "num_input_tokens_seen": 220668335, + "step": 10245, + "time_per_iteration": 2.609419822692871 + }, + { + "auxiliary_loss_clip": 0.01047148, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.0303123, + "balance_loss_mlp": 1.01669383, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 3.1713714329275287, + "language_loss": 0.79259145, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81334633, + "num_input_tokens_seen": 220688915, + "step": 10246, + "time_per_iteration": 2.677946090698242 + }, + { + "auxiliary_loss_clip": 0.01063863, + "auxiliary_loss_mlp": 0.00749611, + "balance_loss_clip": 1.03388357, + "balance_loss_mlp": 1.00027895, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.7835177651072276, + "language_loss": 0.87254524, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89067996, + "num_input_tokens_seen": 220703465, + "step": 10247, + "time_per_iteration": 4.151145696640015 + }, + { + "auxiliary_loss_clip": 0.01026211, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.03060532, + "balance_loss_mlp": 1.02333915, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 1.6835355650002648, + "language_loss": 0.80110598, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82173681, + "num_input_tokens_seen": 220722090, + "step": 10248, + "time_per_iteration": 2.6813178062438965 + }, + { + "auxiliary_loss_clip": 0.01053237, + "auxiliary_loss_mlp": 0.01027569, + "balance_loss_clip": 1.03181434, + "balance_loss_mlp": 1.0161072, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 1.9230245562520563, + "language_loss": 0.86696053, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.88776863, + "num_input_tokens_seen": 220741075, + "step": 10249, + "time_per_iteration": 2.636194944381714 + }, + { + "auxiliary_loss_clip": 0.01100424, + "auxiliary_loss_mlp": 0.01025259, + "balance_loss_clip": 1.03475118, + "balance_loss_mlp": 1.01246202, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 2.1595303851936527, + "language_loss": 0.68412238, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.70537925, + "num_input_tokens_seen": 220763395, + "step": 10250, + "time_per_iteration": 2.6610398292541504 + }, + { + "auxiliary_loss_clip": 0.01068306, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.03103745, + "balance_loss_mlp": 1.01925266, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 87.70671387625869, + "language_loss": 0.7424286, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76341665, + "num_input_tokens_seen": 220780640, + "step": 10251, + "time_per_iteration": 2.5614583492279053 + }, + { + "auxiliary_loss_clip": 0.01081741, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.0312767, + "balance_loss_mlp": 1.01919591, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.607459099615703, + "language_loss": 0.68374968, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70489264, + "num_input_tokens_seen": 220797960, + "step": 10252, + "time_per_iteration": 2.5854971408843994 + }, + { + "auxiliary_loss_clip": 0.00969017, + "auxiliary_loss_mlp": 0.01000567, + "balance_loss_clip": 1.00973225, + "balance_loss_mlp": 0.9993332, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 2.3127385289059905, + "language_loss": 0.57872027, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59841609, + "num_input_tokens_seen": 220856930, + "step": 10253, + "time_per_iteration": 3.3955748081207275 + }, + { + "auxiliary_loss_clip": 0.01076359, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.03066611, + "balance_loss_mlp": 1.01981544, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.5396889848640658, + "language_loss": 0.79453546, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81561887, + "num_input_tokens_seen": 220877595, + "step": 10254, + "time_per_iteration": 2.8098256587982178 + }, + { + "auxiliary_loss_clip": 0.01079036, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.03515732, + "balance_loss_mlp": 1.01830244, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.756276270527899, + "language_loss": 0.80174166, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82283485, + "num_input_tokens_seen": 220896880, + "step": 10255, + "time_per_iteration": 2.617522954940796 + }, + { + "auxiliary_loss_clip": 0.01083402, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.03492332, + "balance_loss_mlp": 1.02153361, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 3.0764039400723937, + "language_loss": 0.65832376, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67950046, + "num_input_tokens_seen": 220916425, + "step": 10256, + "time_per_iteration": 2.607504367828369 + }, + { + "auxiliary_loss_clip": 0.01090523, + "auxiliary_loss_mlp": 0.01027739, + "balance_loss_clip": 1.03632677, + "balance_loss_mlp": 1.01670599, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.672821824949597, + "language_loss": 0.71663755, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73782015, + "num_input_tokens_seen": 220935050, + "step": 10257, + "time_per_iteration": 2.593907356262207 + }, + { + "auxiliary_loss_clip": 0.01079412, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.03349471, + "balance_loss_mlp": 1.01780903, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 1.7840061005687504, + "language_loss": 0.72257608, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74367172, + "num_input_tokens_seen": 220953085, + "step": 10258, + "time_per_iteration": 2.5811104774475098 + }, + { + "auxiliary_loss_clip": 0.01068663, + "auxiliary_loss_mlp": 0.01040807, + "balance_loss_clip": 1.02981472, + "balance_loss_mlp": 1.02716923, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.2095399887392873, + "language_loss": 0.63715506, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.65824974, + "num_input_tokens_seen": 220969050, + "step": 10259, + "time_per_iteration": 2.62597393989563 + }, + { + "auxiliary_loss_clip": 0.01066761, + "auxiliary_loss_mlp": 0.01033423, + "balance_loss_clip": 1.03524303, + "balance_loss_mlp": 1.02094805, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 7.512009940808471, + "language_loss": 0.7135585, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73456037, + "num_input_tokens_seen": 220985825, + "step": 10260, + "time_per_iteration": 2.6133196353912354 + }, + { + "auxiliary_loss_clip": 0.01098338, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.03881025, + "balance_loss_mlp": 1.02016342, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 1.9727509362212976, + "language_loss": 0.68383539, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70516056, + "num_input_tokens_seen": 221004465, + "step": 10261, + "time_per_iteration": 4.353162527084351 + }, + { + "auxiliary_loss_clip": 0.01067009, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.03465307, + "balance_loss_mlp": 1.02594066, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.8219641682325571, + "language_loss": 0.71732885, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.7383709, + "num_input_tokens_seen": 221023260, + "step": 10262, + "time_per_iteration": 2.71399188041687 + }, + { + "auxiliary_loss_clip": 0.0108165, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.03406835, + "balance_loss_mlp": 1.02120686, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.8144158802435373, + "language_loss": 0.69599485, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.71714449, + "num_input_tokens_seen": 221043090, + "step": 10263, + "time_per_iteration": 2.6135432720184326 + }, + { + "auxiliary_loss_clip": 0.01034228, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.03003287, + "balance_loss_mlp": 1.0188632, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 2.530687859182156, + "language_loss": 0.76273763, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78339243, + "num_input_tokens_seen": 221061435, + "step": 10264, + "time_per_iteration": 2.7581703662872314 + }, + { + "auxiliary_loss_clip": 0.0110158, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.03579211, + "balance_loss_mlp": 1.01962769, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 3.025442547753728, + "language_loss": 0.85307324, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87441021, + "num_input_tokens_seen": 221078705, + "step": 10265, + "time_per_iteration": 2.630876064300537 + }, + { + "auxiliary_loss_clip": 0.0104945, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.03591228, + "balance_loss_mlp": 1.02221382, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 1.7493985698372252, + "language_loss": 0.64845276, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66928947, + "num_input_tokens_seen": 221099245, + "step": 10266, + "time_per_iteration": 2.817269802093506 + }, + { + "auxiliary_loss_clip": 0.01058328, + "auxiliary_loss_mlp": 0.01033752, + "balance_loss_clip": 1.02819991, + "balance_loss_mlp": 1.02196848, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.7656041530800572, + "language_loss": 0.75594491, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77686572, + "num_input_tokens_seen": 221116930, + "step": 10267, + "time_per_iteration": 2.7306582927703857 + }, + { + "auxiliary_loss_clip": 0.01068563, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.032004, + "balance_loss_mlp": 1.01651502, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 1.6164228852610998, + "language_loss": 0.74625266, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.76722986, + "num_input_tokens_seen": 221137660, + "step": 10268, + "time_per_iteration": 2.7897791862487793 + }, + { + "auxiliary_loss_clip": 0.01081538, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03440106, + "balance_loss_mlp": 1.01921201, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 2.316259963834094, + "language_loss": 0.75424474, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77537596, + "num_input_tokens_seen": 221156225, + "step": 10269, + "time_per_iteration": 2.771864891052246 + }, + { + "auxiliary_loss_clip": 0.01098594, + "auxiliary_loss_mlp": 0.01027977, + "balance_loss_clip": 1.03263617, + "balance_loss_mlp": 1.01655078, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 3.1919225925571673, + "language_loss": 0.76606894, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78733468, + "num_input_tokens_seen": 221173820, + "step": 10270, + "time_per_iteration": 2.5331010818481445 + }, + { + "auxiliary_loss_clip": 0.01081327, + "auxiliary_loss_mlp": 0.01026441, + "balance_loss_clip": 1.03358555, + "balance_loss_mlp": 1.01429915, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.849651367835422, + "language_loss": 0.82192749, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84300518, + "num_input_tokens_seen": 221191815, + "step": 10271, + "time_per_iteration": 2.6355674266815186 + }, + { + "auxiliary_loss_clip": 0.01102582, + "auxiliary_loss_mlp": 0.00749499, + "balance_loss_clip": 1.03567934, + "balance_loss_mlp": 1.00028777, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 1.612764817267922, + "language_loss": 0.77177477, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.7902956, + "num_input_tokens_seen": 221211205, + "step": 10272, + "time_per_iteration": 2.6384832859039307 + }, + { + "auxiliary_loss_clip": 0.00999191, + "auxiliary_loss_mlp": 0.01001167, + "balance_loss_clip": 1.00883055, + "balance_loss_mlp": 0.99983758, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8041641097044369, + "language_loss": 0.59099376, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61099732, + "num_input_tokens_seen": 221268430, + "step": 10273, + "time_per_iteration": 3.1092841625213623 + }, + { + "auxiliary_loss_clip": 0.01065376, + "auxiliary_loss_mlp": 0.01038052, + "balance_loss_clip": 1.02898169, + "balance_loss_mlp": 1.02375317, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.374729022160292, + "language_loss": 0.72995055, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75098479, + "num_input_tokens_seen": 221281930, + "step": 10274, + "time_per_iteration": 4.270368814468384 + }, + { + "auxiliary_loss_clip": 0.01090097, + "auxiliary_loss_mlp": 0.00749479, + "balance_loss_clip": 1.03369856, + "balance_loss_mlp": 1.00028396, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 2.273696171118847, + "language_loss": 0.7733705, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79176623, + "num_input_tokens_seen": 221301605, + "step": 10275, + "time_per_iteration": 2.56022572517395 + }, + { + "auxiliary_loss_clip": 0.01065374, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.03477955, + "balance_loss_mlp": 1.02086353, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.7676543130822229, + "language_loss": 0.79492533, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81590092, + "num_input_tokens_seen": 221320105, + "step": 10276, + "time_per_iteration": 2.6945412158966064 + }, + { + "auxiliary_loss_clip": 0.01029579, + "auxiliary_loss_mlp": 0.01042396, + "balance_loss_clip": 1.02935839, + "balance_loss_mlp": 1.02908003, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 2.803588145166734, + "language_loss": 0.80506659, + "learning_rate": 1.345707936733612e-06, + "loss": 0.82578635, + "num_input_tokens_seen": 221335915, + "step": 10277, + "time_per_iteration": 4.3720502853393555 + }, + { + "auxiliary_loss_clip": 0.01073544, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.03488874, + "balance_loss_mlp": 1.01783729, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 2.8224873906434906, + "language_loss": 0.81349134, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83453161, + "num_input_tokens_seen": 221353965, + "step": 10278, + "time_per_iteration": 2.6569180488586426 + }, + { + "auxiliary_loss_clip": 0.01055282, + "auxiliary_loss_mlp": 0.00749335, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.00019145, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.4829953944515861, + "language_loss": 0.73852742, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75657356, + "num_input_tokens_seen": 221374080, + "step": 10279, + "time_per_iteration": 2.805777072906494 + }, + { + "auxiliary_loss_clip": 0.01080865, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.03039765, + "balance_loss_mlp": 1.01916122, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.5339489282296654, + "language_loss": 0.70776039, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72887641, + "num_input_tokens_seen": 221392910, + "step": 10280, + "time_per_iteration": 2.593019723892212 + }, + { + "auxiliary_loss_clip": 0.01100929, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.03464901, + "balance_loss_mlp": 1.0211761, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.495830990329868, + "language_loss": 0.72462308, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.74596989, + "num_input_tokens_seen": 221410990, + "step": 10281, + "time_per_iteration": 2.5439486503601074 + }, + { + "auxiliary_loss_clip": 0.01077652, + "auxiliary_loss_mlp": 0.01029807, + "balance_loss_clip": 1.03497195, + "balance_loss_mlp": 1.01944137, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.5058393541470676, + "language_loss": 0.76486146, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.78593612, + "num_input_tokens_seen": 221431020, + "step": 10282, + "time_per_iteration": 2.724841833114624 + }, + { + "auxiliary_loss_clip": 0.01072886, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.03185892, + "balance_loss_mlp": 1.02041793, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.9759205818597076, + "language_loss": 0.69034386, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71143234, + "num_input_tokens_seen": 221453235, + "step": 10283, + "time_per_iteration": 2.633610963821411 + }, + { + "auxiliary_loss_clip": 0.0109654, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.03488541, + "balance_loss_mlp": 1.01607394, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.5620233764648443, + "language_loss": 0.75102425, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77227432, + "num_input_tokens_seen": 221472560, + "step": 10284, + "time_per_iteration": 2.5653488636016846 + }, + { + "auxiliary_loss_clip": 0.0108651, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.0365665, + "balance_loss_mlp": 1.02098811, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.489693823330326, + "language_loss": 0.75850439, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77969933, + "num_input_tokens_seen": 221492835, + "step": 10285, + "time_per_iteration": 2.6003921031951904 + }, + { + "auxiliary_loss_clip": 0.01063352, + "auxiliary_loss_mlp": 0.01033315, + "balance_loss_clip": 1.03091395, + "balance_loss_mlp": 1.0214951, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.6207321549832487, + "language_loss": 0.72442383, + "learning_rate": 1.342396663517503e-06, + "loss": 0.74539053, + "num_input_tokens_seen": 221511870, + "step": 10286, + "time_per_iteration": 2.651465654373169 + }, + { + "auxiliary_loss_clip": 0.01099048, + "auxiliary_loss_mlp": 0.01028048, + "balance_loss_clip": 1.03435445, + "balance_loss_mlp": 1.01674676, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 1.6734894720488238, + "language_loss": 0.75759494, + "learning_rate": 1.342028868767199e-06, + "loss": 0.77886587, + "num_input_tokens_seen": 221529915, + "step": 10287, + "time_per_iteration": 2.5599021911621094 + }, + { + "auxiliary_loss_clip": 0.01058573, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.03184152, + "balance_loss_mlp": 1.02231574, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 2.6673331630092, + "language_loss": 0.73327374, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75420177, + "num_input_tokens_seen": 221549745, + "step": 10288, + "time_per_iteration": 4.196785926818848 + }, + { + "auxiliary_loss_clip": 0.01083139, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.03193808, + "balance_loss_mlp": 1.01887178, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.7207376680602546, + "language_loss": 0.72730339, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.74843222, + "num_input_tokens_seen": 221572455, + "step": 10289, + "time_per_iteration": 2.7618680000305176 + }, + { + "auxiliary_loss_clip": 0.01072014, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.03199887, + "balance_loss_mlp": 1.02172899, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.6112419646002896, + "language_loss": 0.79339296, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81445348, + "num_input_tokens_seen": 221591325, + "step": 10290, + "time_per_iteration": 2.779489278793335 + }, + { + "auxiliary_loss_clip": 0.0109252, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.03584957, + "balance_loss_mlp": 1.02014005, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.6072438625361385, + "language_loss": 0.81718796, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83843768, + "num_input_tokens_seen": 221611640, + "step": 10291, + "time_per_iteration": 2.5864851474761963 + }, + { + "auxiliary_loss_clip": 0.01102082, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.03629625, + "balance_loss_mlp": 1.02244222, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 3.466327992202581, + "language_loss": 0.77368498, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79504561, + "num_input_tokens_seen": 221631225, + "step": 10292, + "time_per_iteration": 2.6481080055236816 + }, + { + "auxiliary_loss_clip": 0.01067398, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.03187084, + "balance_loss_mlp": 1.02355957, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 2.251789971838595, + "language_loss": 0.73760974, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75866461, + "num_input_tokens_seen": 221651035, + "step": 10293, + "time_per_iteration": 2.614180088043213 + }, + { + "auxiliary_loss_clip": 0.01070268, + "auxiliary_loss_mlp": 0.00749436, + "balance_loss_clip": 1.03452039, + "balance_loss_mlp": 1.00026727, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.7039277185586639, + "language_loss": 0.8319993, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.8501963, + "num_input_tokens_seen": 221671300, + "step": 10294, + "time_per_iteration": 2.637416362762451 + }, + { + "auxiliary_loss_clip": 0.0108178, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.03456938, + "balance_loss_mlp": 1.01741552, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.119668393561129, + "language_loss": 0.70617896, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.72728503, + "num_input_tokens_seen": 221687320, + "step": 10295, + "time_per_iteration": 2.5328822135925293 + }, + { + "auxiliary_loss_clip": 0.0110212, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.03707004, + "balance_loss_mlp": 1.0221622, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.838550069981289, + "language_loss": 0.70220339, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72356445, + "num_input_tokens_seen": 221710175, + "step": 10296, + "time_per_iteration": 2.619986057281494 + }, + { + "auxiliary_loss_clip": 0.01064153, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.03385627, + "balance_loss_mlp": 1.02133775, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 2.041040549040933, + "language_loss": 0.71816313, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73914796, + "num_input_tokens_seen": 221728145, + "step": 10297, + "time_per_iteration": 2.6446754932403564 + }, + { + "auxiliary_loss_clip": 0.01025889, + "auxiliary_loss_mlp": 0.00999592, + "balance_loss_clip": 1.00499988, + "balance_loss_mlp": 0.99850142, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8789675529958266, + "language_loss": 0.6417948, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66204965, + "num_input_tokens_seen": 221786100, + "step": 10298, + "time_per_iteration": 3.0199599266052246 + }, + { + "auxiliary_loss_clip": 0.01104495, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.03687692, + "balance_loss_mlp": 1.020895, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 2.033001240516205, + "language_loss": 0.7460472, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76742101, + "num_input_tokens_seen": 221806450, + "step": 10299, + "time_per_iteration": 2.5376033782958984 + }, + { + "auxiliary_loss_clip": 0.01096185, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03693557, + "balance_loss_mlp": 1.01772714, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.657870247906103, + "language_loss": 0.68417507, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70543361, + "num_input_tokens_seen": 221823330, + "step": 10300, + "time_per_iteration": 2.546718120574951 + }, + { + "auxiliary_loss_clip": 0.01090353, + "auxiliary_loss_mlp": 0.00749514, + "balance_loss_clip": 1.03686309, + "balance_loss_mlp": 1.00029612, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.7380074059693444, + "language_loss": 0.66675788, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.68515658, + "num_input_tokens_seen": 221839360, + "step": 10301, + "time_per_iteration": 4.043840408325195 + }, + { + "auxiliary_loss_clip": 0.01061804, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.03254378, + "balance_loss_mlp": 1.02271914, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 1.6477756571506406, + "language_loss": 0.72879529, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.74975538, + "num_input_tokens_seen": 221859465, + "step": 10302, + "time_per_iteration": 2.6923460960388184 + }, + { + "auxiliary_loss_clip": 0.0107702, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.03631091, + "balance_loss_mlp": 1.01631546, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 4.205863691296317, + "language_loss": 0.80979013, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.83084834, + "num_input_tokens_seen": 221878555, + "step": 10303, + "time_per_iteration": 2.5864310264587402 + }, + { + "auxiliary_loss_clip": 0.01103081, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.03507221, + "balance_loss_mlp": 1.01851523, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.7237485563495312, + "language_loss": 0.76975513, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.79110265, + "num_input_tokens_seen": 221898790, + "step": 10304, + "time_per_iteration": 2.4989922046661377 + }, + { + "auxiliary_loss_clip": 0.01072767, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.03621829, + "balance_loss_mlp": 1.02065706, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 2.0053033151691633, + "language_loss": 0.76377904, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.78483933, + "num_input_tokens_seen": 221918875, + "step": 10305, + "time_per_iteration": 2.668713092803955 + }, + { + "auxiliary_loss_clip": 0.01097692, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.03704858, + "balance_loss_mlp": 1.0225358, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.6486453860637467, + "language_loss": 0.78916013, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81049651, + "num_input_tokens_seen": 221937895, + "step": 10306, + "time_per_iteration": 2.539245843887329 + }, + { + "auxiliary_loss_clip": 0.01039317, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.03203452, + "balance_loss_mlp": 1.01661205, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.6209111259801203, + "language_loss": 0.8028897, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82355762, + "num_input_tokens_seen": 221955920, + "step": 10307, + "time_per_iteration": 2.6863415241241455 + }, + { + "auxiliary_loss_clip": 0.00988635, + "auxiliary_loss_mlp": 0.0099931, + "balance_loss_clip": 1.00673449, + "balance_loss_mlp": 0.99811167, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8074528420532092, + "language_loss": 0.59415007, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61402947, + "num_input_tokens_seen": 222011405, + "step": 10308, + "time_per_iteration": 3.18013596534729 + }, + { + "auxiliary_loss_clip": 0.01074799, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.03428149, + "balance_loss_mlp": 1.01787972, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.6477720591993656, + "language_loss": 0.6790539, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70008302, + "num_input_tokens_seen": 222034545, + "step": 10309, + "time_per_iteration": 2.6816463470458984 + }, + { + "auxiliary_loss_clip": 0.01069587, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.0319221, + "balance_loss_mlp": 1.02323723, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 4.380286349276582, + "language_loss": 0.72190273, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74295568, + "num_input_tokens_seen": 222052690, + "step": 10310, + "time_per_iteration": 2.574796676635742 + }, + { + "auxiliary_loss_clip": 0.01086308, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.03870857, + "balance_loss_mlp": 1.01992583, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 3.4570533664359466, + "language_loss": 0.78691161, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.80810541, + "num_input_tokens_seen": 222069095, + "step": 10311, + "time_per_iteration": 2.5915277004241943 + }, + { + "auxiliary_loss_clip": 0.01069141, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.03407931, + "balance_loss_mlp": 1.02122021, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.8372223110812473, + "language_loss": 0.72655785, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74758542, + "num_input_tokens_seen": 222087360, + "step": 10312, + "time_per_iteration": 2.643068552017212 + }, + { + "auxiliary_loss_clip": 0.01059211, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.03775871, + "balance_loss_mlp": 1.02084851, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 1.8332419310766166, + "language_loss": 0.7187627, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.73968834, + "num_input_tokens_seen": 222106130, + "step": 10313, + "time_per_iteration": 2.622083902359009 + }, + { + "auxiliary_loss_clip": 0.01094969, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.03604388, + "balance_loss_mlp": 1.01742399, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.835680850059523, + "language_loss": 0.78269792, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80394703, + "num_input_tokens_seen": 222123125, + "step": 10314, + "time_per_iteration": 4.110570669174194 + }, + { + "auxiliary_loss_clip": 0.01089104, + "auxiliary_loss_mlp": 0.01035266, + "balance_loss_clip": 1.03266907, + "balance_loss_mlp": 1.02333355, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.9762312564597195, + "language_loss": 0.78699231, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80823594, + "num_input_tokens_seen": 222140655, + "step": 10315, + "time_per_iteration": 2.5279641151428223 + }, + { + "auxiliary_loss_clip": 0.01068387, + "auxiliary_loss_mlp": 0.01034875, + "balance_loss_clip": 1.03694737, + "balance_loss_mlp": 1.02300227, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 1.8672485785093007, + "language_loss": 0.7605927, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78162527, + "num_input_tokens_seen": 222160450, + "step": 10316, + "time_per_iteration": 2.67307448387146 + }, + { + "auxiliary_loss_clip": 0.01102293, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.03349471, + "balance_loss_mlp": 1.01723671, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 2.237449441512192, + "language_loss": 0.77150548, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79282176, + "num_input_tokens_seen": 222179170, + "step": 10317, + "time_per_iteration": 4.095703125 + }, + { + "auxiliary_loss_clip": 0.01006733, + "auxiliary_loss_mlp": 0.01001167, + "balance_loss_clip": 1.00610614, + "balance_loss_mlp": 0.9998495, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6858574115526535, + "language_loss": 0.59151387, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61159289, + "num_input_tokens_seen": 222242660, + "step": 10318, + "time_per_iteration": 3.2474758625030518 + }, + { + "auxiliary_loss_clip": 0.0107263, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.03622079, + "balance_loss_mlp": 1.02386796, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.7506832430545791, + "language_loss": 0.77565551, + "learning_rate": 1.330272686582143e-06, + "loss": 0.79674745, + "num_input_tokens_seen": 222262170, + "step": 10319, + "time_per_iteration": 2.6333045959472656 + }, + { + "auxiliary_loss_clip": 0.01083209, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.03674245, + "balance_loss_mlp": 1.02165377, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 2.304818327441658, + "language_loss": 0.66284031, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68400204, + "num_input_tokens_seen": 222280375, + "step": 10320, + "time_per_iteration": 2.6580681800842285 + }, + { + "auxiliary_loss_clip": 0.01056976, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.03080153, + "balance_loss_mlp": 1.01959348, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.7953996477276044, + "language_loss": 0.76214767, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78302717, + "num_input_tokens_seen": 222297325, + "step": 10321, + "time_per_iteration": 2.6679444313049316 + }, + { + "auxiliary_loss_clip": 0.01074596, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.03309393, + "balance_loss_mlp": 1.01557839, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.6539044605842819, + "language_loss": 0.73272645, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75374144, + "num_input_tokens_seen": 222317095, + "step": 10322, + "time_per_iteration": 2.654000997543335 + }, + { + "auxiliary_loss_clip": 0.01057095, + "auxiliary_loss_mlp": 0.0102645, + "balance_loss_clip": 1.03078914, + "balance_loss_mlp": 1.01472008, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 2.008599026594678, + "language_loss": 0.72735369, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.74818915, + "num_input_tokens_seen": 222337055, + "step": 10323, + "time_per_iteration": 2.6944477558135986 + }, + { + "auxiliary_loss_clip": 0.01098647, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.03680873, + "balance_loss_mlp": 1.01774704, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.6301793123413857, + "language_loss": 0.58670509, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.6079945, + "num_input_tokens_seen": 222354515, + "step": 10324, + "time_per_iteration": 2.6061954498291016 + }, + { + "auxiliary_loss_clip": 0.01062249, + "auxiliary_loss_mlp": 0.01038886, + "balance_loss_clip": 1.03583455, + "balance_loss_mlp": 1.02475405, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 1.9832170524541295, + "language_loss": 0.76427126, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78528255, + "num_input_tokens_seen": 222372755, + "step": 10325, + "time_per_iteration": 2.6156067848205566 + }, + { + "auxiliary_loss_clip": 0.0109378, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.03602779, + "balance_loss_mlp": 1.01627207, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 2.6654354725370624, + "language_loss": 0.7262736, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74749565, + "num_input_tokens_seen": 222391380, + "step": 10326, + "time_per_iteration": 2.5044639110565186 + }, + { + "auxiliary_loss_clip": 0.01094425, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.03652048, + "balance_loss_mlp": 1.02114391, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.1774721967701947, + "language_loss": 0.73824465, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.7595197, + "num_input_tokens_seen": 222411165, + "step": 10327, + "time_per_iteration": 3.936584949493408 + }, + { + "auxiliary_loss_clip": 0.01080584, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.03564775, + "balance_loss_mlp": 1.02411807, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 4.696170089429394, + "language_loss": 0.79748476, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81865859, + "num_input_tokens_seen": 222428110, + "step": 10328, + "time_per_iteration": 2.451385259628296 + }, + { + "auxiliary_loss_clip": 0.01067687, + "auxiliary_loss_mlp": 0.0103936, + "balance_loss_clip": 1.03144705, + "balance_loss_mlp": 1.02709949, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.6696102503223735, + "language_loss": 0.77673453, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.79780495, + "num_input_tokens_seen": 222446385, + "step": 10329, + "time_per_iteration": 2.6005539894104004 + }, + { + "auxiliary_loss_clip": 0.0101935, + "auxiliary_loss_mlp": 0.01009756, + "balance_loss_clip": 1.00858879, + "balance_loss_mlp": 1.00880277, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.824102105405396, + "language_loss": 0.62116754, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64145863, + "num_input_tokens_seen": 222502150, + "step": 10330, + "time_per_iteration": 3.0653295516967773 + }, + { + "auxiliary_loss_clip": 0.01095426, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.03550458, + "balance_loss_mlp": 1.02488613, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 2.8780021573037415, + "language_loss": 0.77333724, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79467702, + "num_input_tokens_seen": 222519880, + "step": 10331, + "time_per_iteration": 2.541382074356079 + }, + { + "auxiliary_loss_clip": 0.0110593, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.03661156, + "balance_loss_mlp": 1.02269292, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 2.0037977355586536, + "language_loss": 0.67917407, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.70058548, + "num_input_tokens_seen": 222538545, + "step": 10332, + "time_per_iteration": 2.4498629570007324 + }, + { + "auxiliary_loss_clip": 0.01069759, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.03358376, + "balance_loss_mlp": 1.01864409, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.544593812207953, + "language_loss": 0.76444608, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78545243, + "num_input_tokens_seen": 222556935, + "step": 10333, + "time_per_iteration": 2.5512771606445312 + }, + { + "auxiliary_loss_clip": 0.01078384, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.03593326, + "balance_loss_mlp": 1.02104139, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.2758556664875864, + "language_loss": 0.69138932, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71250176, + "num_input_tokens_seen": 222574035, + "step": 10334, + "time_per_iteration": 2.4956581592559814 + }, + { + "auxiliary_loss_clip": 0.01080365, + "auxiliary_loss_mlp": 0.00749314, + "balance_loss_clip": 1.03652287, + "balance_loss_mlp": 1.00030601, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 1.7174363431856305, + "language_loss": 0.70324498, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72154176, + "num_input_tokens_seen": 222592290, + "step": 10335, + "time_per_iteration": 2.528226137161255 + }, + { + "auxiliary_loss_clip": 0.01043562, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02774501, + "balance_loss_mlp": 1.02427387, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.607315479209522, + "language_loss": 0.79972839, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82052851, + "num_input_tokens_seen": 222612805, + "step": 10336, + "time_per_iteration": 2.6508853435516357 + }, + { + "auxiliary_loss_clip": 0.01099438, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.03477478, + "balance_loss_mlp": 1.01796865, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.915215108704441, + "language_loss": 0.73543346, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75671911, + "num_input_tokens_seen": 222632260, + "step": 10337, + "time_per_iteration": 2.4796783924102783 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.03626513, + "balance_loss_mlp": 1.01821923, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 2.8865558533414517, + "language_loss": 0.63164008, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65300381, + "num_input_tokens_seen": 222653570, + "step": 10338, + "time_per_iteration": 2.5167226791381836 + }, + { + "auxiliary_loss_clip": 0.01091072, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.03590012, + "balance_loss_mlp": 1.02042615, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 1.5623868248672987, + "language_loss": 0.7121048, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73334175, + "num_input_tokens_seen": 222672480, + "step": 10339, + "time_per_iteration": 2.4858150482177734 + }, + { + "auxiliary_loss_clip": 0.01049453, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.03362072, + "balance_loss_mlp": 1.02257788, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.6067507835036372, + "language_loss": 0.69447333, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.7153219, + "num_input_tokens_seen": 222691200, + "step": 10340, + "time_per_iteration": 2.5828428268432617 + }, + { + "auxiliary_loss_clip": 0.0105394, + "auxiliary_loss_mlp": 0.01027666, + "balance_loss_clip": 1.03014386, + "balance_loss_mlp": 1.01580524, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 1.7478409121452076, + "language_loss": 0.69063365, + "learning_rate": 1.322205369037788e-06, + "loss": 0.71144974, + "num_input_tokens_seen": 222709975, + "step": 10341, + "time_per_iteration": 4.191477537155151 + }, + { + "auxiliary_loss_clip": 0.01092667, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.03684723, + "balance_loss_mlp": 1.0166049, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 1.9220843404004722, + "language_loss": 0.81201059, + "learning_rate": 1.321838967240299e-06, + "loss": 0.8332305, + "num_input_tokens_seen": 222729005, + "step": 10342, + "time_per_iteration": 2.4840896129608154 + }, + { + "auxiliary_loss_clip": 0.01008582, + "auxiliary_loss_mlp": 0.00999695, + "balance_loss_clip": 1.00720978, + "balance_loss_mlp": 0.99849701, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7811558132559989, + "language_loss": 0.57266545, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59274817, + "num_input_tokens_seen": 222786090, + "step": 10343, + "time_per_iteration": 3.0198731422424316 + }, + { + "auxiliary_loss_clip": 0.01055785, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.03041565, + "balance_loss_mlp": 1.01707578, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 2.025161943064434, + "language_loss": 0.72735286, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.74818897, + "num_input_tokens_seen": 222806100, + "step": 10344, + "time_per_iteration": 2.583176374435425 + }, + { + "auxiliary_loss_clip": 0.01091661, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.03590035, + "balance_loss_mlp": 1.02787375, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.6824368267752716, + "language_loss": 0.60332412, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62462735, + "num_input_tokens_seen": 222826575, + "step": 10345, + "time_per_iteration": 2.516293525695801 + }, + { + "auxiliary_loss_clip": 0.01033376, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.02811813, + "balance_loss_mlp": 1.02368891, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 2.321859645390501, + "language_loss": 0.77726328, + "learning_rate": 1.320373617348614e-06, + "loss": 0.79796284, + "num_input_tokens_seen": 222845285, + "step": 10346, + "time_per_iteration": 2.6351044178009033 + }, + { + "auxiliary_loss_clip": 0.01071077, + "auxiliary_loss_mlp": 0.01033975, + "balance_loss_clip": 1.03516865, + "balance_loss_mlp": 1.02124929, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.615184134293076, + "language_loss": 0.71684408, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73789459, + "num_input_tokens_seen": 222864575, + "step": 10347, + "time_per_iteration": 2.6207189559936523 + }, + { + "auxiliary_loss_clip": 0.01086706, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.03246808, + "balance_loss_mlp": 1.01617241, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.890907226721798, + "language_loss": 0.71750438, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.73864937, + "num_input_tokens_seen": 222884420, + "step": 10348, + "time_per_iteration": 2.5267555713653564 + }, + { + "auxiliary_loss_clip": 0.0100188, + "auxiliary_loss_mlp": 0.01002823, + "balance_loss_clip": 1.01325274, + "balance_loss_mlp": 1.00173795, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8087866926409464, + "language_loss": 0.54150629, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56155336, + "num_input_tokens_seen": 222944690, + "step": 10349, + "time_per_iteration": 3.1234583854675293 + }, + { + "auxiliary_loss_clip": 0.01060913, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.03155863, + "balance_loss_mlp": 1.01522613, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 1.8159612476338667, + "language_loss": 0.70007879, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.72096145, + "num_input_tokens_seen": 222962990, + "step": 10350, + "time_per_iteration": 2.588977813720703 + }, + { + "auxiliary_loss_clip": 0.01102782, + "auxiliary_loss_mlp": 0.01029123, + "balance_loss_clip": 1.0355804, + "balance_loss_mlp": 1.01747632, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 2.006660210221208, + "language_loss": 0.56909341, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59041238, + "num_input_tokens_seen": 222980715, + "step": 10351, + "time_per_iteration": 2.492259979248047 + }, + { + "auxiliary_loss_clip": 0.01019011, + "auxiliary_loss_mlp": 0.01002682, + "balance_loss_clip": 1.01031756, + "balance_loss_mlp": 1.00140095, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.806733058993999, + "language_loss": 0.6116873, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63190424, + "num_input_tokens_seen": 223040685, + "step": 10352, + "time_per_iteration": 3.0520665645599365 + }, + { + "auxiliary_loss_clip": 0.01097742, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.03359771, + "balance_loss_mlp": 1.02022862, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.0669551338942598, + "language_loss": 0.82181156, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84310734, + "num_input_tokens_seen": 223059000, + "step": 10353, + "time_per_iteration": 2.539541482925415 + }, + { + "auxiliary_loss_clip": 0.0108512, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.03324831, + "balance_loss_mlp": 1.01863003, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.6785402890748942, + "language_loss": 0.75654131, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77768439, + "num_input_tokens_seen": 223079345, + "step": 10354, + "time_per_iteration": 2.5699260234832764 + }, + { + "auxiliary_loss_clip": 0.01054669, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.03238416, + "balance_loss_mlp": 1.01765108, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 2.0692066175916537, + "language_loss": 0.78645158, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80729151, + "num_input_tokens_seen": 223097880, + "step": 10355, + "time_per_iteration": 4.0779128074646 + }, + { + "auxiliary_loss_clip": 0.01091225, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.03652334, + "balance_loss_mlp": 1.01962471, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.6498403948952316, + "language_loss": 0.78197002, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.8031944, + "num_input_tokens_seen": 223118185, + "step": 10356, + "time_per_iteration": 2.567049026489258 + }, + { + "auxiliary_loss_clip": 0.01083315, + "auxiliary_loss_mlp": 0.00749664, + "balance_loss_clip": 1.03468084, + "balance_loss_mlp": 1.00034666, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 2.1682121343776775, + "language_loss": 0.67777586, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69610566, + "num_input_tokens_seen": 223137600, + "step": 10357, + "time_per_iteration": 4.06717586517334 + }, + { + "auxiliary_loss_clip": 0.01079586, + "auxiliary_loss_mlp": 0.01028327, + "balance_loss_clip": 1.03332233, + "balance_loss_mlp": 1.01490438, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 3.0032932915455586, + "language_loss": 0.75912941, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78020859, + "num_input_tokens_seen": 223154360, + "step": 10358, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.01077365, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.03272152, + "balance_loss_mlp": 1.01938677, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.416892387688773, + "language_loss": 0.81642163, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.83750379, + "num_input_tokens_seen": 223172255, + "step": 10359, + "time_per_iteration": 2.566148281097412 + }, + { + "auxiliary_loss_clip": 0.01069355, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_clip": 1.03091073, + "balance_loss_mlp": 1.0315249, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 2.186077520632934, + "language_loss": 0.73508435, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75623012, + "num_input_tokens_seen": 223186965, + "step": 10360, + "time_per_iteration": 2.487501859664917 + }, + { + "auxiliary_loss_clip": 0.01087361, + "auxiliary_loss_mlp": 0.01038285, + "balance_loss_clip": 1.0318644, + "balance_loss_mlp": 1.02637029, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 2.5417639004987786, + "language_loss": 0.78088629, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.8021428, + "num_input_tokens_seen": 223206045, + "step": 10361, + "time_per_iteration": 2.468942880630493 + }, + { + "auxiliary_loss_clip": 0.0105882, + "auxiliary_loss_mlp": 0.01029732, + "balance_loss_clip": 1.03324437, + "balance_loss_mlp": 1.0180316, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 1.5979952379629974, + "language_loss": 0.67439586, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69528139, + "num_input_tokens_seen": 223224820, + "step": 10362, + "time_per_iteration": 2.5407724380493164 + }, + { + "auxiliary_loss_clip": 0.01079796, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.0323689, + "balance_loss_mlp": 1.01940084, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 2.5619362531873273, + "language_loss": 0.67255938, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.6936785, + "num_input_tokens_seen": 223243205, + "step": 10363, + "time_per_iteration": 2.5884218215942383 + }, + { + "auxiliary_loss_clip": 0.01051943, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.03130746, + "balance_loss_mlp": 1.02268279, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.7637617588232108, + "language_loss": 0.85974598, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88062131, + "num_input_tokens_seen": 223261370, + "step": 10364, + "time_per_iteration": 2.574248790740967 + }, + { + "auxiliary_loss_clip": 0.0100723, + "auxiliary_loss_mlp": 0.01007521, + "balance_loss_clip": 1.00655127, + "balance_loss_mlp": 1.00629282, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.8806592316249903, + "language_loss": 0.60783517, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62798268, + "num_input_tokens_seen": 223315050, + "step": 10365, + "time_per_iteration": 3.1232686042785645 + }, + { + "auxiliary_loss_clip": 0.01069103, + "auxiliary_loss_mlp": 0.00749633, + "balance_loss_clip": 1.03569579, + "balance_loss_mlp": 1.00026596, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 2.1023641211832484, + "language_loss": 0.75589836, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.7740857, + "num_input_tokens_seen": 223332130, + "step": 10366, + "time_per_iteration": 2.6113786697387695 + }, + { + "auxiliary_loss_clip": 0.01092372, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.03509653, + "balance_loss_mlp": 1.02407432, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 1.8263291190459379, + "language_loss": 0.76296496, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78425008, + "num_input_tokens_seen": 223351605, + "step": 10367, + "time_per_iteration": 4.040396690368652 + }, + { + "auxiliary_loss_clip": 0.01088712, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.0348742, + "balance_loss_mlp": 1.02334976, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.514747954251422, + "language_loss": 0.7866267, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80786395, + "num_input_tokens_seen": 223372090, + "step": 10368, + "time_per_iteration": 2.5248701572418213 + }, + { + "auxiliary_loss_clip": 0.0103166, + "auxiliary_loss_mlp": 0.01034748, + "balance_loss_clip": 1.03119874, + "balance_loss_mlp": 1.02181983, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.9383625827538449, + "language_loss": 0.68581295, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70647705, + "num_input_tokens_seen": 223390110, + "step": 10369, + "time_per_iteration": 2.6589419841766357 + }, + { + "auxiliary_loss_clip": 0.0110297, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.03608108, + "balance_loss_mlp": 1.02209151, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.1164766361759058, + "language_loss": 0.87875259, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.9001236, + "num_input_tokens_seen": 223404205, + "step": 10370, + "time_per_iteration": 2.4474079608917236 + }, + { + "auxiliary_loss_clip": 0.0109902, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.03398061, + "balance_loss_mlp": 1.0169456, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.5869240370051154, + "language_loss": 0.66209674, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68337476, + "num_input_tokens_seen": 223424855, + "step": 10371, + "time_per_iteration": 2.522376775741577 + }, + { + "auxiliary_loss_clip": 0.01084275, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.03261232, + "balance_loss_mlp": 1.02149296, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.3770172275911865, + "language_loss": 0.77475417, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79590964, + "num_input_tokens_seen": 223447225, + "step": 10372, + "time_per_iteration": 2.577564239501953 + }, + { + "auxiliary_loss_clip": 0.010892, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.03197122, + "balance_loss_mlp": 1.02042055, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.5983917644191856, + "language_loss": 0.77378148, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79500568, + "num_input_tokens_seen": 223467520, + "step": 10373, + "time_per_iteration": 2.53576397895813 + }, + { + "auxiliary_loss_clip": 0.01087214, + "auxiliary_loss_mlp": 0.01028283, + "balance_loss_clip": 1.03426826, + "balance_loss_mlp": 1.01692808, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.672449378467282, + "language_loss": 0.69396853, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71512353, + "num_input_tokens_seen": 223488130, + "step": 10374, + "time_per_iteration": 2.5096402168273926 + }, + { + "auxiliary_loss_clip": 0.01085562, + "auxiliary_loss_mlp": 0.01028135, + "balance_loss_clip": 1.03676009, + "balance_loss_mlp": 1.01660752, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.8086331851377704, + "language_loss": 0.77232409, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79346108, + "num_input_tokens_seen": 223505105, + "step": 10375, + "time_per_iteration": 2.535750150680542 + }, + { + "auxiliary_loss_clip": 0.01075399, + "auxiliary_loss_mlp": 0.01028299, + "balance_loss_clip": 1.03447986, + "balance_loss_mlp": 1.01667595, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.354771433710405, + "language_loss": 0.70082808, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72186512, + "num_input_tokens_seen": 223528065, + "step": 10376, + "time_per_iteration": 2.6641786098480225 + }, + { + "auxiliary_loss_clip": 0.01071125, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.034096, + "balance_loss_mlp": 1.01822829, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 1.6843368275833928, + "language_loss": 0.76784086, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78886282, + "num_input_tokens_seen": 223547305, + "step": 10377, + "time_per_iteration": 2.715162992477417 + }, + { + "auxiliary_loss_clip": 0.01081643, + "auxiliary_loss_mlp": 0.01028066, + "balance_loss_clip": 1.03568101, + "balance_loss_mlp": 1.01719415, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 2.604689217317055, + "language_loss": 0.68078756, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70188475, + "num_input_tokens_seen": 223567205, + "step": 10378, + "time_per_iteration": 2.5488245487213135 + }, + { + "auxiliary_loss_clip": 0.010782, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.03459466, + "balance_loss_mlp": 1.01994896, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.8951476788001183, + "language_loss": 0.76340449, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78450334, + "num_input_tokens_seen": 223586560, + "step": 10379, + "time_per_iteration": 2.575134515762329 + }, + { + "auxiliary_loss_clip": 0.01075918, + "auxiliary_loss_mlp": 0.01029619, + "balance_loss_clip": 1.03334665, + "balance_loss_mlp": 1.01789474, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.3314623676748152, + "language_loss": 0.79259825, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81365359, + "num_input_tokens_seen": 223610595, + "step": 10380, + "time_per_iteration": 2.6394996643066406 + }, + { + "auxiliary_loss_clip": 0.0108591, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.03565001, + "balance_loss_mlp": 1.02076519, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.6107141084214838, + "language_loss": 0.79921758, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82039118, + "num_input_tokens_seen": 223630230, + "step": 10381, + "time_per_iteration": 4.00520133972168 + }, + { + "auxiliary_loss_clip": 0.01072306, + "auxiliary_loss_mlp": 0.01037984, + "balance_loss_clip": 1.0317657, + "balance_loss_mlp": 1.0244894, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.880229187634507, + "language_loss": 0.74105561, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76215851, + "num_input_tokens_seen": 223648360, + "step": 10382, + "time_per_iteration": 2.5277130603790283 + }, + { + "auxiliary_loss_clip": 0.01086931, + "auxiliary_loss_mlp": 0.01026166, + "balance_loss_clip": 1.03305721, + "balance_loss_mlp": 1.01463258, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.42264730364623, + "language_loss": 0.78472728, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80585825, + "num_input_tokens_seen": 223671255, + "step": 10383, + "time_per_iteration": 2.5620124340057373 + }, + { + "auxiliary_loss_clip": 0.01062406, + "auxiliary_loss_mlp": 0.01026752, + "balance_loss_clip": 1.03026056, + "balance_loss_mlp": 1.01560593, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 1.8215760631657163, + "language_loss": 0.75353664, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77442825, + "num_input_tokens_seen": 223689860, + "step": 10384, + "time_per_iteration": 2.5928189754486084 + }, + { + "auxiliary_loss_clip": 0.01074551, + "auxiliary_loss_mlp": 0.01039449, + "balance_loss_clip": 1.03225231, + "balance_loss_mlp": 1.02596045, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 4.82032691426885, + "language_loss": 0.66221511, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68335509, + "num_input_tokens_seen": 223707835, + "step": 10385, + "time_per_iteration": 2.5408878326416016 + }, + { + "auxiliary_loss_clip": 0.01011323, + "auxiliary_loss_mlp": 0.00998943, + "balance_loss_clip": 1.0094769, + "balance_loss_mlp": 0.99780446, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7633867826381968, + "language_loss": 0.61978745, + "learning_rate": 1.305742943921692e-06, + "loss": 0.63989007, + "num_input_tokens_seen": 223771875, + "step": 10386, + "time_per_iteration": 3.141087055206299 + }, + { + "auxiliary_loss_clip": 0.01091576, + "auxiliary_loss_mlp": 0.01034199, + "balance_loss_clip": 1.03426099, + "balance_loss_mlp": 1.02211154, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.7619923784223657, + "language_loss": 0.71801692, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.73927468, + "num_input_tokens_seen": 223788895, + "step": 10387, + "time_per_iteration": 2.52005934715271 + }, + { + "auxiliary_loss_clip": 0.0109757, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.03612781, + "balance_loss_mlp": 1.02271581, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.4519891545846644, + "language_loss": 0.65799415, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67933011, + "num_input_tokens_seen": 223810385, + "step": 10388, + "time_per_iteration": 2.5899314880371094 + }, + { + "auxiliary_loss_clip": 0.01064158, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.03530264, + "balance_loss_mlp": 1.01534605, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.6617064710299285, + "language_loss": 0.79061121, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81151175, + "num_input_tokens_seen": 223826040, + "step": 10389, + "time_per_iteration": 2.5527961254119873 + }, + { + "auxiliary_loss_clip": 0.01072454, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.03086519, + "balance_loss_mlp": 1.02406406, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.7067716658110357, + "language_loss": 0.60806835, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62915373, + "num_input_tokens_seen": 223842300, + "step": 10390, + "time_per_iteration": 2.516050100326538 + }, + { + "auxiliary_loss_clip": 0.01080485, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.03365684, + "balance_loss_mlp": 1.02040863, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.8478848099519807, + "language_loss": 0.76978707, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79091728, + "num_input_tokens_seen": 223858320, + "step": 10391, + "time_per_iteration": 2.571960687637329 + }, + { + "auxiliary_loss_clip": 0.01081987, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.03549314, + "balance_loss_mlp": 1.01751757, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.4226102157591638, + "language_loss": 0.64484859, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66596437, + "num_input_tokens_seen": 223883545, + "step": 10392, + "time_per_iteration": 2.7345457077026367 + }, + { + "auxiliary_loss_clip": 0.01082799, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.03602064, + "balance_loss_mlp": 1.01829147, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.7723257916295, + "language_loss": 0.76931161, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.79044175, + "num_input_tokens_seen": 223901445, + "step": 10393, + "time_per_iteration": 2.572258710861206 + }, + { + "auxiliary_loss_clip": 0.01060252, + "auxiliary_loss_mlp": 0.00749497, + "balance_loss_clip": 1.03236866, + "balance_loss_mlp": 1.00017691, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.7621663268025836, + "language_loss": 0.82597083, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84406829, + "num_input_tokens_seen": 223920170, + "step": 10394, + "time_per_iteration": 4.051709890365601 + }, + { + "auxiliary_loss_clip": 0.01082803, + "auxiliary_loss_mlp": 0.01039088, + "balance_loss_clip": 1.03473043, + "balance_loss_mlp": 1.02682114, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 34.02056993400736, + "language_loss": 0.75222027, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77343917, + "num_input_tokens_seen": 223936495, + "step": 10395, + "time_per_iteration": 2.552734136581421 + }, + { + "auxiliary_loss_clip": 0.01079444, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.03129971, + "balance_loss_mlp": 1.01859677, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.9019345053306447, + "language_loss": 0.72718441, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74828559, + "num_input_tokens_seen": 223950070, + "step": 10396, + "time_per_iteration": 2.5136542320251465 + }, + { + "auxiliary_loss_clip": 0.01062385, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.03537571, + "balance_loss_mlp": 1.02207232, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.6386459664072388, + "language_loss": 0.76046479, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.78142583, + "num_input_tokens_seen": 223970065, + "step": 10397, + "time_per_iteration": 4.057545900344849 + }, + { + "auxiliary_loss_clip": 0.01078854, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.03523779, + "balance_loss_mlp": 1.02161074, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.4510283300404914, + "language_loss": 0.74907005, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77019262, + "num_input_tokens_seen": 223990315, + "step": 10398, + "time_per_iteration": 2.597501754760742 + }, + { + "auxiliary_loss_clip": 0.01104311, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.03469598, + "balance_loss_mlp": 1.01891971, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 2.1272513000152116, + "language_loss": 0.74061412, + "learning_rate": 1.300997001489483e-06, + "loss": 0.76197237, + "num_input_tokens_seen": 224009960, + "step": 10399, + "time_per_iteration": 2.5103330612182617 + }, + { + "auxiliary_loss_clip": 0.01064776, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.03328824, + "balance_loss_mlp": 1.02368808, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.5361810242594356, + "language_loss": 0.74450988, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76551425, + "num_input_tokens_seen": 224028870, + "step": 10400, + "time_per_iteration": 2.568101644515991 + }, + { + "auxiliary_loss_clip": 0.01001981, + "auxiliary_loss_mlp": 0.01000654, + "balance_loss_clip": 1.00919294, + "balance_loss_mlp": 0.99955171, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8457362393085883, + "language_loss": 0.56497741, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58500373, + "num_input_tokens_seen": 224094140, + "step": 10401, + "time_per_iteration": 3.2339699268341064 + }, + { + "auxiliary_loss_clip": 0.01092586, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.03416276, + "balance_loss_mlp": 1.01924622, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.4171204207847077, + "language_loss": 0.83099097, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85222995, + "num_input_tokens_seen": 224113235, + "step": 10402, + "time_per_iteration": 2.496110200881958 + }, + { + "auxiliary_loss_clip": 0.0102578, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.03296983, + "balance_loss_mlp": 1.02534413, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 1.9691551761200874, + "language_loss": 0.69390351, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71452898, + "num_input_tokens_seen": 224134530, + "step": 10403, + "time_per_iteration": 2.8920438289642334 + }, + { + "auxiliary_loss_clip": 0.01063554, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.03052998, + "balance_loss_mlp": 1.01768303, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.7455618075737849, + "language_loss": 0.7188074, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.73975384, + "num_input_tokens_seen": 224154170, + "step": 10404, + "time_per_iteration": 2.9179437160491943 + }, + { + "auxiliary_loss_clip": 0.01055041, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.0310266, + "balance_loss_mlp": 1.02295732, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 3.357201365122274, + "language_loss": 0.69302607, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71392632, + "num_input_tokens_seen": 224172730, + "step": 10405, + "time_per_iteration": 2.719054937362671 + }, + { + "auxiliary_loss_clip": 0.01071053, + "auxiliary_loss_mlp": 0.01035989, + "balance_loss_clip": 1.03298664, + "balance_loss_mlp": 1.02304268, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.5694074272837997, + "language_loss": 0.79179859, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81286901, + "num_input_tokens_seen": 224192620, + "step": 10406, + "time_per_iteration": 2.55629825592041 + }, + { + "auxiliary_loss_clip": 0.01061602, + "auxiliary_loss_mlp": 0.01035905, + "balance_loss_clip": 1.03298688, + "balance_loss_mlp": 1.0244844, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 1.812220514490299, + "language_loss": 0.68727964, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.70825469, + "num_input_tokens_seen": 224214660, + "step": 10407, + "time_per_iteration": 4.164888858795166 + }, + { + "auxiliary_loss_clip": 0.01087328, + "auxiliary_loss_mlp": 0.00749208, + "balance_loss_clip": 1.03390086, + "balance_loss_mlp": 1.00022554, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.6775108328527453, + "language_loss": 0.85325181, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.8716172, + "num_input_tokens_seen": 224234170, + "step": 10408, + "time_per_iteration": 2.5632741451263428 + }, + { + "auxiliary_loss_clip": 0.01075935, + "auxiliary_loss_mlp": 0.00749233, + "balance_loss_clip": 1.03135848, + "balance_loss_mlp": 1.0002172, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.7095213048348952, + "language_loss": 0.79908049, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81733215, + "num_input_tokens_seen": 224253115, + "step": 10409, + "time_per_iteration": 2.5779380798339844 + }, + { + "auxiliary_loss_clip": 0.01077693, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.03333211, + "balance_loss_mlp": 1.02218652, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.1637887076311304, + "language_loss": 0.69435048, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.7154637, + "num_input_tokens_seen": 224271375, + "step": 10410, + "time_per_iteration": 2.5953104496002197 + }, + { + "auxiliary_loss_clip": 0.01049096, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.03069568, + "balance_loss_mlp": 1.01630616, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.839491389379602, + "language_loss": 0.67793787, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69870317, + "num_input_tokens_seen": 224290315, + "step": 10411, + "time_per_iteration": 2.68302321434021 + }, + { + "auxiliary_loss_clip": 0.01053606, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.03118944, + "balance_loss_mlp": 1.02398634, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 2.2170945910218935, + "language_loss": 0.69403839, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71493196, + "num_input_tokens_seen": 224310545, + "step": 10412, + "time_per_iteration": 2.685258626937866 + }, + { + "auxiliary_loss_clip": 0.01067113, + "auxiliary_loss_mlp": 0.01045345, + "balance_loss_clip": 1.0316397, + "balance_loss_mlp": 1.03267336, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.4460127761641732, + "language_loss": 0.69589543, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71702003, + "num_input_tokens_seen": 224331115, + "step": 10413, + "time_per_iteration": 2.593233823776245 + }, + { + "auxiliary_loss_clip": 0.0106959, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.03307605, + "balance_loss_mlp": 1.01685941, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 4.027347177200793, + "language_loss": 0.80600703, + "learning_rate": 1.295526482316796e-06, + "loss": 0.827003, + "num_input_tokens_seen": 224347525, + "step": 10414, + "time_per_iteration": 2.59133243560791 + }, + { + "auxiliary_loss_clip": 0.01092687, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.0372957, + "balance_loss_mlp": 1.02507973, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.931882380915717, + "language_loss": 0.74458748, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.76588094, + "num_input_tokens_seen": 224367045, + "step": 10415, + "time_per_iteration": 2.5531914234161377 + }, + { + "auxiliary_loss_clip": 0.01046281, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.03300321, + "balance_loss_mlp": 1.01775908, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.647538717296548, + "language_loss": 0.74390227, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76465893, + "num_input_tokens_seen": 224388860, + "step": 10416, + "time_per_iteration": 2.65840482711792 + }, + { + "auxiliary_loss_clip": 0.01074199, + "auxiliary_loss_mlp": 0.01029264, + "balance_loss_clip": 1.03509355, + "balance_loss_mlp": 1.01796865, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.7302490798640533, + "language_loss": 0.84092438, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86195904, + "num_input_tokens_seen": 224409645, + "step": 10417, + "time_per_iteration": 2.6308658123016357 + }, + { + "auxiliary_loss_clip": 0.01090738, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03373885, + "balance_loss_mlp": 1.01789451, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 6.150927006005818, + "language_loss": 0.56506377, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.58627379, + "num_input_tokens_seen": 224428530, + "step": 10418, + "time_per_iteration": 2.460869550704956 + }, + { + "auxiliary_loss_clip": 0.0109399, + "auxiliary_loss_mlp": 0.01037812, + "balance_loss_clip": 1.03421474, + "balance_loss_mlp": 1.02487826, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 2.72340520500162, + "language_loss": 0.84931374, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.87063181, + "num_input_tokens_seen": 224447175, + "step": 10419, + "time_per_iteration": 2.4819507598876953 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.03768981, + "balance_loss_mlp": 1.01839983, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 2.6927797190849434, + "language_loss": 0.64500493, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66635573, + "num_input_tokens_seen": 224469445, + "step": 10420, + "time_per_iteration": 2.5484490394592285 + }, + { + "auxiliary_loss_clip": 0.01058056, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.03197801, + "balance_loss_mlp": 1.0189476, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 1.938482003902791, + "language_loss": 0.86384654, + "learning_rate": 1.292975627485741e-06, + "loss": 0.884745, + "num_input_tokens_seen": 224486590, + "step": 10421, + "time_per_iteration": 4.077703952789307 + }, + { + "auxiliary_loss_clip": 0.01058429, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.03124118, + "balance_loss_mlp": 1.01845551, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 53.36081665366682, + "language_loss": 0.79287434, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81375372, + "num_input_tokens_seen": 224502795, + "step": 10422, + "time_per_iteration": 2.5469255447387695 + }, + { + "auxiliary_loss_clip": 0.01089422, + "auxiliary_loss_mlp": 0.01024699, + "balance_loss_clip": 1.03352571, + "balance_loss_mlp": 1.01261091, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 2.499094338063995, + "language_loss": 0.74111241, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76225364, + "num_input_tokens_seen": 224522300, + "step": 10423, + "time_per_iteration": 2.5407779216766357 + }, + { + "auxiliary_loss_clip": 0.01099462, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.03434551, + "balance_loss_mlp": 1.01746964, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 2.540012632424398, + "language_loss": 0.77737689, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79865837, + "num_input_tokens_seen": 224538260, + "step": 10424, + "time_per_iteration": 2.513864517211914 + }, + { + "auxiliary_loss_clip": 0.01100963, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.03564477, + "balance_loss_mlp": 1.01768613, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.8085931864159526, + "language_loss": 0.69133747, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71264744, + "num_input_tokens_seen": 224559155, + "step": 10425, + "time_per_iteration": 2.6433181762695312 + }, + { + "auxiliary_loss_clip": 0.01075128, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.03328562, + "balance_loss_mlp": 1.01938701, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.5148101053563845, + "language_loss": 0.74554801, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76659662, + "num_input_tokens_seen": 224578660, + "step": 10426, + "time_per_iteration": 2.589895486831665 + }, + { + "auxiliary_loss_clip": 0.0109117, + "auxiliary_loss_mlp": 0.00749317, + "balance_loss_clip": 1.03440642, + "balance_loss_mlp": 1.00022995, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.61395571219019, + "language_loss": 0.80262792, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82103276, + "num_input_tokens_seen": 224599080, + "step": 10427, + "time_per_iteration": 2.543191432952881 + }, + { + "auxiliary_loss_clip": 0.01067486, + "auxiliary_loss_mlp": 0.01037589, + "balance_loss_clip": 1.03332996, + "balance_loss_mlp": 1.02544713, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 2.892511336108521, + "language_loss": 0.68388504, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70493579, + "num_input_tokens_seen": 224614225, + "step": 10428, + "time_per_iteration": 2.5741400718688965 + }, + { + "auxiliary_loss_clip": 0.01052371, + "auxiliary_loss_mlp": 0.01045718, + "balance_loss_clip": 1.03108668, + "balance_loss_mlp": 1.0323782, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.8986259327662558, + "language_loss": 0.71520489, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73618573, + "num_input_tokens_seen": 224632365, + "step": 10429, + "time_per_iteration": 2.5303831100463867 + }, + { + "auxiliary_loss_clip": 0.01093669, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.03597832, + "balance_loss_mlp": 1.01874566, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.468330498998376, + "language_loss": 0.79982483, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82107961, + "num_input_tokens_seen": 224651125, + "step": 10430, + "time_per_iteration": 2.502256155014038 + }, + { + "auxiliary_loss_clip": 0.01026043, + "auxiliary_loss_mlp": 0.01007774, + "balance_loss_clip": 1.00529063, + "balance_loss_mlp": 1.00657547, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.764335258577718, + "language_loss": 0.59134191, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61168003, + "num_input_tokens_seen": 224716115, + "step": 10431, + "time_per_iteration": 3.149484157562256 + }, + { + "auxiliary_loss_clip": 0.01006199, + "auxiliary_loss_mlp": 0.01004258, + "balance_loss_clip": 1.00562501, + "balance_loss_mlp": 1.00313139, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8801212234622485, + "language_loss": 0.63808829, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65819287, + "num_input_tokens_seen": 224782930, + "step": 10432, + "time_per_iteration": 3.1553127765655518 + }, + { + "auxiliary_loss_clip": 0.01076461, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.03345048, + "balance_loss_mlp": 1.01805246, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.773087904749886, + "language_loss": 0.64620453, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.66725361, + "num_input_tokens_seen": 224802010, + "step": 10433, + "time_per_iteration": 2.6122796535491943 + }, + { + "auxiliary_loss_clip": 0.01094375, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.03638792, + "balance_loss_mlp": 1.02238178, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.080914742076475, + "language_loss": 0.61790812, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.63920128, + "num_input_tokens_seen": 224818875, + "step": 10434, + "time_per_iteration": 3.946399211883545 + }, + { + "auxiliary_loss_clip": 0.01065226, + "auxiliary_loss_mlp": 0.01024034, + "balance_loss_clip": 1.03219306, + "balance_loss_mlp": 1.01287603, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.7553138440116358, + "language_loss": 0.84492242, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86581498, + "num_input_tokens_seen": 224837790, + "step": 10435, + "time_per_iteration": 2.5689589977264404 + }, + { + "auxiliary_loss_clip": 0.01025344, + "auxiliary_loss_mlp": 0.01000602, + "balance_loss_clip": 1.00473714, + "balance_loss_mlp": 0.99945182, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7307747441473798, + "language_loss": 0.61547846, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.6357379, + "num_input_tokens_seen": 224899685, + "step": 10436, + "time_per_iteration": 4.631080389022827 + }, + { + "auxiliary_loss_clip": 0.01081887, + "auxiliary_loss_mlp": 0.01034282, + "balance_loss_clip": 1.03667951, + "balance_loss_mlp": 1.02180088, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.5683398034644753, + "language_loss": 0.77540189, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79656357, + "num_input_tokens_seen": 224918650, + "step": 10437, + "time_per_iteration": 2.6071035861968994 + }, + { + "auxiliary_loss_clip": 0.01015525, + "auxiliary_loss_mlp": 0.01000746, + "balance_loss_clip": 1.0046953, + "balance_loss_mlp": 0.99967909, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7238346657528337, + "language_loss": 0.54320467, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56336737, + "num_input_tokens_seen": 224981575, + "step": 10438, + "time_per_iteration": 3.0503175258636475 + }, + { + "auxiliary_loss_clip": 0.01046296, + "auxiliary_loss_mlp": 0.01043691, + "balance_loss_clip": 1.02940023, + "balance_loss_mlp": 1.03094113, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 1.6667118360654039, + "language_loss": 0.83945858, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86035848, + "num_input_tokens_seen": 225000820, + "step": 10439, + "time_per_iteration": 2.7291598320007324 + }, + { + "auxiliary_loss_clip": 0.01061016, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.0356735, + "balance_loss_mlp": 1.02597237, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.423103669016813, + "language_loss": 0.80520451, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.82619846, + "num_input_tokens_seen": 225017585, + "step": 10440, + "time_per_iteration": 2.5770654678344727 + }, + { + "auxiliary_loss_clip": 0.01043292, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.02965295, + "balance_loss_mlp": 1.02018332, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 2.4267026865746937, + "language_loss": 0.7435509, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76429111, + "num_input_tokens_seen": 225039085, + "step": 10441, + "time_per_iteration": 2.75693416595459 + }, + { + "auxiliary_loss_clip": 0.01080378, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.0321523, + "balance_loss_mlp": 1.01698422, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.8400595615707696, + "language_loss": 0.72241777, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74351072, + "num_input_tokens_seen": 225058105, + "step": 10442, + "time_per_iteration": 2.4946401119232178 + }, + { + "auxiliary_loss_clip": 0.0106405, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.03090692, + "balance_loss_mlp": 1.02088165, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.4759845386809767, + "language_loss": 0.71638274, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73734689, + "num_input_tokens_seen": 225077605, + "step": 10443, + "time_per_iteration": 2.614793539047241 + }, + { + "auxiliary_loss_clip": 0.01101031, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.03555775, + "balance_loss_mlp": 1.01848471, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 1.944027087507944, + "language_loss": 0.73742437, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75873613, + "num_input_tokens_seen": 225097775, + "step": 10444, + "time_per_iteration": 2.4984307289123535 + }, + { + "auxiliary_loss_clip": 0.01067492, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.03794885, + "balance_loss_mlp": 1.01667488, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.9980416654520228, + "language_loss": 0.7279582, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74891829, + "num_input_tokens_seen": 225115585, + "step": 10445, + "time_per_iteration": 2.601511001586914 + }, + { + "auxiliary_loss_clip": 0.01086508, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.03361309, + "balance_loss_mlp": 1.01534629, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.6297396588319228, + "language_loss": 0.68970513, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71083891, + "num_input_tokens_seen": 225135575, + "step": 10446, + "time_per_iteration": 2.519167423248291 + }, + { + "auxiliary_loss_clip": 0.0105795, + "auxiliary_loss_mlp": 0.01032997, + "balance_loss_clip": 1.03337073, + "balance_loss_mlp": 1.02031279, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 2.0021431518507633, + "language_loss": 0.73494512, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.75585461, + "num_input_tokens_seen": 225154230, + "step": 10447, + "time_per_iteration": 4.058156251907349 + }, + { + "auxiliary_loss_clip": 0.01017353, + "auxiliary_loss_mlp": 0.01000356, + "balance_loss_clip": 1.00643516, + "balance_loss_mlp": 0.99925882, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6777339014888808, + "language_loss": 0.52362323, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54380023, + "num_input_tokens_seen": 225213650, + "step": 10448, + "time_per_iteration": 2.980393648147583 + }, + { + "auxiliary_loss_clip": 0.0107296, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.03240633, + "balance_loss_mlp": 1.03747797, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.5666246519965616, + "language_loss": 0.91342688, + "learning_rate": 1.282785392633079e-06, + "loss": 0.934668, + "num_input_tokens_seen": 225230135, + "step": 10449, + "time_per_iteration": 2.5251128673553467 + }, + { + "auxiliary_loss_clip": 0.01099406, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.03406954, + "balance_loss_mlp": 1.02193427, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.6264887016999952, + "language_loss": 0.60228896, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62360847, + "num_input_tokens_seen": 225253520, + "step": 10450, + "time_per_iteration": 2.6641225814819336 + }, + { + "auxiliary_loss_clip": 0.01076855, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.03590882, + "balance_loss_mlp": 1.0188477, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.5843859330754293, + "language_loss": 0.76659971, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.78766698, + "num_input_tokens_seen": 225272460, + "step": 10451, + "time_per_iteration": 2.5334315299987793 + }, + { + "auxiliary_loss_clip": 0.01073358, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.03101242, + "balance_loss_mlp": 1.01790392, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 18.961806076208497, + "language_loss": 0.77355087, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79457974, + "num_input_tokens_seen": 225291700, + "step": 10452, + "time_per_iteration": 2.609550952911377 + }, + { + "auxiliary_loss_clip": 0.01072072, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.03590906, + "balance_loss_mlp": 1.02006257, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 2.129142539299793, + "language_loss": 0.72812182, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74916297, + "num_input_tokens_seen": 225311470, + "step": 10453, + "time_per_iteration": 2.645972967147827 + }, + { + "auxiliary_loss_clip": 0.01037605, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.02833247, + "balance_loss_mlp": 1.02385712, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.8005656724162824, + "language_loss": 0.80681038, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82756782, + "num_input_tokens_seen": 225328385, + "step": 10454, + "time_per_iteration": 2.5815224647521973 + }, + { + "auxiliary_loss_clip": 0.0107188, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.03670609, + "balance_loss_mlp": 1.02285826, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 1.84736685692316, + "language_loss": 0.82034653, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84140635, + "num_input_tokens_seen": 225348415, + "step": 10455, + "time_per_iteration": 2.6013379096984863 + }, + { + "auxiliary_loss_clip": 0.01047736, + "auxiliary_loss_mlp": 0.00749486, + "balance_loss_clip": 1.02869129, + "balance_loss_mlp": 1.0002079, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.625831127586694, + "language_loss": 0.81695956, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83493179, + "num_input_tokens_seen": 225367740, + "step": 10456, + "time_per_iteration": 2.654930353164673 + }, + { + "auxiliary_loss_clip": 0.01075552, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.0371443, + "balance_loss_mlp": 1.01795685, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.494489862761876, + "language_loss": 0.71729493, + "learning_rate": 1.27987780006486e-06, + "loss": 0.73835051, + "num_input_tokens_seen": 225388405, + "step": 10457, + "time_per_iteration": 2.5595662593841553 + }, + { + "auxiliary_loss_clip": 0.01093149, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.03294039, + "balance_loss_mlp": 1.02110386, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 1.7724717391672054, + "language_loss": 0.79683995, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81810725, + "num_input_tokens_seen": 225408360, + "step": 10458, + "time_per_iteration": 2.5484237670898438 + }, + { + "auxiliary_loss_clip": 0.01092726, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.03495729, + "balance_loss_mlp": 1.02114975, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.6106492784046242, + "language_loss": 0.60826266, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.62952071, + "num_input_tokens_seen": 225431310, + "step": 10459, + "time_per_iteration": 2.6028428077697754 + }, + { + "auxiliary_loss_clip": 0.01078384, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.03583407, + "balance_loss_mlp": 1.01690912, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 2.174653254392444, + "language_loss": 0.78972888, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.81078947, + "num_input_tokens_seen": 225450385, + "step": 10460, + "time_per_iteration": 2.5532498359680176 + }, + { + "auxiliary_loss_clip": 0.01055243, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.03090143, + "balance_loss_mlp": 1.01863587, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.7225310778682539, + "language_loss": 0.74139231, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76224744, + "num_input_tokens_seen": 225467325, + "step": 10461, + "time_per_iteration": 4.102842807769775 + }, + { + "auxiliary_loss_clip": 0.01077848, + "auxiliary_loss_mlp": 0.01040648, + "balance_loss_clip": 1.03297281, + "balance_loss_mlp": 1.02876902, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 2.649507732546579, + "language_loss": 0.69922924, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72041416, + "num_input_tokens_seen": 225487370, + "step": 10462, + "time_per_iteration": 2.610414505004883 + }, + { + "auxiliary_loss_clip": 0.01095643, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.03466487, + "balance_loss_mlp": 1.0182693, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 10.71399650553705, + "language_loss": 0.71944672, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74068844, + "num_input_tokens_seen": 225506915, + "step": 10463, + "time_per_iteration": 2.5501692295074463 + }, + { + "auxiliary_loss_clip": 0.01082253, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.03876913, + "balance_loss_mlp": 1.02250576, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.8993124423455399, + "language_loss": 0.72821808, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74938136, + "num_input_tokens_seen": 225525670, + "step": 10464, + "time_per_iteration": 2.5766351222991943 + }, + { + "auxiliary_loss_clip": 0.01070651, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.03483593, + "balance_loss_mlp": 1.02105081, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.6714772826891298, + "language_loss": 0.69392627, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.7149508, + "num_input_tokens_seen": 225542235, + "step": 10465, + "time_per_iteration": 2.5231380462646484 + }, + { + "auxiliary_loss_clip": 0.01015416, + "auxiliary_loss_mlp": 0.00999836, + "balance_loss_clip": 1.00456047, + "balance_loss_mlp": 0.99870378, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6786393919119984, + "language_loss": 0.59727681, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61742938, + "num_input_tokens_seen": 225607185, + "step": 10466, + "time_per_iteration": 3.1856751441955566 + }, + { + "auxiliary_loss_clip": 0.01060686, + "auxiliary_loss_mlp": 0.01030393, + "balance_loss_clip": 1.03015459, + "balance_loss_mlp": 1.02005136, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 2.1381019256819243, + "language_loss": 0.64588189, + "learning_rate": 1.276245767820154e-06, + "loss": 0.66679275, + "num_input_tokens_seen": 225628785, + "step": 10467, + "time_per_iteration": 2.7813737392425537 + }, + { + "auxiliary_loss_clip": 0.01008539, + "auxiliary_loss_mlp": 0.01002297, + "balance_loss_clip": 1.00756001, + "balance_loss_mlp": 1.00113475, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7927844982499919, + "language_loss": 0.56894666, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58905506, + "num_input_tokens_seen": 225678980, + "step": 10468, + "time_per_iteration": 2.8971354961395264 + }, + { + "auxiliary_loss_clip": 0.00986791, + "auxiliary_loss_mlp": 0.0100044, + "balance_loss_clip": 1.00763905, + "balance_loss_mlp": 0.99930775, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7275890998238099, + "language_loss": 0.57934016, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.59921247, + "num_input_tokens_seen": 225740295, + "step": 10469, + "time_per_iteration": 3.1225531101226807 + }, + { + "auxiliary_loss_clip": 0.01021052, + "auxiliary_loss_mlp": 0.01011798, + "balance_loss_clip": 1.0158385, + "balance_loss_mlp": 1.01073062, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6869370787235848, + "language_loss": 0.52149737, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54182583, + "num_input_tokens_seen": 225805615, + "step": 10470, + "time_per_iteration": 3.177097797393799 + }, + { + "auxiliary_loss_clip": 0.01086735, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.03491426, + "balance_loss_mlp": 1.02226937, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.6892874835848442, + "language_loss": 0.74362755, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76483595, + "num_input_tokens_seen": 225826585, + "step": 10471, + "time_per_iteration": 2.742770195007324 + }, + { + "auxiliary_loss_clip": 0.01073184, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.0360136, + "balance_loss_mlp": 1.01756418, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.9598049249918528, + "language_loss": 0.62942767, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65044588, + "num_input_tokens_seen": 225844095, + "step": 10472, + "time_per_iteration": 2.575550079345703 + }, + { + "auxiliary_loss_clip": 0.01105416, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.03766203, + "balance_loss_mlp": 1.02199638, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.915950228373273, + "language_loss": 0.69359481, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71498454, + "num_input_tokens_seen": 225864310, + "step": 10473, + "time_per_iteration": 4.038186073303223 + }, + { + "auxiliary_loss_clip": 0.01073276, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.0306617, + "balance_loss_mlp": 1.01732814, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.962189564588764, + "language_loss": 0.74572468, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.76673937, + "num_input_tokens_seen": 225883830, + "step": 10474, + "time_per_iteration": 2.5680887699127197 + }, + { + "auxiliary_loss_clip": 0.01074729, + "auxiliary_loss_mlp": 0.00749339, + "balance_loss_clip": 1.03182399, + "balance_loss_mlp": 1.00028396, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.5809638013037783, + "language_loss": 0.66288984, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68113053, + "num_input_tokens_seen": 225905755, + "step": 10475, + "time_per_iteration": 2.6477038860321045 + }, + { + "auxiliary_loss_clip": 0.01050601, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.03092909, + "balance_loss_mlp": 1.01737738, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 1.8104409978291556, + "language_loss": 0.90195847, + "learning_rate": 1.272979284940101e-06, + "loss": 0.9227469, + "num_input_tokens_seen": 225922155, + "step": 10476, + "time_per_iteration": 2.6072378158569336 + }, + { + "auxiliary_loss_clip": 0.01100138, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.03577316, + "balance_loss_mlp": 1.01990759, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.9076514959774953, + "language_loss": 0.75184071, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.77314913, + "num_input_tokens_seen": 225941060, + "step": 10477, + "time_per_iteration": 2.5510201454162598 + }, + { + "auxiliary_loss_clip": 0.01086273, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.03397584, + "balance_loss_mlp": 1.01598907, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.715875988598187, + "language_loss": 0.70156044, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72270083, + "num_input_tokens_seen": 225960870, + "step": 10478, + "time_per_iteration": 3.9921679496765137 + }, + { + "auxiliary_loss_clip": 0.01094458, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.03505707, + "balance_loss_mlp": 1.01767743, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.3285216141607017, + "language_loss": 0.67308557, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69433272, + "num_input_tokens_seen": 225977895, + "step": 10479, + "time_per_iteration": 2.493844747543335 + }, + { + "auxiliary_loss_clip": 0.01074529, + "auxiliary_loss_mlp": 0.00749311, + "balance_loss_clip": 1.03371441, + "balance_loss_mlp": 1.00019479, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 1.6065832221861671, + "language_loss": 0.73732805, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75556642, + "num_input_tokens_seen": 225997835, + "step": 10480, + "time_per_iteration": 2.5840814113616943 + }, + { + "auxiliary_loss_clip": 0.01088687, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.0331645, + "balance_loss_mlp": 1.01564741, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 1.890994827333392, + "language_loss": 0.78581804, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.80698758, + "num_input_tokens_seen": 226017620, + "step": 10481, + "time_per_iteration": 2.5373473167419434 + }, + { + "auxiliary_loss_clip": 0.01015102, + "auxiliary_loss_mlp": 0.00999279, + "balance_loss_clip": 1.01299834, + "balance_loss_mlp": 0.99818802, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8919433819368713, + "language_loss": 0.61888295, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63902676, + "num_input_tokens_seen": 226068755, + "step": 10482, + "time_per_iteration": 2.8930537700653076 + }, + { + "auxiliary_loss_clip": 0.01095541, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.03536236, + "balance_loss_mlp": 1.01897347, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.604889331803194, + "language_loss": 0.83052039, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85179222, + "num_input_tokens_seen": 226084395, + "step": 10483, + "time_per_iteration": 2.520824670791626 + }, + { + "auxiliary_loss_clip": 0.01078476, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.03257036, + "balance_loss_mlp": 1.01929331, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.578911699588369, + "language_loss": 0.72637081, + "learning_rate": 1.270077618961487e-06, + "loss": 0.74745858, + "num_input_tokens_seen": 226105890, + "step": 10484, + "time_per_iteration": 2.6282799243927 + }, + { + "auxiliary_loss_clip": 0.01071482, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.0341481, + "balance_loss_mlp": 1.01437235, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.8571645912712595, + "language_loss": 0.74539256, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76637012, + "num_input_tokens_seen": 226126760, + "step": 10485, + "time_per_iteration": 2.665893077850342 + }, + { + "auxiliary_loss_clip": 0.01077136, + "auxiliary_loss_mlp": 0.00749439, + "balance_loss_clip": 1.03467667, + "balance_loss_mlp": 1.00027931, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.872252107529726, + "language_loss": 0.81601942, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83428514, + "num_input_tokens_seen": 226147315, + "step": 10486, + "time_per_iteration": 2.652974843978882 + }, + { + "auxiliary_loss_clip": 0.01078227, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.03572083, + "balance_loss_mlp": 1.02359688, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 2.0758362770102248, + "language_loss": 0.63913751, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.66026568, + "num_input_tokens_seen": 226165935, + "step": 10487, + "time_per_iteration": 4.15027642250061 + }, + { + "auxiliary_loss_clip": 0.01100643, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.03506303, + "balance_loss_mlp": 1.02419472, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.5192488080881368, + "language_loss": 0.66901124, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69037402, + "num_input_tokens_seen": 226186890, + "step": 10488, + "time_per_iteration": 2.5466232299804688 + }, + { + "auxiliary_loss_clip": 0.01080448, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.03382349, + "balance_loss_mlp": 1.0185256, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.7017545728648593, + "language_loss": 0.67198491, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69308388, + "num_input_tokens_seen": 226206710, + "step": 10489, + "time_per_iteration": 2.590768814086914 + }, + { + "auxiliary_loss_clip": 0.01061823, + "auxiliary_loss_mlp": 0.01040988, + "balance_loss_clip": 1.033324, + "balance_loss_mlp": 1.02745199, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.7467320589812672, + "language_loss": 0.69464409, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71567219, + "num_input_tokens_seen": 226225565, + "step": 10490, + "time_per_iteration": 2.564068555831909 + }, + { + "auxiliary_loss_clip": 0.0107587, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.03207135, + "balance_loss_mlp": 1.0235374, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 3.2918751022681207, + "language_loss": 0.78061354, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80172843, + "num_input_tokens_seen": 226243680, + "step": 10491, + "time_per_iteration": 2.6325461864471436 + }, + { + "auxiliary_loss_clip": 0.01073646, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.0336374, + "balance_loss_mlp": 1.02092004, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.939293621913047, + "language_loss": 0.55440903, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.5754683, + "num_input_tokens_seen": 226264345, + "step": 10492, + "time_per_iteration": 2.610079050064087 + }, + { + "auxiliary_loss_clip": 0.01103302, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.03532636, + "balance_loss_mlp": 1.02489305, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 1.9233695492228757, + "language_loss": 0.64075804, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66216242, + "num_input_tokens_seen": 226283165, + "step": 10493, + "time_per_iteration": 2.525522470474243 + }, + { + "auxiliary_loss_clip": 0.0106735, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.03477454, + "balance_loss_mlp": 1.01737177, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.8792726122913987, + "language_loss": 0.82817346, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84913838, + "num_input_tokens_seen": 226304080, + "step": 10494, + "time_per_iteration": 2.6414918899536133 + }, + { + "auxiliary_loss_clip": 0.0108733, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.03794241, + "balance_loss_mlp": 1.02458572, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.7330828664991573, + "language_loss": 0.78881437, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81005329, + "num_input_tokens_seen": 226325925, + "step": 10495, + "time_per_iteration": 2.803687810897827 + }, + { + "auxiliary_loss_clip": 0.01069314, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.03086352, + "balance_loss_mlp": 1.02111089, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 2.257929533760152, + "language_loss": 0.70472658, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72575837, + "num_input_tokens_seen": 226344190, + "step": 10496, + "time_per_iteration": 2.5315968990325928 + }, + { + "auxiliary_loss_clip": 0.0108253, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.03479564, + "balance_loss_mlp": 1.02579987, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 5.530862954412217, + "language_loss": 0.80238944, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82359421, + "num_input_tokens_seen": 226361520, + "step": 10497, + "time_per_iteration": 2.584486961364746 + }, + { + "auxiliary_loss_clip": 0.01065678, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.0312345, + "balance_loss_mlp": 1.02110577, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 2.266447602578696, + "language_loss": 0.74036884, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76134753, + "num_input_tokens_seen": 226381920, + "step": 10498, + "time_per_iteration": 2.59061598777771 + }, + { + "auxiliary_loss_clip": 0.010884, + "auxiliary_loss_mlp": 0.01035094, + "balance_loss_clip": 1.03181875, + "balance_loss_mlp": 1.02337015, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 1.9798102704440221, + "language_loss": 0.69993204, + "learning_rate": 1.264641775364217e-06, + "loss": 0.72116697, + "num_input_tokens_seen": 226400035, + "step": 10499, + "time_per_iteration": 2.6034610271453857 + }, + { + "auxiliary_loss_clip": 0.01091104, + "auxiliary_loss_mlp": 0.01043847, + "balance_loss_clip": 1.03677487, + "balance_loss_mlp": 1.03229547, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 2.363089286727622, + "language_loss": 0.6962238, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.71757334, + "num_input_tokens_seen": 226418280, + "step": 10500, + "time_per_iteration": 2.5842297077178955 + }, + { + "auxiliary_loss_clip": 0.01101988, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03566515, + "balance_loss_mlp": 1.0219214, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 1.9049214746310763, + "language_loss": 0.74605155, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76740199, + "num_input_tokens_seen": 226436650, + "step": 10501, + "time_per_iteration": 2.5246307849884033 + }, + { + "auxiliary_loss_clip": 0.01088775, + "auxiliary_loss_mlp": 0.00749329, + "balance_loss_clip": 1.03411198, + "balance_loss_mlp": 1.0002265, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.7231959103195085, + "language_loss": 0.75368255, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77206361, + "num_input_tokens_seen": 226456275, + "step": 10502, + "time_per_iteration": 4.072481870651245 + }, + { + "auxiliary_loss_clip": 0.01092652, + "auxiliary_loss_mlp": 0.01043969, + "balance_loss_clip": 1.03508949, + "balance_loss_mlp": 1.03167307, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 2.0465726922000975, + "language_loss": 0.85159361, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87295985, + "num_input_tokens_seen": 226473610, + "step": 10503, + "time_per_iteration": 2.573000907897949 + }, + { + "auxiliary_loss_clip": 0.0107014, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.03319228, + "balance_loss_mlp": 1.02269077, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 2.0944985173388946, + "language_loss": 0.86780858, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88885605, + "num_input_tokens_seen": 226493665, + "step": 10504, + "time_per_iteration": 2.595083475112915 + }, + { + "auxiliary_loss_clip": 0.01070968, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.03563643, + "balance_loss_mlp": 1.02070332, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.6138866286973017, + "language_loss": 0.76688766, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78793365, + "num_input_tokens_seen": 226511625, + "step": 10505, + "time_per_iteration": 2.5868260860443115 + }, + { + "auxiliary_loss_clip": 0.01055383, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.03058982, + "balance_loss_mlp": 1.01961184, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 2.447824589544766, + "language_loss": 0.82009494, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.84097385, + "num_input_tokens_seen": 226530085, + "step": 10506, + "time_per_iteration": 2.6817715167999268 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.03663588, + "balance_loss_mlp": 1.02268696, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.8347081660916291, + "language_loss": 0.7454018, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76679015, + "num_input_tokens_seen": 226548115, + "step": 10507, + "time_per_iteration": 2.5373599529266357 + }, + { + "auxiliary_loss_clip": 0.01081521, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.0365088, + "balance_loss_mlp": 1.02486515, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.7715614040891183, + "language_loss": 0.67937362, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.70056021, + "num_input_tokens_seen": 226567955, + "step": 10508, + "time_per_iteration": 2.567774772644043 + }, + { + "auxiliary_loss_clip": 0.01063232, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.03114152, + "balance_loss_mlp": 1.02265739, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.882619314134015, + "language_loss": 0.71096402, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73194206, + "num_input_tokens_seen": 226588205, + "step": 10509, + "time_per_iteration": 2.6592836380004883 + }, + { + "auxiliary_loss_clip": 0.01085178, + "auxiliary_loss_mlp": 0.01028297, + "balance_loss_clip": 1.03521824, + "balance_loss_mlp": 1.01718688, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.8290360726663113, + "language_loss": 0.79551554, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81665033, + "num_input_tokens_seen": 226606965, + "step": 10510, + "time_per_iteration": 2.5660314559936523 + }, + { + "auxiliary_loss_clip": 0.01059446, + "auxiliary_loss_mlp": 0.00749512, + "balance_loss_clip": 1.03409719, + "balance_loss_mlp": 1.00022101, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.4702078783818908, + "language_loss": 0.70610213, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72419178, + "num_input_tokens_seen": 226627845, + "step": 10511, + "time_per_iteration": 2.6862263679504395 + }, + { + "auxiliary_loss_clip": 0.01098974, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.03433847, + "balance_loss_mlp": 1.02545273, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.6859300880775296, + "language_loss": 0.80244869, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82380229, + "num_input_tokens_seen": 226645855, + "step": 10512, + "time_per_iteration": 2.5510973930358887 + }, + { + "auxiliary_loss_clip": 0.01091094, + "auxiliary_loss_mlp": 0.01033357, + "balance_loss_clip": 1.03550935, + "balance_loss_mlp": 1.02097082, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.889829317500527, + "language_loss": 0.7062223, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72746676, + "num_input_tokens_seen": 226665375, + "step": 10513, + "time_per_iteration": 4.083660364151001 + }, + { + "auxiliary_loss_clip": 0.01091282, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.03422403, + "balance_loss_mlp": 1.02121305, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.6431091467003023, + "language_loss": 0.66384768, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68509817, + "num_input_tokens_seen": 226685270, + "step": 10514, + "time_per_iteration": 2.529298782348633 + }, + { + "auxiliary_loss_clip": 0.01061607, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.02941656, + "balance_loss_mlp": 1.01878095, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.9957047783163566, + "language_loss": 0.74542487, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76634908, + "num_input_tokens_seen": 226705325, + "step": 10515, + "time_per_iteration": 2.690445899963379 + }, + { + "auxiliary_loss_clip": 0.01074816, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.034482, + "balance_loss_mlp": 1.01795006, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.7586772998700448, + "language_loss": 0.89522934, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.91626859, + "num_input_tokens_seen": 226723815, + "step": 10516, + "time_per_iteration": 2.529099702835083 + }, + { + "auxiliary_loss_clip": 0.01107801, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.03744721, + "balance_loss_mlp": 1.01844585, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.8371907797438545, + "language_loss": 0.81978875, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84118128, + "num_input_tokens_seen": 226741550, + "step": 10517, + "time_per_iteration": 2.537585973739624 + }, + { + "auxiliary_loss_clip": 0.01046154, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.03343081, + "balance_loss_mlp": 1.02311838, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.6721122293979633, + "language_loss": 0.77705681, + "learning_rate": 1.257765386189541e-06, + "loss": 0.7978642, + "num_input_tokens_seen": 226761115, + "step": 10518, + "time_per_iteration": 4.3056480884552 + }, + { + "auxiliary_loss_clip": 0.01084882, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.03362918, + "balance_loss_mlp": 1.01991343, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.4416627716935524, + "language_loss": 0.8526243, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87378776, + "num_input_tokens_seen": 226782225, + "step": 10519, + "time_per_iteration": 3.017761707305908 + }, + { + "auxiliary_loss_clip": 0.01073858, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.03305387, + "balance_loss_mlp": 1.02246666, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.5223216799051817, + "language_loss": 0.72012949, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.74120939, + "num_input_tokens_seen": 226802375, + "step": 10520, + "time_per_iteration": 2.616429090499878 + }, + { + "auxiliary_loss_clip": 0.0108722, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.03269506, + "balance_loss_mlp": 1.01886535, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.8394367441572277, + "language_loss": 0.71326202, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73443592, + "num_input_tokens_seen": 226822165, + "step": 10521, + "time_per_iteration": 2.747070789337158 + }, + { + "auxiliary_loss_clip": 0.01059859, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.03311884, + "balance_loss_mlp": 1.02613544, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.6788602869828289, + "language_loss": 0.7183584, + "learning_rate": 1.256319016853377e-06, + "loss": 0.73936379, + "num_input_tokens_seen": 226841645, + "step": 10522, + "time_per_iteration": 2.6821110248565674 + }, + { + "auxiliary_loss_clip": 0.0105957, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.03540361, + "balance_loss_mlp": 1.02062368, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.9426728585363164, + "language_loss": 0.81654358, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83746254, + "num_input_tokens_seen": 226860355, + "step": 10523, + "time_per_iteration": 2.690279722213745 + }, + { + "auxiliary_loss_clip": 0.01089938, + "auxiliary_loss_mlp": 0.01025897, + "balance_loss_clip": 1.03445518, + "balance_loss_mlp": 1.01432157, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.390494746816635, + "language_loss": 0.73576236, + "learning_rate": 1.255596001333195e-06, + "loss": 0.7569207, + "num_input_tokens_seen": 226878390, + "step": 10524, + "time_per_iteration": 2.571794033050537 + }, + { + "auxiliary_loss_clip": 0.01087093, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.03477454, + "balance_loss_mlp": 1.02270818, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 2.290787686777311, + "language_loss": 0.83917028, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86039871, + "num_input_tokens_seen": 226898420, + "step": 10525, + "time_per_iteration": 2.659130334854126 + }, + { + "auxiliary_loss_clip": 0.01064784, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.02903068, + "balance_loss_mlp": 1.01904058, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 1.6158186097460971, + "language_loss": 0.66762513, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.68859029, + "num_input_tokens_seen": 226916305, + "step": 10526, + "time_per_iteration": 2.5544517040252686 + }, + { + "auxiliary_loss_clip": 0.01095181, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.03726625, + "balance_loss_mlp": 1.02138436, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 1.458390106572676, + "language_loss": 0.73253953, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75383478, + "num_input_tokens_seen": 226937705, + "step": 10527, + "time_per_iteration": 4.150975465774536 + }, + { + "auxiliary_loss_clip": 0.01087507, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.03444588, + "balance_loss_mlp": 1.02160501, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 2.2654169619582794, + "language_loss": 0.71448207, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.7356835, + "num_input_tokens_seen": 226954880, + "step": 10528, + "time_per_iteration": 2.5408051013946533 + }, + { + "auxiliary_loss_clip": 0.01089458, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.03537655, + "balance_loss_mlp": 1.01589417, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 3.248068421449386, + "language_loss": 0.66502213, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68619251, + "num_input_tokens_seen": 226972595, + "step": 10529, + "time_per_iteration": 2.5698161125183105 + }, + { + "auxiliary_loss_clip": 0.01090622, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.0343399, + "balance_loss_mlp": 1.02296519, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 2.082122642995034, + "language_loss": 0.74851751, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.76977932, + "num_input_tokens_seen": 226991910, + "step": 10530, + "time_per_iteration": 2.5082738399505615 + }, + { + "auxiliary_loss_clip": 0.01092068, + "auxiliary_loss_mlp": 0.00749201, + "balance_loss_clip": 1.03623486, + "balance_loss_mlp": 1.00021696, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 2.5506080498968235, + "language_loss": 0.73867416, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75708687, + "num_input_tokens_seen": 227010175, + "step": 10531, + "time_per_iteration": 2.49798583984375 + }, + { + "auxiliary_loss_clip": 0.01065893, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.0352087, + "balance_loss_mlp": 1.0167073, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.675645375620033, + "language_loss": 0.79922336, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.82016671, + "num_input_tokens_seen": 227025540, + "step": 10532, + "time_per_iteration": 2.598762273788452 + }, + { + "auxiliary_loss_clip": 0.01088061, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.03415966, + "balance_loss_mlp": 1.01944613, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.5341546270596866, + "language_loss": 0.74403501, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.7652154, + "num_input_tokens_seen": 227045520, + "step": 10533, + "time_per_iteration": 2.5622549057006836 + }, + { + "auxiliary_loss_clip": 0.01081152, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.03694105, + "balance_loss_mlp": 1.02459741, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.2836786253637564, + "language_loss": 0.76981491, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79100001, + "num_input_tokens_seen": 227059420, + "step": 10534, + "time_per_iteration": 2.5596389770507812 + }, + { + "auxiliary_loss_clip": 0.01061666, + "auxiliary_loss_mlp": 0.01040995, + "balance_loss_clip": 1.03171062, + "balance_loss_mlp": 1.02770877, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.657944524163105, + "language_loss": 0.8555553, + "learning_rate": 1.251621437204777e-06, + "loss": 0.87658185, + "num_input_tokens_seen": 227081310, + "step": 10535, + "time_per_iteration": 2.687034845352173 + }, + { + "auxiliary_loss_clip": 0.01093474, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.03572214, + "balance_loss_mlp": 1.02012253, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 1.7721133106139604, + "language_loss": 0.76356238, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78481907, + "num_input_tokens_seen": 227100365, + "step": 10536, + "time_per_iteration": 2.6258749961853027 + }, + { + "auxiliary_loss_clip": 0.01085804, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.03583288, + "balance_loss_mlp": 1.01832795, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.8771576808355686, + "language_loss": 0.60199773, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62316561, + "num_input_tokens_seen": 227119680, + "step": 10537, + "time_per_iteration": 2.5956711769104004 + }, + { + "auxiliary_loss_clip": 0.00998829, + "auxiliary_loss_mlp": 0.00998135, + "balance_loss_clip": 1.00808454, + "balance_loss_mlp": 0.9969846, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.8727481095180234, + "language_loss": 0.52463561, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54460526, + "num_input_tokens_seen": 227184465, + "step": 10538, + "time_per_iteration": 3.250807046890259 + }, + { + "auxiliary_loss_clip": 0.01084479, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.03730547, + "balance_loss_mlp": 1.02025044, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 2.1886029150207658, + "language_loss": 0.83539772, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85657167, + "num_input_tokens_seen": 227202185, + "step": 10539, + "time_per_iteration": 2.6140658855438232 + }, + { + "auxiliary_loss_clip": 0.01072126, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.03294277, + "balance_loss_mlp": 1.01893091, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.716811824751692, + "language_loss": 0.86741883, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88845462, + "num_input_tokens_seen": 227222020, + "step": 10540, + "time_per_iteration": 2.7036221027374268 + }, + { + "auxiliary_loss_clip": 0.01077122, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.0344162, + "balance_loss_mlp": 1.01768541, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.7637995901849486, + "language_loss": 0.72590047, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74695134, + "num_input_tokens_seen": 227240885, + "step": 10541, + "time_per_iteration": 2.676015853881836 + }, + { + "auxiliary_loss_clip": 0.0109462, + "auxiliary_loss_mlp": 0.0103351, + "balance_loss_clip": 1.03519487, + "balance_loss_mlp": 1.02092731, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.623467683412661, + "language_loss": 0.84772241, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.86900371, + "num_input_tokens_seen": 227257880, + "step": 10542, + "time_per_iteration": 4.050605535507202 + }, + { + "auxiliary_loss_clip": 0.01090738, + "auxiliary_loss_mlp": 0.01029014, + "balance_loss_clip": 1.0352993, + "balance_loss_mlp": 1.01639009, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 2.650493036308114, + "language_loss": 0.77625084, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.79744834, + "num_input_tokens_seen": 227274840, + "step": 10543, + "time_per_iteration": 2.512352466583252 + }, + { + "auxiliary_loss_clip": 0.01049212, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.03361821, + "balance_loss_mlp": 1.01974678, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.5265145187693927, + "language_loss": 0.73617303, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75696874, + "num_input_tokens_seen": 227294835, + "step": 10544, + "time_per_iteration": 2.6823508739471436 + }, + { + "auxiliary_loss_clip": 0.01074297, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.03520763, + "balance_loss_mlp": 1.02414119, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 2.0848929637073867, + "language_loss": 0.68507648, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70618445, + "num_input_tokens_seen": 227314935, + "step": 10545, + "time_per_iteration": 2.6619553565979004 + }, + { + "auxiliary_loss_clip": 0.01076018, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.03160179, + "balance_loss_mlp": 1.02295375, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.1890842973404787, + "language_loss": 0.7051906, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.72630197, + "num_input_tokens_seen": 227332905, + "step": 10546, + "time_per_iteration": 2.5912575721740723 + }, + { + "auxiliary_loss_clip": 0.01088576, + "auxiliary_loss_mlp": 0.01026174, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.01543915, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.366528526001327, + "language_loss": 0.78056556, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80171305, + "num_input_tokens_seen": 227354915, + "step": 10547, + "time_per_iteration": 2.6396331787109375 + }, + { + "auxiliary_loss_clip": 0.01054326, + "auxiliary_loss_mlp": 0.0103087, + "balance_loss_clip": 1.03025484, + "balance_loss_mlp": 1.01914597, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.7232318415366985, + "language_loss": 0.63134682, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65219879, + "num_input_tokens_seen": 227372990, + "step": 10548, + "time_per_iteration": 2.6726818084716797 + }, + { + "auxiliary_loss_clip": 0.0107085, + "auxiliary_loss_mlp": 0.01028443, + "balance_loss_clip": 1.03206444, + "balance_loss_mlp": 1.01697516, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.7710881080195136, + "language_loss": 0.61904401, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.64003694, + "num_input_tokens_seen": 227393270, + "step": 10549, + "time_per_iteration": 2.706522226333618 + }, + { + "auxiliary_loss_clip": 0.01053611, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.0345366, + "balance_loss_mlp": 1.01946104, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.8441863277035864, + "language_loss": 0.73344415, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75428158, + "num_input_tokens_seen": 227413630, + "step": 10550, + "time_per_iteration": 2.667072057723999 + }, + { + "auxiliary_loss_clip": 0.0100147, + "auxiliary_loss_mlp": 0.01004262, + "balance_loss_clip": 1.01278436, + "balance_loss_mlp": 1.00307035, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.7076061547964976, + "language_loss": 0.57782865, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59788597, + "num_input_tokens_seen": 227476630, + "step": 10551, + "time_per_iteration": 3.178623676300049 + }, + { + "auxiliary_loss_clip": 0.01067855, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.03560781, + "balance_loss_mlp": 1.01542592, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.7340295398383352, + "language_loss": 0.66983944, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69078028, + "num_input_tokens_seen": 227496060, + "step": 10552, + "time_per_iteration": 2.7595701217651367 + }, + { + "auxiliary_loss_clip": 0.01061736, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.03178751, + "balance_loss_mlp": 1.0166012, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.7249094488522025, + "language_loss": 0.82062709, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84153724, + "num_input_tokens_seen": 227513440, + "step": 10553, + "time_per_iteration": 4.119039297103882 + }, + { + "auxiliary_loss_clip": 0.01091725, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.03424823, + "balance_loss_mlp": 1.01804614, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 4.050430682499314, + "language_loss": 0.55225205, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.57346737, + "num_input_tokens_seen": 227535395, + "step": 10554, + "time_per_iteration": 2.7348477840423584 + }, + { + "auxiliary_loss_clip": 0.01082826, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.03608561, + "balance_loss_mlp": 1.02009881, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 2.517132515247849, + "language_loss": 0.70778656, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72893059, + "num_input_tokens_seen": 227554545, + "step": 10555, + "time_per_iteration": 2.615098476409912 + }, + { + "auxiliary_loss_clip": 0.01005602, + "auxiliary_loss_mlp": 0.01000793, + "balance_loss_clip": 1.00444162, + "balance_loss_mlp": 0.99966604, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.8028468800800409, + "language_loss": 0.55326772, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57333171, + "num_input_tokens_seen": 227608575, + "step": 10556, + "time_per_iteration": 3.0364136695861816 + }, + { + "auxiliary_loss_clip": 0.01080974, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.03283799, + "balance_loss_mlp": 1.01748288, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 4.286424025430047, + "language_loss": 0.68645918, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70757526, + "num_input_tokens_seen": 227628175, + "step": 10557, + "time_per_iteration": 2.619201183319092 + }, + { + "auxiliary_loss_clip": 0.01069611, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.03382409, + "balance_loss_mlp": 1.02269423, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.5596667632747123, + "language_loss": 0.69965541, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72069657, + "num_input_tokens_seen": 227645330, + "step": 10558, + "time_per_iteration": 4.039411306381226 + }, + { + "auxiliary_loss_clip": 0.01068555, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.03287244, + "balance_loss_mlp": 1.01915526, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.6447699545548091, + "language_loss": 0.7826156, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80361116, + "num_input_tokens_seen": 227665250, + "step": 10559, + "time_per_iteration": 2.5760903358459473 + }, + { + "auxiliary_loss_clip": 0.0107186, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.03228295, + "balance_loss_mlp": 1.01973152, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 1.7918283074684458, + "language_loss": 0.68046892, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70150936, + "num_input_tokens_seen": 227685070, + "step": 10560, + "time_per_iteration": 2.5708160400390625 + }, + { + "auxiliary_loss_clip": 0.01072085, + "auxiliary_loss_mlp": 0.01043403, + "balance_loss_clip": 1.03214097, + "balance_loss_mlp": 1.02988446, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.892477955975432, + "language_loss": 0.77016866, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79132354, + "num_input_tokens_seen": 227704430, + "step": 10561, + "time_per_iteration": 2.5997610092163086 + }, + { + "auxiliary_loss_clip": 0.01078349, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.03255153, + "balance_loss_mlp": 1.01986229, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 7.565692158642145, + "language_loss": 0.71890819, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74000561, + "num_input_tokens_seen": 227724920, + "step": 10562, + "time_per_iteration": 2.622844934463501 + }, + { + "auxiliary_loss_clip": 0.01087796, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03520131, + "balance_loss_mlp": 1.01716757, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.0065491058733755, + "language_loss": 0.80839682, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.82957101, + "num_input_tokens_seen": 227743400, + "step": 10563, + "time_per_iteration": 2.570688486099243 + }, + { + "auxiliary_loss_clip": 0.01074259, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.03583765, + "balance_loss_mlp": 1.02066278, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.5140061093435686, + "language_loss": 0.81101972, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83208972, + "num_input_tokens_seen": 227759990, + "step": 10564, + "time_per_iteration": 2.7065348625183105 + }, + { + "auxiliary_loss_clip": 0.01069648, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.03511453, + "balance_loss_mlp": 1.02051258, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.6594673371616304, + "language_loss": 0.72850055, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74953246, + "num_input_tokens_seen": 227780835, + "step": 10565, + "time_per_iteration": 2.682851791381836 + }, + { + "auxiliary_loss_clip": 0.01088129, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.03601038, + "balance_loss_mlp": 1.02015841, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.226171187654655, + "language_loss": 0.68778527, + "learning_rate": 1.240438926700324e-06, + "loss": 0.70899397, + "num_input_tokens_seen": 227798580, + "step": 10566, + "time_per_iteration": 2.526870012283325 + }, + { + "auxiliary_loss_clip": 0.01089237, + "auxiliary_loss_mlp": 0.01031865, + "balance_loss_clip": 1.03531265, + "balance_loss_mlp": 1.02065372, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.576410379757501, + "language_loss": 0.69481349, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71602452, + "num_input_tokens_seen": 227819210, + "step": 10567, + "time_per_iteration": 4.120081663131714 + }, + { + "auxiliary_loss_clip": 0.01087799, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.03732896, + "balance_loss_mlp": 1.01914108, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 1.905767872370008, + "language_loss": 0.84522235, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86639875, + "num_input_tokens_seen": 227838340, + "step": 10568, + "time_per_iteration": 2.5448460578918457 + }, + { + "auxiliary_loss_clip": 0.01038629, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.03222442, + "balance_loss_mlp": 1.02262664, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.6925260550459482, + "language_loss": 0.84010595, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86084747, + "num_input_tokens_seen": 227859170, + "step": 10569, + "time_per_iteration": 2.7044336795806885 + }, + { + "auxiliary_loss_clip": 0.01088314, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.03427827, + "balance_loss_mlp": 1.01734519, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.824884659480737, + "language_loss": 0.6934613, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71463531, + "num_input_tokens_seen": 227878545, + "step": 10570, + "time_per_iteration": 2.5516035556793213 + }, + { + "auxiliary_loss_clip": 0.0109118, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_clip": 1.03350317, + "balance_loss_mlp": 1.01938033, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.739701240975854, + "language_loss": 0.65743828, + "learning_rate": 1.2386378775476e-06, + "loss": 0.6786654, + "num_input_tokens_seen": 227898875, + "step": 10571, + "time_per_iteration": 2.5990395545959473 + }, + { + "auxiliary_loss_clip": 0.01096759, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.03738999, + "balance_loss_mlp": 1.01454079, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.5987026446428783, + "language_loss": 0.70818853, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.72941983, + "num_input_tokens_seen": 227917130, + "step": 10572, + "time_per_iteration": 2.5096213817596436 + }, + { + "auxiliary_loss_clip": 0.01068835, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.03466558, + "balance_loss_mlp": 1.02025568, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.4356169991674963, + "language_loss": 0.81335747, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.8343637, + "num_input_tokens_seen": 227939550, + "step": 10573, + "time_per_iteration": 2.609532117843628 + }, + { + "auxiliary_loss_clip": 0.01083407, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.03538609, + "balance_loss_mlp": 1.02101684, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.7147287260151514, + "language_loss": 0.68759567, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.70875162, + "num_input_tokens_seen": 227962200, + "step": 10574, + "time_per_iteration": 2.7896833419799805 + }, + { + "auxiliary_loss_clip": 0.01100911, + "auxiliary_loss_mlp": 0.01027289, + "balance_loss_clip": 1.0354476, + "balance_loss_mlp": 1.01557112, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.2486236190419167, + "language_loss": 0.86875272, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.89003468, + "num_input_tokens_seen": 227979270, + "step": 10575, + "time_per_iteration": 2.4884824752807617 + }, + { + "auxiliary_loss_clip": 0.01100881, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.03538847, + "balance_loss_mlp": 1.02124178, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.6073105752453, + "language_loss": 0.72126532, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.7426011, + "num_input_tokens_seen": 228000550, + "step": 10576, + "time_per_iteration": 2.5290772914886475 + }, + { + "auxiliary_loss_clip": 0.01080174, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.03395915, + "balance_loss_mlp": 1.01848245, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.7003117372917906, + "language_loss": 0.69653094, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71764052, + "num_input_tokens_seen": 228022005, + "step": 10577, + "time_per_iteration": 2.606088399887085 + }, + { + "auxiliary_loss_clip": 0.01061821, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.03504229, + "balance_loss_mlp": 1.02245927, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.5889013363589122, + "language_loss": 0.72068667, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.7416479, + "num_input_tokens_seen": 228043770, + "step": 10578, + "time_per_iteration": 2.8182408809661865 + }, + { + "auxiliary_loss_clip": 0.01002319, + "auxiliary_loss_mlp": 0.0074697, + "balance_loss_clip": 1.01002455, + "balance_loss_mlp": 1.000193, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7038517552843552, + "language_loss": 0.54502559, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56251848, + "num_input_tokens_seen": 228104985, + "step": 10579, + "time_per_iteration": 3.2412490844726562 + }, + { + "auxiliary_loss_clip": 0.01075714, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.0328722, + "balance_loss_mlp": 1.01738048, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 2.236673483162899, + "language_loss": 0.77229536, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79334748, + "num_input_tokens_seen": 228125620, + "step": 10580, + "time_per_iteration": 2.610539674758911 + }, + { + "auxiliary_loss_clip": 0.01071157, + "auxiliary_loss_mlp": 0.00749473, + "balance_loss_clip": 1.03429508, + "balance_loss_mlp": 1.0002985, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.590054923537448, + "language_loss": 0.66816241, + "learning_rate": 1.235037946268301e-06, + "loss": 0.6863687, + "num_input_tokens_seen": 228143495, + "step": 10581, + "time_per_iteration": 2.657151222229004 + }, + { + "auxiliary_loss_clip": 0.0108697, + "auxiliary_loss_mlp": 0.0103071, + "balance_loss_clip": 1.033355, + "balance_loss_mlp": 1.01998723, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.364123864115636, + "language_loss": 0.6852541, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70643091, + "num_input_tokens_seen": 228166500, + "step": 10582, + "time_per_iteration": 4.107583999633789 + }, + { + "auxiliary_loss_clip": 0.01076309, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.03419793, + "balance_loss_mlp": 1.02539587, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 2.0407979111980317, + "language_loss": 0.84685838, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86798829, + "num_input_tokens_seen": 228185325, + "step": 10583, + "time_per_iteration": 2.6355528831481934 + }, + { + "auxiliary_loss_clip": 0.01080177, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.03736651, + "balance_loss_mlp": 1.01565623, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.6333534613355893, + "language_loss": 0.75376379, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77483881, + "num_input_tokens_seen": 228204050, + "step": 10584, + "time_per_iteration": 2.579406261444092 + }, + { + "auxiliary_loss_clip": 0.01077114, + "auxiliary_loss_mlp": 0.01036914, + "balance_loss_clip": 1.03526258, + "balance_loss_mlp": 1.02406335, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.9153726572454914, + "language_loss": 0.72671831, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.74785864, + "num_input_tokens_seen": 228222430, + "step": 10585, + "time_per_iteration": 2.5591745376586914 + }, + { + "auxiliary_loss_clip": 0.01064954, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.03274715, + "balance_loss_mlp": 1.02136135, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 2.8572511090336774, + "language_loss": 0.82610989, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.84707892, + "num_input_tokens_seen": 228241925, + "step": 10586, + "time_per_iteration": 2.7161810398101807 + }, + { + "auxiliary_loss_clip": 0.01087645, + "auxiliary_loss_mlp": 0.01025423, + "balance_loss_clip": 1.03391027, + "balance_loss_mlp": 1.014575, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 1.6022727945585322, + "language_loss": 0.72144639, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74257708, + "num_input_tokens_seen": 228262535, + "step": 10587, + "time_per_iteration": 2.5725600719451904 + }, + { + "auxiliary_loss_clip": 0.01080209, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.03646255, + "balance_loss_mlp": 1.01840401, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 1.9089994551584262, + "language_loss": 0.7660898, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.78718418, + "num_input_tokens_seen": 228281340, + "step": 10588, + "time_per_iteration": 2.610818386077881 + }, + { + "auxiliary_loss_clip": 0.010517, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.03294563, + "balance_loss_mlp": 1.01837695, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.4652784953538303, + "language_loss": 0.79899585, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.81981266, + "num_input_tokens_seen": 228300865, + "step": 10589, + "time_per_iteration": 2.72843074798584 + }, + { + "auxiliary_loss_clip": 0.01076283, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.03229117, + "balance_loss_mlp": 1.02062535, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 7.784588682925299, + "language_loss": 0.67142898, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69251621, + "num_input_tokens_seen": 228320815, + "step": 10590, + "time_per_iteration": 2.635420083999634 + }, + { + "auxiliary_loss_clip": 0.01098771, + "auxiliary_loss_mlp": 0.01036659, + "balance_loss_clip": 1.03690267, + "balance_loss_mlp": 1.0246253, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.6849720231445249, + "language_loss": 0.78821635, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.80957067, + "num_input_tokens_seen": 228339065, + "step": 10591, + "time_per_iteration": 2.494450330734253 + }, + { + "auxiliary_loss_clip": 0.01077483, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.03403842, + "balance_loss_mlp": 1.01824236, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.4366632617488218, + "language_loss": 0.8892566, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91032231, + "num_input_tokens_seen": 228359210, + "step": 10592, + "time_per_iteration": 2.6064906120300293 + }, + { + "auxiliary_loss_clip": 0.01083767, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_clip": 1.03187919, + "balance_loss_mlp": 1.01686764, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.3919403332150275, + "language_loss": 0.68489313, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70600688, + "num_input_tokens_seen": 228379630, + "step": 10593, + "time_per_iteration": 4.131492376327515 + }, + { + "auxiliary_loss_clip": 0.01042564, + "auxiliary_loss_mlp": 0.0104028, + "balance_loss_clip": 1.02648354, + "balance_loss_mlp": 1.02783453, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.7647234447630349, + "language_loss": 0.63440931, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65523773, + "num_input_tokens_seen": 228401410, + "step": 10594, + "time_per_iteration": 2.730931520462036 + }, + { + "auxiliary_loss_clip": 0.01016427, + "auxiliary_loss_mlp": 0.01008477, + "balance_loss_clip": 1.00821638, + "balance_loss_mlp": 1.00735021, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7692602867678175, + "language_loss": 0.54616082, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56640983, + "num_input_tokens_seen": 228470335, + "step": 10595, + "time_per_iteration": 3.2093329429626465 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.01038514, + "balance_loss_clip": 1.03713322, + "balance_loss_mlp": 1.02628946, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 2.142546480337996, + "language_loss": 0.67202759, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69346869, + "num_input_tokens_seen": 228490765, + "step": 10596, + "time_per_iteration": 2.5406503677368164 + }, + { + "auxiliary_loss_clip": 0.01083904, + "auxiliary_loss_mlp": 0.01032781, + "balance_loss_clip": 1.03466964, + "balance_loss_mlp": 1.02109861, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.0970134321359883, + "language_loss": 0.78450817, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.80567503, + "num_input_tokens_seen": 228509700, + "step": 10597, + "time_per_iteration": 2.623852491378784 + }, + { + "auxiliary_loss_clip": 0.01092207, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.03695691, + "balance_loss_mlp": 1.02270329, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.7580267289235545, + "language_loss": 0.74832952, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.76958537, + "num_input_tokens_seen": 228529050, + "step": 10598, + "time_per_iteration": 3.9990530014038086 + }, + { + "auxiliary_loss_clip": 0.01068447, + "auxiliary_loss_mlp": 0.00749421, + "balance_loss_clip": 1.03288817, + "balance_loss_mlp": 1.0002569, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.807727235607575, + "language_loss": 0.68440068, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70257938, + "num_input_tokens_seen": 228544665, + "step": 10599, + "time_per_iteration": 2.5615415573120117 + }, + { + "auxiliary_loss_clip": 0.010597, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.03121245, + "balance_loss_mlp": 1.01588857, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.5208419265182163, + "language_loss": 0.80715662, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82803786, + "num_input_tokens_seen": 228562060, + "step": 10600, + "time_per_iteration": 2.5885732173919678 + }, + { + "auxiliary_loss_clip": 0.01087514, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.03244555, + "balance_loss_mlp": 1.01914907, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.5266229085210057, + "language_loss": 0.79946637, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82064533, + "num_input_tokens_seen": 228582550, + "step": 10601, + "time_per_iteration": 2.5841174125671387 + }, + { + "auxiliary_loss_clip": 0.01069523, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.03385901, + "balance_loss_mlp": 1.0175612, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 2.0490745892844013, + "language_loss": 0.67236269, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69334722, + "num_input_tokens_seen": 228604960, + "step": 10602, + "time_per_iteration": 2.626803398132324 + }, + { + "auxiliary_loss_clip": 0.01019285, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.02757096, + "balance_loss_mlp": 1.01623976, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.66709378869357, + "language_loss": 0.79755014, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81802368, + "num_input_tokens_seen": 228622195, + "step": 10603, + "time_per_iteration": 2.891356945037842 + }, + { + "auxiliary_loss_clip": 0.01058974, + "auxiliary_loss_mlp": 0.00749322, + "balance_loss_clip": 1.03307366, + "balance_loss_mlp": 1.00026894, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 2.286406764995719, + "language_loss": 0.76757675, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78565967, + "num_input_tokens_seen": 228639735, + "step": 10604, + "time_per_iteration": 2.7888896465301514 + }, + { + "auxiliary_loss_clip": 0.01082024, + "auxiliary_loss_mlp": 0.01031799, + "balance_loss_clip": 1.03269589, + "balance_loss_mlp": 1.01953816, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.6002237953261456, + "language_loss": 0.76796728, + "learning_rate": 1.226409972197281e-06, + "loss": 0.78910553, + "num_input_tokens_seen": 228658195, + "step": 10605, + "time_per_iteration": 2.586054563522339 + }, + { + "auxiliary_loss_clip": 0.01039738, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.03106856, + "balance_loss_mlp": 1.01864243, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 2.3529205111366527, + "language_loss": 0.6588223, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67955804, + "num_input_tokens_seen": 228677415, + "step": 10606, + "time_per_iteration": 4.258561849594116 + }, + { + "auxiliary_loss_clip": 0.01074991, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.03349471, + "balance_loss_mlp": 1.02135825, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.5294793753059368, + "language_loss": 0.75402433, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77509296, + "num_input_tokens_seen": 228696450, + "step": 10607, + "time_per_iteration": 2.5863540172576904 + }, + { + "auxiliary_loss_clip": 0.01079028, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.03539968, + "balance_loss_mlp": 1.02191806, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.7724379596058326, + "language_loss": 0.65711993, + "learning_rate": 1.225332659627278e-06, + "loss": 0.67824686, + "num_input_tokens_seen": 228721600, + "step": 10608, + "time_per_iteration": 2.962078094482422 + }, + { + "auxiliary_loss_clip": 0.00975046, + "auxiliary_loss_mlp": 0.01000243, + "balance_loss_clip": 1.01304317, + "balance_loss_mlp": 0.99887246, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7888558875236389, + "language_loss": 0.51912254, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53887546, + "num_input_tokens_seen": 228784535, + "step": 10609, + "time_per_iteration": 3.5450220108032227 + }, + { + "auxiliary_loss_clip": 0.01085282, + "auxiliary_loss_mlp": 0.01023361, + "balance_loss_clip": 1.03163564, + "balance_loss_mlp": 1.0130012, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.5462301983105948, + "language_loss": 0.74828446, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76937091, + "num_input_tokens_seen": 228804110, + "step": 10610, + "time_per_iteration": 3.2531847953796387 + }, + { + "auxiliary_loss_clip": 0.01004488, + "auxiliary_loss_mlp": 0.01002017, + "balance_loss_clip": 1.00403965, + "balance_loss_mlp": 1.00102162, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8437165012855758, + "language_loss": 0.63173819, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65180326, + "num_input_tokens_seen": 228867705, + "step": 10611, + "time_per_iteration": 3.2254629135131836 + }, + { + "auxiliary_loss_clip": 0.01088107, + "auxiliary_loss_mlp": 0.01031621, + "balance_loss_clip": 1.03431702, + "balance_loss_mlp": 1.01981902, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 2.0886167558532716, + "language_loss": 0.72276199, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74395931, + "num_input_tokens_seen": 228889215, + "step": 10612, + "time_per_iteration": 2.6405208110809326 + }, + { + "auxiliary_loss_clip": 0.0101074, + "auxiliary_loss_mlp": 0.0099892, + "balance_loss_clip": 1.00891471, + "balance_loss_mlp": 0.99779379, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7138323392987722, + "language_loss": 0.57863796, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.5987345, + "num_input_tokens_seen": 228948465, + "step": 10613, + "time_per_iteration": 3.134366273880005 + }, + { + "auxiliary_loss_clip": 0.01054447, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.03125334, + "balance_loss_mlp": 1.01855648, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.7307009785244556, + "language_loss": 0.75462234, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77547717, + "num_input_tokens_seen": 228967955, + "step": 10614, + "time_per_iteration": 2.746138334274292 + }, + { + "auxiliary_loss_clip": 0.01080818, + "auxiliary_loss_mlp": 0.00749332, + "balance_loss_clip": 1.03617704, + "balance_loss_mlp": 1.00023472, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.795048985999116, + "language_loss": 0.79859787, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81689936, + "num_input_tokens_seen": 228985495, + "step": 10615, + "time_per_iteration": 2.685659170150757 + }, + { + "auxiliary_loss_clip": 0.01007151, + "auxiliary_loss_mlp": 0.01002531, + "balance_loss_clip": 1.0085187, + "balance_loss_mlp": 1.00135064, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6601400643763026, + "language_loss": 0.55636573, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57646263, + "num_input_tokens_seen": 229052995, + "step": 10616, + "time_per_iteration": 3.249549150466919 + }, + { + "auxiliary_loss_clip": 0.01069715, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.03156722, + "balance_loss_mlp": 1.02029157, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.8362527114209337, + "language_loss": 0.84212601, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86315113, + "num_input_tokens_seen": 229071030, + "step": 10617, + "time_per_iteration": 2.6080307960510254 + }, + { + "auxiliary_loss_clip": 0.01091708, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.03478396, + "balance_loss_mlp": 1.02554035, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.7554012302129167, + "language_loss": 0.86659288, + "learning_rate": 1.221743529196936e-06, + "loss": 0.88789213, + "num_input_tokens_seen": 229088275, + "step": 10618, + "time_per_iteration": 2.569408416748047 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.03660238, + "balance_loss_mlp": 1.02269757, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 2.749423202546277, + "language_loss": 0.73302734, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75390476, + "num_input_tokens_seen": 229105190, + "step": 10619, + "time_per_iteration": 2.763613224029541 + }, + { + "auxiliary_loss_clip": 0.01078053, + "auxiliary_loss_mlp": 0.01037073, + "balance_loss_clip": 1.03386247, + "balance_loss_mlp": 1.02373362, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 2.113146042046671, + "language_loss": 0.76398528, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78513652, + "num_input_tokens_seen": 229122290, + "step": 10620, + "time_per_iteration": 2.5803043842315674 + }, + { + "auxiliary_loss_clip": 0.01077697, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.03444433, + "balance_loss_mlp": 1.01586831, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 3.03238166656388, + "language_loss": 0.7020117, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.7230581, + "num_input_tokens_seen": 229141620, + "step": 10621, + "time_per_iteration": 2.648430824279785 + }, + { + "auxiliary_loss_clip": 0.01071351, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.0305115, + "balance_loss_mlp": 1.01728308, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.596870470352075, + "language_loss": 0.77813506, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79912424, + "num_input_tokens_seen": 229161570, + "step": 10622, + "time_per_iteration": 4.11418080329895 + }, + { + "auxiliary_loss_clip": 0.01059614, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.03194833, + "balance_loss_mlp": 1.02176726, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.8542971963438806, + "language_loss": 0.74801165, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.76893836, + "num_input_tokens_seen": 229178465, + "step": 10623, + "time_per_iteration": 2.642799139022827 + }, + { + "auxiliary_loss_clip": 0.01075873, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.03271306, + "balance_loss_mlp": 1.02001882, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.3737911550546065, + "language_loss": 0.76485252, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78591418, + "num_input_tokens_seen": 229198975, + "step": 10624, + "time_per_iteration": 2.620746612548828 + }, + { + "auxiliary_loss_clip": 0.01035175, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.03014851, + "balance_loss_mlp": 1.02373433, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.65403149842201, + "language_loss": 0.80249023, + "learning_rate": 1.21923289302382e-06, + "loss": 0.8232004, + "num_input_tokens_seen": 229218825, + "step": 10625, + "time_per_iteration": 2.6523120403289795 + }, + { + "auxiliary_loss_clip": 0.01080801, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.03846335, + "balance_loss_mlp": 1.01934266, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 2.293119958379724, + "language_loss": 0.72905612, + "learning_rate": 1.218874349031654e-06, + "loss": 0.75017214, + "num_input_tokens_seen": 229236060, + "step": 10626, + "time_per_iteration": 2.613217830657959 + }, + { + "auxiliary_loss_clip": 0.01078388, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.03242683, + "balance_loss_mlp": 1.02363837, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.8156723357888125, + "language_loss": 0.72635823, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74749446, + "num_input_tokens_seen": 229255160, + "step": 10627, + "time_per_iteration": 2.539393186569214 + }, + { + "auxiliary_loss_clip": 0.01079539, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.03749657, + "balance_loss_mlp": 1.01512122, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 1.8324560038607913, + "language_loss": 0.66656172, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.68764055, + "num_input_tokens_seen": 229278705, + "step": 10628, + "time_per_iteration": 2.663139820098877 + }, + { + "auxiliary_loss_clip": 0.01096588, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.03403115, + "balance_loss_mlp": 1.016927, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.7871209597958277, + "language_loss": 0.68072766, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.70196891, + "num_input_tokens_seen": 229299990, + "step": 10629, + "time_per_iteration": 2.544264554977417 + }, + { + "auxiliary_loss_clip": 0.01061335, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.0328474, + "balance_loss_mlp": 1.02179432, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.5342177877844083, + "language_loss": 0.75504547, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77600938, + "num_input_tokens_seen": 229319230, + "step": 10630, + "time_per_iteration": 2.653212308883667 + }, + { + "auxiliary_loss_clip": 0.01069576, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.03007638, + "balance_loss_mlp": 1.0234139, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.5162424213773262, + "language_loss": 0.70439529, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72542989, + "num_input_tokens_seen": 229338600, + "step": 10631, + "time_per_iteration": 2.5835120677948 + }, + { + "auxiliary_loss_clip": 0.01006494, + "auxiliary_loss_mlp": 0.01015182, + "balance_loss_clip": 1.00870752, + "balance_loss_mlp": 1.01381135, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7888114592581785, + "language_loss": 0.63010621, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.65032291, + "num_input_tokens_seen": 229402420, + "step": 10632, + "time_per_iteration": 3.176847457885742 + }, + { + "auxiliary_loss_clip": 0.01076082, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.0328846, + "balance_loss_mlp": 1.02074814, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 1.9445594725333248, + "language_loss": 0.6674158, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68849599, + "num_input_tokens_seen": 229419185, + "step": 10633, + "time_per_iteration": 4.264237642288208 + }, + { + "auxiliary_loss_clip": 0.01037787, + "auxiliary_loss_mlp": 0.01027116, + "balance_loss_clip": 1.03488183, + "balance_loss_mlp": 1.01585031, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 1.8621961069013324, + "language_loss": 0.81963921, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84028816, + "num_input_tokens_seen": 229436735, + "step": 10634, + "time_per_iteration": 2.744370937347412 + }, + { + "auxiliary_loss_clip": 0.01078444, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.03582799, + "balance_loss_mlp": 1.02200794, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.6105081140924091, + "language_loss": 0.74756241, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.76869059, + "num_input_tokens_seen": 229455595, + "step": 10635, + "time_per_iteration": 2.5812456607818604 + }, + { + "auxiliary_loss_clip": 0.01085022, + "auxiliary_loss_mlp": 0.01029174, + "balance_loss_clip": 1.03450394, + "balance_loss_mlp": 1.01776004, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.684224130145429, + "language_loss": 0.71607482, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73721683, + "num_input_tokens_seen": 229476230, + "step": 10636, + "time_per_iteration": 2.565913200378418 + }, + { + "auxiliary_loss_clip": 0.01080986, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.03469229, + "balance_loss_mlp": 1.02073514, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 1.7462170730342912, + "language_loss": 0.73362458, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75476408, + "num_input_tokens_seen": 229494300, + "step": 10637, + "time_per_iteration": 4.134231805801392 + }, + { + "auxiliary_loss_clip": 0.01086093, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.03411138, + "balance_loss_mlp": 1.01858878, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.7751504169220798, + "language_loss": 0.78223324, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.80339789, + "num_input_tokens_seen": 229512985, + "step": 10638, + "time_per_iteration": 2.589346408843994 + }, + { + "auxiliary_loss_clip": 0.0107854, + "auxiliary_loss_mlp": 0.01030907, + "balance_loss_clip": 1.03448331, + "balance_loss_mlp": 1.01918244, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 1.4961537649187098, + "language_loss": 0.81645, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83754444, + "num_input_tokens_seen": 229534270, + "step": 10639, + "time_per_iteration": 2.61564302444458 + }, + { + "auxiliary_loss_clip": 0.01006552, + "auxiliary_loss_mlp": 0.01004331, + "balance_loss_clip": 1.00556684, + "balance_loss_mlp": 1.00316834, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8116837905876907, + "language_loss": 0.59050769, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61061651, + "num_input_tokens_seen": 229596455, + "step": 10640, + "time_per_iteration": 3.1171817779541016 + }, + { + "auxiliary_loss_clip": 0.01063429, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.02907419, + "balance_loss_mlp": 1.01571631, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.8398257519389272, + "language_loss": 0.78509265, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80599141, + "num_input_tokens_seen": 229612860, + "step": 10641, + "time_per_iteration": 2.5245327949523926 + }, + { + "auxiliary_loss_clip": 0.01053515, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.03187871, + "balance_loss_mlp": 1.01893365, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.9183534936248146, + "language_loss": 0.62964344, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65048802, + "num_input_tokens_seen": 229633960, + "step": 10642, + "time_per_iteration": 2.657534599304199 + }, + { + "auxiliary_loss_clip": 0.00994119, + "auxiliary_loss_mlp": 0.01012936, + "balance_loss_clip": 1.00496817, + "balance_loss_mlp": 1.01174951, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.9309105924101002, + "language_loss": 0.55986387, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57993436, + "num_input_tokens_seen": 229686730, + "step": 10643, + "time_per_iteration": 3.055265426635742 + }, + { + "auxiliary_loss_clip": 0.01064034, + "auxiliary_loss_mlp": 0.01024894, + "balance_loss_clip": 1.03114665, + "balance_loss_mlp": 1.01300251, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 1.8255024553541634, + "language_loss": 0.76786566, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.78875494, + "num_input_tokens_seen": 229704800, + "step": 10644, + "time_per_iteration": 2.5705769062042236 + }, + { + "auxiliary_loss_clip": 0.01069088, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.03611743, + "balance_loss_mlp": 1.01964891, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.4071805444199055, + "language_loss": 0.82449353, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84549737, + "num_input_tokens_seen": 229725265, + "step": 10645, + "time_per_iteration": 2.656747341156006 + }, + { + "auxiliary_loss_clip": 0.01090326, + "auxiliary_loss_mlp": 0.01040968, + "balance_loss_clip": 1.03321564, + "balance_loss_mlp": 1.02810574, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.9135249059599255, + "language_loss": 0.7392298, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.76054275, + "num_input_tokens_seen": 229744840, + "step": 10646, + "time_per_iteration": 4.207927227020264 + }, + { + "auxiliary_loss_clip": 0.0106031, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.03123689, + "balance_loss_mlp": 1.0173223, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.0314525419746254, + "language_loss": 0.79854286, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.81944597, + "num_input_tokens_seen": 229759095, + "step": 10647, + "time_per_iteration": 2.57478404045105 + }, + { + "auxiliary_loss_clip": 0.01052697, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.03247404, + "balance_loss_mlp": 1.01985931, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.6617745714116343, + "language_loss": 0.7568897, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.77772743, + "num_input_tokens_seen": 229777750, + "step": 10648, + "time_per_iteration": 2.679349660873413 + }, + { + "auxiliary_loss_clip": 0.01070854, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.03199673, + "balance_loss_mlp": 1.01969934, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 2.0380880635605174, + "language_loss": 0.78349435, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80451906, + "num_input_tokens_seen": 229796785, + "step": 10649, + "time_per_iteration": 2.567732810974121 + }, + { + "auxiliary_loss_clip": 0.01046745, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03480387, + "balance_loss_mlp": 1.01919496, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 1.8669322214095065, + "language_loss": 0.75990093, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.78068745, + "num_input_tokens_seen": 229815425, + "step": 10650, + "time_per_iteration": 2.655939817428589 + }, + { + "auxiliary_loss_clip": 0.01100816, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.03450513, + "balance_loss_mlp": 1.01964867, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.6219494383626907, + "language_loss": 0.70849705, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72982109, + "num_input_tokens_seen": 229834545, + "step": 10651, + "time_per_iteration": 2.574126720428467 + }, + { + "auxiliary_loss_clip": 0.01066363, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.03178704, + "balance_loss_mlp": 1.02155626, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.809035438527766, + "language_loss": 0.63769794, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65869921, + "num_input_tokens_seen": 229849175, + "step": 10652, + "time_per_iteration": 2.6821727752685547 + }, + { + "auxiliary_loss_clip": 0.01074172, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.03145564, + "balance_loss_mlp": 1.01496506, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 1.9289376987419193, + "language_loss": 0.79189795, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81290185, + "num_input_tokens_seen": 229865400, + "step": 10653, + "time_per_iteration": 2.5743908882141113 + }, + { + "auxiliary_loss_clip": 0.01071093, + "auxiliary_loss_mlp": 0.01057904, + "balance_loss_clip": 1.03060174, + "balance_loss_mlp": 1.04252577, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 1.9470485808985158, + "language_loss": 0.70117879, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72246873, + "num_input_tokens_seen": 229882945, + "step": 10654, + "time_per_iteration": 2.569899082183838 + }, + { + "auxiliary_loss_clip": 0.01093152, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.03456092, + "balance_loss_mlp": 1.02082205, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.6130032739460145, + "language_loss": 0.72491544, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.74617851, + "num_input_tokens_seen": 229901590, + "step": 10655, + "time_per_iteration": 2.5041677951812744 + }, + { + "auxiliary_loss_clip": 0.0106825, + "auxiliary_loss_mlp": 0.01033227, + "balance_loss_clip": 1.03511298, + "balance_loss_mlp": 1.0213654, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 15.000473643111787, + "language_loss": 0.82680476, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.84781951, + "num_input_tokens_seen": 229922535, + "step": 10656, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.01043879, + "auxiliary_loss_mlp": 0.01032068, + "balance_loss_clip": 1.03093493, + "balance_loss_mlp": 1.02123821, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.2175850351518336, + "language_loss": 0.72311437, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74387383, + "num_input_tokens_seen": 229939575, + "step": 10657, + "time_per_iteration": 2.7105281352996826 + }, + { + "auxiliary_loss_clip": 0.01066447, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.03231251, + "balance_loss_mlp": 1.02228391, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 1.9829354549700133, + "language_loss": 0.77298927, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79398811, + "num_input_tokens_seen": 229958840, + "step": 10658, + "time_per_iteration": 2.6191842555999756 + }, + { + "auxiliary_loss_clip": 0.01103796, + "auxiliary_loss_mlp": 0.01041147, + "balance_loss_clip": 1.0357728, + "balance_loss_mlp": 1.02821898, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 3.9606925296864284, + "language_loss": 0.76400959, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78545904, + "num_input_tokens_seen": 229979680, + "step": 10659, + "time_per_iteration": 2.5143094062805176 + }, + { + "auxiliary_loss_clip": 0.01089345, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.03420973, + "balance_loss_mlp": 1.0193584, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 2.0112784418053455, + "language_loss": 0.77847457, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.79968178, + "num_input_tokens_seen": 229996830, + "step": 10660, + "time_per_iteration": 2.5099334716796875 + }, + { + "auxiliary_loss_clip": 0.01078417, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.03406584, + "balance_loss_mlp": 1.01978254, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 2.068018965662529, + "language_loss": 0.68721515, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70832455, + "num_input_tokens_seen": 230015115, + "step": 10661, + "time_per_iteration": 2.5669894218444824 + }, + { + "auxiliary_loss_clip": 0.01100295, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.03669763, + "balance_loss_mlp": 1.02269316, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.669331544238654, + "language_loss": 0.76136506, + "learning_rate": 1.205986598033362e-06, + "loss": 0.78270197, + "num_input_tokens_seen": 230035515, + "step": 10662, + "time_per_iteration": 4.013808488845825 + }, + { + "auxiliary_loss_clip": 0.01081819, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.03116262, + "balance_loss_mlp": 1.01876903, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 1.811000684970528, + "language_loss": 0.69905907, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.72019136, + "num_input_tokens_seen": 230054355, + "step": 10663, + "time_per_iteration": 2.578136444091797 + }, + { + "auxiliary_loss_clip": 0.01069208, + "auxiliary_loss_mlp": 0.01038621, + "balance_loss_clip": 1.03516603, + "balance_loss_mlp": 1.02500725, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 2.009206290965699, + "language_loss": 0.68068314, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70176136, + "num_input_tokens_seen": 230074605, + "step": 10664, + "time_per_iteration": 2.6640636920928955 + }, + { + "auxiliary_loss_clip": 0.01074672, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.03146315, + "balance_loss_mlp": 1.02022815, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.852520520175523, + "language_loss": 0.66521835, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68627578, + "num_input_tokens_seen": 230093820, + "step": 10665, + "time_per_iteration": 2.5981194972991943 + }, + { + "auxiliary_loss_clip": 0.0109151, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.0354377, + "balance_loss_mlp": 1.02045405, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.5504980833102697, + "language_loss": 0.64314699, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66438293, + "num_input_tokens_seen": 230114285, + "step": 10666, + "time_per_iteration": 2.539816379547119 + }, + { + "auxiliary_loss_clip": 0.0109112, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.03451455, + "balance_loss_mlp": 1.0196631, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.6682830378155453, + "language_loss": 0.71272886, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73395407, + "num_input_tokens_seen": 230132760, + "step": 10667, + "time_per_iteration": 2.5588998794555664 + }, + { + "auxiliary_loss_clip": 0.0104726, + "auxiliary_loss_mlp": 0.00749882, + "balance_loss_clip": 1.02995443, + "balance_loss_mlp": 1.00035453, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.051928953600417, + "language_loss": 0.77630579, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79427719, + "num_input_tokens_seen": 230149690, + "step": 10668, + "time_per_iteration": 2.7028610706329346 + }, + { + "auxiliary_loss_clip": 0.01095568, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.03851223, + "balance_loss_mlp": 1.02439666, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.7720600546294516, + "language_loss": 0.67491609, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69623488, + "num_input_tokens_seen": 230166950, + "step": 10669, + "time_per_iteration": 2.5888020992279053 + }, + { + "auxiliary_loss_clip": 0.01101446, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.03952789, + "balance_loss_mlp": 1.02738047, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 2.0370834416109846, + "language_loss": 0.78277713, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80419159, + "num_input_tokens_seen": 230184785, + "step": 10670, + "time_per_iteration": 2.574361562728882 + }, + { + "auxiliary_loss_clip": 0.01071484, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.03685701, + "balance_loss_mlp": 1.02264798, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 5.396025963219392, + "language_loss": 0.88414836, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90521514, + "num_input_tokens_seen": 230201385, + "step": 10671, + "time_per_iteration": 2.582103729248047 + }, + { + "auxiliary_loss_clip": 0.01091036, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.03595424, + "balance_loss_mlp": 1.01767075, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.7728945360159416, + "language_loss": 0.69102597, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71222115, + "num_input_tokens_seen": 230220380, + "step": 10672, + "time_per_iteration": 2.5639593601226807 + }, + { + "auxiliary_loss_clip": 0.01097871, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.03657293, + "balance_loss_mlp": 1.01824629, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.8525498698790175, + "language_loss": 0.73917019, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76046997, + "num_input_tokens_seen": 230239845, + "step": 10673, + "time_per_iteration": 4.0790112018585205 + }, + { + "auxiliary_loss_clip": 0.01062805, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.03198743, + "balance_loss_mlp": 1.02051008, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 3.611734738254592, + "language_loss": 0.69176972, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71273375, + "num_input_tokens_seen": 230262420, + "step": 10674, + "time_per_iteration": 2.6773507595062256 + }, + { + "auxiliary_loss_clip": 0.01106326, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.03528976, + "balance_loss_mlp": 1.01861918, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 1.8758553403982752, + "language_loss": 0.66778606, + "learning_rate": 1.201342244560338e-06, + "loss": 0.6891613, + "num_input_tokens_seen": 230279950, + "step": 10675, + "time_per_iteration": 2.5555899143218994 + }, + { + "auxiliary_loss_clip": 0.01105829, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.03809345, + "balance_loss_mlp": 1.02599347, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 1.8932984661291874, + "language_loss": 0.66174448, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68318093, + "num_input_tokens_seen": 230299705, + "step": 10676, + "time_per_iteration": 2.579756259918213 + }, + { + "auxiliary_loss_clip": 0.01106747, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.03817821, + "balance_loss_mlp": 1.01934671, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 1.9588941343785276, + "language_loss": 0.75906336, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.78045917, + "num_input_tokens_seen": 230320030, + "step": 10677, + "time_per_iteration": 4.13675332069397 + }, + { + "auxiliary_loss_clip": 0.01004942, + "auxiliary_loss_mlp": 0.01002203, + "balance_loss_clip": 1.00427318, + "balance_loss_mlp": 1.00114202, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7627373441353873, + "language_loss": 0.60722089, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62729234, + "num_input_tokens_seen": 230381495, + "step": 10678, + "time_per_iteration": 3.260075330734253 + }, + { + "auxiliary_loss_clip": 0.010861, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.03410125, + "balance_loss_mlp": 1.02467132, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.915820146081945, + "language_loss": 0.67383945, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69506592, + "num_input_tokens_seen": 230401385, + "step": 10679, + "time_per_iteration": 2.6155154705047607 + }, + { + "auxiliary_loss_clip": 0.01086966, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.03426671, + "balance_loss_mlp": 1.02267313, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.6458850441058517, + "language_loss": 0.72974449, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75098586, + "num_input_tokens_seen": 230421340, + "step": 10680, + "time_per_iteration": 2.6213178634643555 + }, + { + "auxiliary_loss_clip": 0.01066807, + "auxiliary_loss_mlp": 0.01024975, + "balance_loss_clip": 1.03353441, + "balance_loss_mlp": 1.01429987, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.7869430313229124, + "language_loss": 0.67495239, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.69587016, + "num_input_tokens_seen": 230441270, + "step": 10681, + "time_per_iteration": 2.630812644958496 + }, + { + "auxiliary_loss_clip": 0.01100622, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.03488088, + "balance_loss_mlp": 1.01951957, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.9477073603566986, + "language_loss": 0.74773663, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76905453, + "num_input_tokens_seen": 230457455, + "step": 10682, + "time_per_iteration": 2.4676477909088135 + }, + { + "auxiliary_loss_clip": 0.01035737, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.03134751, + "balance_loss_mlp": 1.02140307, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.533365896009854, + "language_loss": 0.79415572, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81483537, + "num_input_tokens_seen": 230478955, + "step": 10683, + "time_per_iteration": 2.733407735824585 + }, + { + "auxiliary_loss_clip": 0.0110471, + "auxiliary_loss_mlp": 0.01036491, + "balance_loss_clip": 1.03637457, + "balance_loss_mlp": 1.02398038, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.6522240281149987, + "language_loss": 0.67159259, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69300461, + "num_input_tokens_seen": 230496425, + "step": 10684, + "time_per_iteration": 2.4682695865631104 + }, + { + "auxiliary_loss_clip": 0.01091129, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.03404033, + "balance_loss_mlp": 1.01852298, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.0270622397775537, + "language_loss": 0.71438611, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73560369, + "num_input_tokens_seen": 230516245, + "step": 10685, + "time_per_iteration": 2.629305839538574 + }, + { + "auxiliary_loss_clip": 0.01064033, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.03175211, + "balance_loss_mlp": 1.02090657, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 2.3368899665656317, + "language_loss": 0.75501871, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77598268, + "num_input_tokens_seen": 230534745, + "step": 10686, + "time_per_iteration": 4.057294845581055 + }, + { + "auxiliary_loss_clip": 0.010712, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.03594553, + "balance_loss_mlp": 1.02107334, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.1388407938996052, + "language_loss": 0.68489712, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70594978, + "num_input_tokens_seen": 230555895, + "step": 10687, + "time_per_iteration": 2.628136157989502 + }, + { + "auxiliary_loss_clip": 0.01076391, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.03398204, + "balance_loss_mlp": 1.02296889, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 1.8511021209049006, + "language_loss": 0.66658902, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68770403, + "num_input_tokens_seen": 230577460, + "step": 10688, + "time_per_iteration": 2.603435754776001 + }, + { + "auxiliary_loss_clip": 0.01101211, + "auxiliary_loss_mlp": 0.01027626, + "balance_loss_clip": 1.03433371, + "balance_loss_mlp": 1.01568079, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.9728993475202978, + "language_loss": 0.72988665, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75117505, + "num_input_tokens_seen": 230595030, + "step": 10689, + "time_per_iteration": 2.446035385131836 + }, + { + "auxiliary_loss_clip": 0.01084034, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.03495979, + "balance_loss_mlp": 1.02243876, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 3.2993426103875496, + "language_loss": 0.72074658, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74192309, + "num_input_tokens_seen": 230615135, + "step": 10690, + "time_per_iteration": 2.5181987285614014 + }, + { + "auxiliary_loss_clip": 0.01068639, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.03108931, + "balance_loss_mlp": 1.0193212, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 7.319673842477303, + "language_loss": 0.77366918, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.79466599, + "num_input_tokens_seen": 230631965, + "step": 10691, + "time_per_iteration": 2.494621753692627 + }, + { + "auxiliary_loss_clip": 0.01080454, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.03440833, + "balance_loss_mlp": 1.01876998, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.6615850257390836, + "language_loss": 0.74825692, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76936281, + "num_input_tokens_seen": 230649565, + "step": 10692, + "time_per_iteration": 2.5139448642730713 + }, + { + "auxiliary_loss_clip": 0.01091925, + "auxiliary_loss_mlp": 0.01035988, + "balance_loss_clip": 1.03563774, + "balance_loss_mlp": 1.02445483, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 3.391238661124453, + "language_loss": 0.61681336, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63809252, + "num_input_tokens_seen": 230669265, + "step": 10693, + "time_per_iteration": 2.5135767459869385 + }, + { + "auxiliary_loss_clip": 0.01064924, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.03372598, + "balance_loss_mlp": 1.01590133, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.886310166056458, + "language_loss": 0.59954041, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.62046617, + "num_input_tokens_seen": 230690575, + "step": 10694, + "time_per_iteration": 2.6393165588378906 + }, + { + "auxiliary_loss_clip": 0.01075854, + "auxiliary_loss_mlp": 0.0103373, + "balance_loss_clip": 1.03465331, + "balance_loss_mlp": 1.02177286, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.5533770270910068, + "language_loss": 0.79983175, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82092762, + "num_input_tokens_seen": 230709420, + "step": 10695, + "time_per_iteration": 2.5878641605377197 + }, + { + "auxiliary_loss_clip": 0.01103322, + "auxiliary_loss_mlp": 0.01037261, + "balance_loss_clip": 1.03525424, + "balance_loss_mlp": 1.02517343, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.9479530041488522, + "language_loss": 0.73417032, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75557613, + "num_input_tokens_seen": 230729350, + "step": 10696, + "time_per_iteration": 2.6135048866271973 + }, + { + "auxiliary_loss_clip": 0.01063101, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.03130412, + "balance_loss_mlp": 1.01720333, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 2.1385687918636287, + "language_loss": 0.75941944, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.78034008, + "num_input_tokens_seen": 230749220, + "step": 10697, + "time_per_iteration": 2.5954432487487793 + }, + { + "auxiliary_loss_clip": 0.01075925, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.03266978, + "balance_loss_mlp": 1.02184057, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.867495698592849, + "language_loss": 0.66375935, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68485212, + "num_input_tokens_seen": 230770245, + "step": 10698, + "time_per_iteration": 2.6960577964782715 + }, + { + "auxiliary_loss_clip": 0.01026443, + "auxiliary_loss_mlp": 0.00999078, + "balance_loss_clip": 1.00539088, + "balance_loss_mlp": 0.99798757, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8301345613901941, + "language_loss": 0.63449192, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65474713, + "num_input_tokens_seen": 230837030, + "step": 10699, + "time_per_iteration": 3.077766180038452 + }, + { + "auxiliary_loss_clip": 0.01090448, + "auxiliary_loss_mlp": 0.01022807, + "balance_loss_clip": 1.03601456, + "balance_loss_mlp": 1.01239407, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 2.0671295974441537, + "language_loss": 0.69431204, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71544462, + "num_input_tokens_seen": 230856845, + "step": 10700, + "time_per_iteration": 2.5569872856140137 + }, + { + "auxiliary_loss_clip": 0.01103653, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.0360961, + "balance_loss_mlp": 1.01631594, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.8874420476689477, + "language_loss": 0.73561686, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75693411, + "num_input_tokens_seen": 230878785, + "step": 10701, + "time_per_iteration": 2.508073568344116 + }, + { + "auxiliary_loss_clip": 0.01092394, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.033741, + "balance_loss_mlp": 1.01861739, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 1.9698957480193913, + "language_loss": 0.82013303, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84137416, + "num_input_tokens_seen": 230895445, + "step": 10702, + "time_per_iteration": 3.966193914413452 + }, + { + "auxiliary_loss_clip": 0.01068158, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.03135037, + "balance_loss_mlp": 1.0276345, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 1.965935694038455, + "language_loss": 0.74839061, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76946872, + "num_input_tokens_seen": 230911375, + "step": 10703, + "time_per_iteration": 2.5409317016601562 + }, + { + "auxiliary_loss_clip": 0.00980726, + "auxiliary_loss_mlp": 0.00998179, + "balance_loss_clip": 1.01224089, + "balance_loss_mlp": 0.99707669, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6845454295592448, + "language_loss": 0.54584819, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56563723, + "num_input_tokens_seen": 230975990, + "step": 10704, + "time_per_iteration": 3.3504743576049805 + }, + { + "auxiliary_loss_clip": 0.01059737, + "auxiliary_loss_mlp": 0.01022534, + "balance_loss_clip": 1.03466046, + "balance_loss_mlp": 1.01206779, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.6512870743312735, + "language_loss": 0.76927614, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79009885, + "num_input_tokens_seen": 230997110, + "step": 10705, + "time_per_iteration": 2.833517074584961 + }, + { + "auxiliary_loss_clip": 0.01066332, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.03247738, + "balance_loss_mlp": 1.0268625, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.678055173883992, + "language_loss": 0.79085732, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81190604, + "num_input_tokens_seen": 231015590, + "step": 10706, + "time_per_iteration": 2.563314199447632 + }, + { + "auxiliary_loss_clip": 0.01054141, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.02793503, + "balance_loss_mlp": 1.0207963, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 2.081340045421352, + "language_loss": 0.79736668, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.81824243, + "num_input_tokens_seen": 231033800, + "step": 10707, + "time_per_iteration": 2.6121280193328857 + }, + { + "auxiliary_loss_clip": 0.01090051, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.03394842, + "balance_loss_mlp": 1.01831269, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.956369813225086, + "language_loss": 0.85961175, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.88080782, + "num_input_tokens_seen": 231053160, + "step": 10708, + "time_per_iteration": 2.5561282634735107 + }, + { + "auxiliary_loss_clip": 0.01054364, + "auxiliary_loss_mlp": 0.01040839, + "balance_loss_clip": 1.03440762, + "balance_loss_mlp": 1.02708209, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.2803593817359045, + "language_loss": 0.65220916, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67316115, + "num_input_tokens_seen": 231069470, + "step": 10709, + "time_per_iteration": 2.6035361289978027 + }, + { + "auxiliary_loss_clip": 0.0110036, + "auxiliary_loss_mlp": 0.01027759, + "balance_loss_clip": 1.03430164, + "balance_loss_mlp": 1.01644003, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 1.8272495268739057, + "language_loss": 0.80466533, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.82594657, + "num_input_tokens_seen": 231088205, + "step": 10710, + "time_per_iteration": 2.507126808166504 + }, + { + "auxiliary_loss_clip": 0.01088024, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.03196442, + "balance_loss_mlp": 1.01612401, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 2.5460213426638547, + "language_loss": 0.66149253, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68264592, + "num_input_tokens_seen": 231107850, + "step": 10711, + "time_per_iteration": 2.623960018157959 + }, + { + "auxiliary_loss_clip": 0.01065155, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.0376029, + "balance_loss_mlp": 1.0211184, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.618376291827065, + "language_loss": 0.78931189, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.81029236, + "num_input_tokens_seen": 231127200, + "step": 10712, + "time_per_iteration": 2.6292707920074463 + }, + { + "auxiliary_loss_clip": 0.01087329, + "auxiliary_loss_mlp": 0.01033473, + "balance_loss_clip": 1.03339636, + "balance_loss_mlp": 1.02174294, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.7260676895059268, + "language_loss": 0.8251667, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84637469, + "num_input_tokens_seen": 231146360, + "step": 10713, + "time_per_iteration": 4.166895389556885 + }, + { + "auxiliary_loss_clip": 0.01098453, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.03526258, + "balance_loss_mlp": 1.02424991, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.6057409591251384, + "language_loss": 0.78348672, + "learning_rate": 1.187440012188684e-06, + "loss": 0.8048225, + "num_input_tokens_seen": 231168350, + "step": 10714, + "time_per_iteration": 2.5292792320251465 + }, + { + "auxiliary_loss_clip": 0.01067745, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.0343436, + "balance_loss_mlp": 1.01775658, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.4434115130356613, + "language_loss": 0.81702763, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83798897, + "num_input_tokens_seen": 231188385, + "step": 10715, + "time_per_iteration": 2.6305477619171143 + }, + { + "auxiliary_loss_clip": 0.01068805, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.02973306, + "balance_loss_mlp": 1.01848435, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 2.415153728898465, + "language_loss": 0.8157258, + "learning_rate": 1.186728333672332e-06, + "loss": 0.8367157, + "num_input_tokens_seen": 231209880, + "step": 10716, + "time_per_iteration": 2.612056016921997 + }, + { + "auxiliary_loss_clip": 0.01068009, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.03329039, + "balance_loss_mlp": 1.02583075, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.7005581164344867, + "language_loss": 0.78278685, + "learning_rate": 1.186372540666424e-06, + "loss": 0.8038584, + "num_input_tokens_seen": 231230765, + "step": 10717, + "time_per_iteration": 4.192925453186035 + }, + { + "auxiliary_loss_clip": 0.01098417, + "auxiliary_loss_mlp": 0.0103093, + "balance_loss_clip": 1.03511429, + "balance_loss_mlp": 1.01971281, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.8476124385973336, + "language_loss": 0.68489385, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70618737, + "num_input_tokens_seen": 231252350, + "step": 10718, + "time_per_iteration": 2.513195276260376 + }, + { + "auxiliary_loss_clip": 0.01015467, + "auxiliary_loss_mlp": 0.01005597, + "balance_loss_clip": 1.00456786, + "balance_loss_mlp": 1.00461388, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7803347435565725, + "language_loss": 0.49613345, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51634407, + "num_input_tokens_seen": 231313865, + "step": 10719, + "time_per_iteration": 3.222029447555542 + }, + { + "auxiliary_loss_clip": 0.0110545, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.03714514, + "balance_loss_mlp": 1.02487659, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 1.921556313571357, + "language_loss": 0.77930176, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80073261, + "num_input_tokens_seen": 231331710, + "step": 10720, + "time_per_iteration": 2.4845898151397705 + }, + { + "auxiliary_loss_clip": 0.01080181, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.03462911, + "balance_loss_mlp": 1.01796734, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 1.9017206239691273, + "language_loss": 0.77281076, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79391468, + "num_input_tokens_seen": 231350705, + "step": 10721, + "time_per_iteration": 2.559081554412842 + }, + { + "auxiliary_loss_clip": 0.01064772, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.03598833, + "balance_loss_mlp": 1.01963735, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 1.8550151306648424, + "language_loss": 0.72536218, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.74632436, + "num_input_tokens_seen": 231369550, + "step": 10722, + "time_per_iteration": 2.681723117828369 + }, + { + "auxiliary_loss_clip": 0.01100382, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.03555846, + "balance_loss_mlp": 1.01831996, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.54365065232814, + "language_loss": 0.77881455, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80011475, + "num_input_tokens_seen": 231389285, + "step": 10723, + "time_per_iteration": 2.567713499069214 + }, + { + "auxiliary_loss_clip": 0.01089882, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.03342485, + "balance_loss_mlp": 1.02060401, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.5559618223581941, + "language_loss": 0.58350337, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60472894, + "num_input_tokens_seen": 231408820, + "step": 10724, + "time_per_iteration": 2.595956802368164 + }, + { + "auxiliary_loss_clip": 0.0108735, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.03439915, + "balance_loss_mlp": 1.02047288, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.9339403391055314, + "language_loss": 0.83762699, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85880983, + "num_input_tokens_seen": 231428100, + "step": 10725, + "time_per_iteration": 2.5540244579315186 + }, + { + "auxiliary_loss_clip": 0.01073266, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.0298214, + "balance_loss_mlp": 1.022156, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 2.264414309612061, + "language_loss": 0.82052976, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84160411, + "num_input_tokens_seen": 231445810, + "step": 10726, + "time_per_iteration": 2.638833999633789 + }, + { + "auxiliary_loss_clip": 0.01084622, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.03367066, + "balance_loss_mlp": 1.0211091, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 2.2557296627895194, + "language_loss": 0.81616688, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.837345, + "num_input_tokens_seen": 231463570, + "step": 10727, + "time_per_iteration": 3.949965000152588 + }, + { + "auxiliary_loss_clip": 0.01098899, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03725743, + "balance_loss_mlp": 1.02007532, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.558139671859646, + "language_loss": 0.79130769, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81262314, + "num_input_tokens_seen": 231482155, + "step": 10728, + "time_per_iteration": 2.523301839828491 + }, + { + "auxiliary_loss_clip": 0.00999484, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.02757454, + "balance_loss_mlp": 1.02047634, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.9197842757942916, + "language_loss": 0.7455048, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76583523, + "num_input_tokens_seen": 231502465, + "step": 10729, + "time_per_iteration": 2.9159603118896484 + }, + { + "auxiliary_loss_clip": 0.01057681, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.03454101, + "balance_loss_mlp": 1.02079773, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.7993934936036633, + "language_loss": 0.66500008, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68590605, + "num_input_tokens_seen": 231522740, + "step": 10730, + "time_per_iteration": 2.8193860054016113 + }, + { + "auxiliary_loss_clip": 0.01034243, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.03147006, + "balance_loss_mlp": 1.01845264, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.4967869807037875, + "language_loss": 0.6344853, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65514374, + "num_input_tokens_seen": 231542050, + "step": 10731, + "time_per_iteration": 2.7274959087371826 + }, + { + "auxiliary_loss_clip": 0.01098291, + "auxiliary_loss_mlp": 0.01029787, + "balance_loss_clip": 1.03327894, + "balance_loss_mlp": 1.01812863, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.6820726766394283, + "language_loss": 0.67539006, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.69667089, + "num_input_tokens_seen": 231560380, + "step": 10732, + "time_per_iteration": 2.5796892642974854 + }, + { + "auxiliary_loss_clip": 0.01089502, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.03475881, + "balance_loss_mlp": 1.02266455, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.360378547968568, + "language_loss": 0.75547153, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77670163, + "num_input_tokens_seen": 231580810, + "step": 10733, + "time_per_iteration": 2.5335395336151123 + }, + { + "auxiliary_loss_clip": 0.01087019, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.03386819, + "balance_loss_mlp": 1.0237236, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 1.9162214720479323, + "language_loss": 0.66080701, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.68203902, + "num_input_tokens_seen": 231600585, + "step": 10734, + "time_per_iteration": 2.5685627460479736 + }, + { + "auxiliary_loss_clip": 0.01098396, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.03594971, + "balance_loss_mlp": 1.02552533, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 2.0560866710730874, + "language_loss": 0.73505306, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.75640309, + "num_input_tokens_seen": 231618765, + "step": 10735, + "time_per_iteration": 2.4585304260253906 + }, + { + "auxiliary_loss_clip": 0.01032866, + "auxiliary_loss_mlp": 0.00749383, + "balance_loss_clip": 1.03008556, + "balance_loss_mlp": 1.00028133, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.801645871994604, + "language_loss": 0.74827039, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.7660929, + "num_input_tokens_seen": 231638525, + "step": 10736, + "time_per_iteration": 2.715890884399414 + }, + { + "auxiliary_loss_clip": 0.01094095, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.03658736, + "balance_loss_mlp": 1.01592553, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 1.999770091545354, + "language_loss": 0.70140707, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72262752, + "num_input_tokens_seen": 231656785, + "step": 10737, + "time_per_iteration": 2.5322787761688232 + }, + { + "auxiliary_loss_clip": 0.01014191, + "auxiliary_loss_mlp": 0.00999524, + "balance_loss_clip": 1.00315309, + "balance_loss_mlp": 0.99853498, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7832645533824367, + "language_loss": 0.58468109, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60481817, + "num_input_tokens_seen": 231719075, + "step": 10738, + "time_per_iteration": 3.1705727577209473 + }, + { + "auxiliary_loss_clip": 0.01066634, + "auxiliary_loss_mlp": 0.01028191, + "balance_loss_clip": 1.03405213, + "balance_loss_mlp": 1.01640749, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.742478114984508, + "language_loss": 0.74437213, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76532042, + "num_input_tokens_seen": 231737810, + "step": 10739, + "time_per_iteration": 2.6488234996795654 + }, + { + "auxiliary_loss_clip": 0.01081754, + "auxiliary_loss_mlp": 0.00749465, + "balance_loss_clip": 1.03474236, + "balance_loss_mlp": 1.00034571, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 2.0807720735772355, + "language_loss": 0.71289885, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.73121107, + "num_input_tokens_seen": 231756140, + "step": 10740, + "time_per_iteration": 2.6034233570098877 + }, + { + "auxiliary_loss_clip": 0.01005379, + "auxiliary_loss_mlp": 0.01010274, + "balance_loss_clip": 1.00320077, + "balance_loss_mlp": 1.00915933, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6697761536621226, + "language_loss": 0.55276358, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57292008, + "num_input_tokens_seen": 231823665, + "step": 10741, + "time_per_iteration": 3.1824965476989746 + }, + { + "auxiliary_loss_clip": 0.01098515, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.03386962, + "balance_loss_mlp": 1.01967835, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 1.642865352236254, + "language_loss": 0.80465049, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82594043, + "num_input_tokens_seen": 231844500, + "step": 10742, + "time_per_iteration": 2.550814628601074 + }, + { + "auxiliary_loss_clip": 0.01072679, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.03086591, + "balance_loss_mlp": 1.0198859, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.7303028678928378, + "language_loss": 0.81822032, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.83925861, + "num_input_tokens_seen": 231864510, + "step": 10743, + "time_per_iteration": 4.0943121910095215 + }, + { + "auxiliary_loss_clip": 0.01074562, + "auxiliary_loss_mlp": 0.01028112, + "balance_loss_clip": 1.03264403, + "balance_loss_mlp": 1.01659608, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.8014444571523587, + "language_loss": 0.71938598, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74041265, + "num_input_tokens_seen": 231881555, + "step": 10744, + "time_per_iteration": 2.5675249099731445 + }, + { + "auxiliary_loss_clip": 0.01099756, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.03381312, + "balance_loss_mlp": 1.01462591, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.6791682566303545, + "language_loss": 0.66355371, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68480992, + "num_input_tokens_seen": 231905945, + "step": 10745, + "time_per_iteration": 2.6887402534484863 + }, + { + "auxiliary_loss_clip": 0.01084459, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.03239918, + "balance_loss_mlp": 1.02121711, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 4.818469289394184, + "language_loss": 0.73479795, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75596905, + "num_input_tokens_seen": 231922535, + "step": 10746, + "time_per_iteration": 2.5178604125976562 + }, + { + "auxiliary_loss_clip": 0.01090134, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.03407073, + "balance_loss_mlp": 1.01870561, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.4772150060553506, + "language_loss": 0.66414475, + "learning_rate": 1.175713157660413e-06, + "loss": 0.68534017, + "num_input_tokens_seen": 231944800, + "step": 10747, + "time_per_iteration": 2.5605461597442627 + }, + { + "auxiliary_loss_clip": 0.01068712, + "auxiliary_loss_mlp": 0.01037657, + "balance_loss_clip": 1.03370214, + "balance_loss_mlp": 1.02621901, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 2.057252018471356, + "language_loss": 0.67535818, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69642186, + "num_input_tokens_seen": 231962970, + "step": 10748, + "time_per_iteration": 2.581582546234131 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.03565085, + "balance_loss_mlp": 1.02943277, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 3.6470938090040783, + "language_loss": 0.76145923, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78291667, + "num_input_tokens_seen": 231981195, + "step": 10749, + "time_per_iteration": 2.4635465145111084 + }, + { + "auxiliary_loss_clip": 0.01056807, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02827585, + "balance_loss_mlp": 1.02353132, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.4938269070392385, + "language_loss": 0.76936144, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79029059, + "num_input_tokens_seen": 232001735, + "step": 10750, + "time_per_iteration": 2.653017997741699 + }, + { + "auxiliary_loss_clip": 0.01082513, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.0335691, + "balance_loss_mlp": 1.01806056, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.0579817167643206, + "language_loss": 0.68512011, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70625538, + "num_input_tokens_seen": 232019830, + "step": 10751, + "time_per_iteration": 2.6209394931793213 + }, + { + "auxiliary_loss_clip": 0.0107304, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.03321171, + "balance_loss_mlp": 1.01569009, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 3.9764072891857873, + "language_loss": 0.70729041, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.72829592, + "num_input_tokens_seen": 232039625, + "step": 10752, + "time_per_iteration": 2.5465619564056396 + }, + { + "auxiliary_loss_clip": 0.01068221, + "auxiliary_loss_mlp": 0.01041077, + "balance_loss_clip": 1.03192401, + "balance_loss_mlp": 1.02675343, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.8346222106624581, + "language_loss": 0.78563803, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80673099, + "num_input_tokens_seen": 232055855, + "step": 10753, + "time_per_iteration": 4.215310335159302 + }, + { + "auxiliary_loss_clip": 0.01100646, + "auxiliary_loss_mlp": 0.01040433, + "balance_loss_clip": 1.03542924, + "balance_loss_mlp": 1.02848232, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.6588121757680971, + "language_loss": 0.84933019, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87074095, + "num_input_tokens_seen": 232073475, + "step": 10754, + "time_per_iteration": 2.5837905406951904 + }, + { + "auxiliary_loss_clip": 0.01072728, + "auxiliary_loss_mlp": 0.01036916, + "balance_loss_clip": 1.03142345, + "balance_loss_mlp": 1.02505457, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 2.1055091020579684, + "language_loss": 0.59026957, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61136591, + "num_input_tokens_seen": 232091090, + "step": 10755, + "time_per_iteration": 2.593841075897217 + }, + { + "auxiliary_loss_clip": 0.01052453, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.03127503, + "balance_loss_mlp": 1.02109289, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 2.4133195809070953, + "language_loss": 0.68317902, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70403677, + "num_input_tokens_seen": 232107320, + "step": 10756, + "time_per_iteration": 2.653226137161255 + }, + { + "auxiliary_loss_clip": 0.01060756, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.03509235, + "balance_loss_mlp": 1.02732563, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 2.4112781771970364, + "language_loss": 0.73681998, + "learning_rate": 1.172166263444844e-06, + "loss": 0.75783187, + "num_input_tokens_seen": 232123930, + "step": 10757, + "time_per_iteration": 4.241817235946655 + }, + { + "auxiliary_loss_clip": 0.01049371, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.03498626, + "balance_loss_mlp": 1.01874256, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.6033712094755728, + "language_loss": 0.74518472, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76598257, + "num_input_tokens_seen": 232142905, + "step": 10758, + "time_per_iteration": 2.6782689094543457 + }, + { + "auxiliary_loss_clip": 0.01063708, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.03421569, + "balance_loss_mlp": 1.01825058, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.647120365967, + "language_loss": 0.67958957, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.70053935, + "num_input_tokens_seen": 232162230, + "step": 10759, + "time_per_iteration": 2.569899320602417 + }, + { + "auxiliary_loss_clip": 0.01062708, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.02085543, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.6037946576992317, + "language_loss": 0.75515717, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.7761144, + "num_input_tokens_seen": 232182700, + "step": 10760, + "time_per_iteration": 2.656963348388672 + }, + { + "auxiliary_loss_clip": 0.01070769, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.03059483, + "balance_loss_mlp": 1.02033234, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.5148669003243038, + "language_loss": 0.65007246, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67111075, + "num_input_tokens_seen": 232208235, + "step": 10761, + "time_per_iteration": 2.8013570308685303 + }, + { + "auxiliary_loss_clip": 0.01056517, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.03488088, + "balance_loss_mlp": 1.01779795, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 2.019222824337599, + "language_loss": 0.69795859, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71882468, + "num_input_tokens_seen": 232228720, + "step": 10762, + "time_per_iteration": 2.6926331520080566 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.03545618, + "balance_loss_mlp": 1.02207923, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 1.7636077449157517, + "language_loss": 0.82863837, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.85001534, + "num_input_tokens_seen": 232244655, + "step": 10763, + "time_per_iteration": 2.4596731662750244 + }, + { + "auxiliary_loss_clip": 0.01024762, + "auxiliary_loss_mlp": 0.01000402, + "balance_loss_clip": 1.00404024, + "balance_loss_mlp": 0.99948436, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7104760565394725, + "language_loss": 0.57824719, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59849888, + "num_input_tokens_seen": 232308685, + "step": 10764, + "time_per_iteration": 3.279271125793457 + }, + { + "auxiliary_loss_clip": 0.01063409, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.03303337, + "balance_loss_mlp": 1.01931572, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 1.9202107614160777, + "language_loss": 0.60607123, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62701619, + "num_input_tokens_seen": 232327520, + "step": 10765, + "time_per_iteration": 2.794229745864868 + }, + { + "auxiliary_loss_clip": 0.01099724, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.03472447, + "balance_loss_mlp": 1.01655972, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.8582997525386575, + "language_loss": 0.63042057, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65169752, + "num_input_tokens_seen": 232349025, + "step": 10766, + "time_per_iteration": 2.567185163497925 + }, + { + "auxiliary_loss_clip": 0.0107275, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.03239417, + "balance_loss_mlp": 1.02235222, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.5591478254753264, + "language_loss": 0.75719619, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77828074, + "num_input_tokens_seen": 232367835, + "step": 10767, + "time_per_iteration": 4.1760149002075195 + }, + { + "auxiliary_loss_clip": 0.01081999, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.03359699, + "balance_loss_mlp": 1.0218178, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 4.918989963051105, + "language_loss": 0.77780896, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79896402, + "num_input_tokens_seen": 232385840, + "step": 10768, + "time_per_iteration": 2.532960891723633 + }, + { + "auxiliary_loss_clip": 0.01038562, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.0319736, + "balance_loss_mlp": 1.01735353, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.636473045273602, + "language_loss": 0.71876955, + "learning_rate": 1.167914135250663e-06, + "loss": 0.73944932, + "num_input_tokens_seen": 232406205, + "step": 10769, + "time_per_iteration": 2.6979317665100098 + }, + { + "auxiliary_loss_clip": 0.01098252, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.0352571, + "balance_loss_mlp": 1.02222013, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.977429604204388, + "language_loss": 0.71836793, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.73968101, + "num_input_tokens_seen": 232424995, + "step": 10770, + "time_per_iteration": 2.5458216667175293 + }, + { + "auxiliary_loss_clip": 0.01058566, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.03231478, + "balance_loss_mlp": 1.01596117, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.607070168856383, + "language_loss": 0.73362339, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75449336, + "num_input_tokens_seen": 232445870, + "step": 10771, + "time_per_iteration": 2.758859157562256 + }, + { + "auxiliary_loss_clip": 0.01057394, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.03045082, + "balance_loss_mlp": 1.02066159, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 1.8705566991474805, + "language_loss": 0.74224985, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76315022, + "num_input_tokens_seen": 232464285, + "step": 10772, + "time_per_iteration": 2.6046652793884277 + }, + { + "auxiliary_loss_clip": 0.01074358, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.0326407, + "balance_loss_mlp": 1.01899791, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.6506290958418335, + "language_loss": 0.83036071, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85140014, + "num_input_tokens_seen": 232485815, + "step": 10773, + "time_per_iteration": 2.6231279373168945 + }, + { + "auxiliary_loss_clip": 0.01085821, + "auxiliary_loss_mlp": 0.00749261, + "balance_loss_clip": 1.03298473, + "balance_loss_mlp": 1.00038171, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.9000276274531516, + "language_loss": 0.78410828, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80245912, + "num_input_tokens_seen": 232504875, + "step": 10774, + "time_per_iteration": 2.5319764614105225 + }, + { + "auxiliary_loss_clip": 0.0109245, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.03531337, + "balance_loss_mlp": 1.02301633, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.096459732276695, + "language_loss": 0.68980575, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71107996, + "num_input_tokens_seen": 232521945, + "step": 10775, + "time_per_iteration": 2.513213872909546 + }, + { + "auxiliary_loss_clip": 0.0106181, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.03196192, + "balance_loss_mlp": 1.02659082, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.8090756239742798, + "language_loss": 0.65841675, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.6794228, + "num_input_tokens_seen": 232541500, + "step": 10776, + "time_per_iteration": 2.6920549869537354 + }, + { + "auxiliary_loss_clip": 0.01070791, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.03243494, + "balance_loss_mlp": 1.02121222, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.6729999460047256, + "language_loss": 0.78679764, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.80784798, + "num_input_tokens_seen": 232559720, + "step": 10777, + "time_per_iteration": 2.560032844543457 + }, + { + "auxiliary_loss_clip": 0.01089747, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.03611755, + "balance_loss_mlp": 1.01746988, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 3.134453315005743, + "language_loss": 0.73337793, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75457036, + "num_input_tokens_seen": 232579370, + "step": 10778, + "time_per_iteration": 2.5410094261169434 + }, + { + "auxiliary_loss_clip": 0.01086165, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.03335738, + "balance_loss_mlp": 1.02097511, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 2.038590239843987, + "language_loss": 0.77958262, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80076581, + "num_input_tokens_seen": 232600495, + "step": 10779, + "time_per_iteration": 2.609407901763916 + }, + { + "auxiliary_loss_clip": 0.01015589, + "auxiliary_loss_mlp": 0.01002081, + "balance_loss_clip": 1.00745976, + "balance_loss_mlp": 1.00115681, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7226364415277786, + "language_loss": 0.59392715, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61410379, + "num_input_tokens_seen": 232663165, + "step": 10780, + "time_per_iteration": 3.0726332664489746 + }, + { + "auxiliary_loss_clip": 0.00999054, + "auxiliary_loss_mlp": 0.0102951, + "balance_loss_clip": 1.03123689, + "balance_loss_mlp": 1.01809549, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 2.063774092228666, + "language_loss": 0.79313862, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81342429, + "num_input_tokens_seen": 232683385, + "step": 10781, + "time_per_iteration": 2.892869472503662 + }, + { + "auxiliary_loss_clip": 0.01104662, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.03600085, + "balance_loss_mlp": 1.02319336, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 2.7292829476385854, + "language_loss": 0.78657341, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.80798537, + "num_input_tokens_seen": 232699095, + "step": 10782, + "time_per_iteration": 4.543785095214844 + }, + { + "auxiliary_loss_clip": 0.01093765, + "auxiliary_loss_mlp": 0.00749607, + "balance_loss_clip": 1.03654361, + "balance_loss_mlp": 1.00036263, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.1518288385321327, + "language_loss": 0.64188629, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66032004, + "num_input_tokens_seen": 232717920, + "step": 10783, + "time_per_iteration": 2.6819047927856445 + }, + { + "auxiliary_loss_clip": 0.01105211, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.03670871, + "balance_loss_mlp": 1.01844072, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 1.9339206280355965, + "language_loss": 0.88707078, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90844017, + "num_input_tokens_seen": 232737605, + "step": 10784, + "time_per_iteration": 2.5802018642425537 + }, + { + "auxiliary_loss_clip": 0.01076653, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.03377569, + "balance_loss_mlp": 1.01899076, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 2.2021056719011813, + "language_loss": 0.72945994, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75054282, + "num_input_tokens_seen": 232755110, + "step": 10785, + "time_per_iteration": 2.581437587738037 + }, + { + "auxiliary_loss_clip": 0.01067472, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_clip": 1.03540158, + "balance_loss_mlp": 1.01758122, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.562380067235497, + "language_loss": 0.69272757, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71369052, + "num_input_tokens_seen": 232779040, + "step": 10786, + "time_per_iteration": 2.7575740814208984 + }, + { + "auxiliary_loss_clip": 0.01063594, + "auxiliary_loss_mlp": 0.01028698, + "balance_loss_clip": 1.03288698, + "balance_loss_mlp": 1.01694393, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 2.1309872693948972, + "language_loss": 0.71583366, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73675656, + "num_input_tokens_seen": 232800515, + "step": 10787, + "time_per_iteration": 2.7480483055114746 + }, + { + "auxiliary_loss_clip": 0.01105694, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.03632987, + "balance_loss_mlp": 1.01925576, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 2.1344696200355053, + "language_loss": 0.84582639, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86719668, + "num_input_tokens_seen": 232818450, + "step": 10788, + "time_per_iteration": 2.5049984455108643 + }, + { + "auxiliary_loss_clip": 0.01062229, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.03402126, + "balance_loss_mlp": 1.01813579, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 3.0241851426186943, + "language_loss": 0.77175039, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79267865, + "num_input_tokens_seen": 232834785, + "step": 10789, + "time_per_iteration": 2.6041266918182373 + }, + { + "auxiliary_loss_clip": 0.01082614, + "auxiliary_loss_mlp": 0.01027149, + "balance_loss_clip": 1.03134704, + "balance_loss_mlp": 1.015836, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.6738470700721508, + "language_loss": 0.75868261, + "learning_rate": 1.160483857897479e-06, + "loss": 0.77978021, + "num_input_tokens_seen": 232856050, + "step": 10790, + "time_per_iteration": 2.751661539077759 + }, + { + "auxiliary_loss_clip": 0.0110106, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.03620374, + "balance_loss_mlp": 1.01963568, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.3818418135208046, + "language_loss": 0.60063052, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62194753, + "num_input_tokens_seen": 232873945, + "step": 10791, + "time_per_iteration": 2.5517468452453613 + }, + { + "auxiliary_loss_clip": 0.01066635, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.03603244, + "balance_loss_mlp": 1.01806426, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.766092468293234, + "language_loss": 0.85700619, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.87796801, + "num_input_tokens_seen": 232892160, + "step": 10792, + "time_per_iteration": 2.8966445922851562 + }, + { + "auxiliary_loss_clip": 0.01077232, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.03366816, + "balance_loss_mlp": 1.02082872, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.100790050482768, + "language_loss": 0.78233826, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80344439, + "num_input_tokens_seen": 232911725, + "step": 10793, + "time_per_iteration": 4.408726453781128 + }, + { + "auxiliary_loss_clip": 0.01069767, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.03538179, + "balance_loss_mlp": 1.01654172, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 3.6520863811818356, + "language_loss": 0.74716187, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.76814282, + "num_input_tokens_seen": 232929085, + "step": 10794, + "time_per_iteration": 2.7508609294891357 + }, + { + "auxiliary_loss_clip": 0.01088665, + "auxiliary_loss_mlp": 0.00749484, + "balance_loss_clip": 1.03235769, + "balance_loss_mlp": 1.0003562, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 3.709799569605543, + "language_loss": 0.70144033, + "learning_rate": 1.158716808837621e-06, + "loss": 0.71982187, + "num_input_tokens_seen": 232949455, + "step": 10795, + "time_per_iteration": 2.7067923545837402 + }, + { + "auxiliary_loss_clip": 0.01077866, + "auxiliary_loss_mlp": 0.01036033, + "balance_loss_clip": 1.03317356, + "balance_loss_mlp": 1.02312243, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.767073085651142, + "language_loss": 0.53466988, + "learning_rate": 1.158363494676679e-06, + "loss": 0.55580878, + "num_input_tokens_seen": 232969445, + "step": 10796, + "time_per_iteration": 2.789579153060913 + }, + { + "auxiliary_loss_clip": 0.01089113, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.03388643, + "balance_loss_mlp": 1.01852632, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 1.5163323178736132, + "language_loss": 0.77410507, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79529285, + "num_input_tokens_seen": 232988900, + "step": 10797, + "time_per_iteration": 4.221461296081543 + }, + { + "auxiliary_loss_clip": 0.01050276, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.03263462, + "balance_loss_mlp": 1.01728308, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.122342602304589, + "language_loss": 0.70594138, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72672445, + "num_input_tokens_seen": 233005060, + "step": 10798, + "time_per_iteration": 2.730175495147705 + }, + { + "auxiliary_loss_clip": 0.01044989, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.02984536, + "balance_loss_mlp": 1.01731157, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.7185615713547062, + "language_loss": 0.76864004, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.78937042, + "num_input_tokens_seen": 233023375, + "step": 10799, + "time_per_iteration": 2.6124558448791504 + }, + { + "auxiliary_loss_clip": 0.010887, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.03311431, + "balance_loss_mlp": 1.01979399, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.9703587595405645, + "language_loss": 0.71504223, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.73625052, + "num_input_tokens_seen": 233043130, + "step": 10800, + "time_per_iteration": 2.5674562454223633 + }, + { + "auxiliary_loss_clip": 0.01014174, + "auxiliary_loss_mlp": 0.01018359, + "balance_loss_clip": 1.00364089, + "balance_loss_mlp": 1.01703537, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7757853002635032, + "language_loss": 0.60274303, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62306833, + "num_input_tokens_seen": 233110560, + "step": 10801, + "time_per_iteration": 3.2009830474853516 + }, + { + "auxiliary_loss_clip": 0.01089844, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.03585148, + "balance_loss_mlp": 1.02063727, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 2.671922140062246, + "language_loss": 0.78477919, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80601346, + "num_input_tokens_seen": 233130080, + "step": 10802, + "time_per_iteration": 2.577028751373291 + }, + { + "auxiliary_loss_clip": 0.01100607, + "auxiliary_loss_mlp": 0.01038109, + "balance_loss_clip": 1.03318894, + "balance_loss_mlp": 1.02557409, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.6372718527910346, + "language_loss": 0.74704093, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76842809, + "num_input_tokens_seen": 233150235, + "step": 10803, + "time_per_iteration": 2.5172884464263916 + }, + { + "auxiliary_loss_clip": 0.01036199, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.03082204, + "balance_loss_mlp": 1.0212847, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 3.4963716680050583, + "language_loss": 0.69923472, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.71992922, + "num_input_tokens_seen": 233166710, + "step": 10804, + "time_per_iteration": 2.6853597164154053 + }, + { + "auxiliary_loss_clip": 0.01086962, + "auxiliary_loss_mlp": 0.01026586, + "balance_loss_clip": 1.03407836, + "balance_loss_mlp": 1.01434898, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.7296422621429077, + "language_loss": 0.7265321, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74766755, + "num_input_tokens_seen": 233185445, + "step": 10805, + "time_per_iteration": 2.546069383621216 + }, + { + "auxiliary_loss_clip": 0.01072821, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.0326097, + "balance_loss_mlp": 1.01954687, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.6000346436420148, + "language_loss": 0.65598416, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.67701799, + "num_input_tokens_seen": 233205805, + "step": 10806, + "time_per_iteration": 2.6270365715026855 + }, + { + "auxiliary_loss_clip": 0.01078401, + "auxiliary_loss_mlp": 0.00749641, + "balance_loss_clip": 1.03196633, + "balance_loss_mlp": 1.00042093, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.2342995780848525, + "language_loss": 0.7847774, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.80305785, + "num_input_tokens_seen": 233224215, + "step": 10807, + "time_per_iteration": 4.061419486999512 + }, + { + "auxiliary_loss_clip": 0.01004209, + "auxiliary_loss_mlp": 0.01001954, + "balance_loss_clip": 1.00372875, + "balance_loss_mlp": 1.00069666, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7976376586641982, + "language_loss": 0.58908439, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.609146, + "num_input_tokens_seen": 233294440, + "step": 10808, + "time_per_iteration": 3.2930243015289307 + }, + { + "auxiliary_loss_clip": 0.01079576, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_clip": 1.0361191, + "balance_loss_mlp": 1.01415801, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.980892675612047, + "language_loss": 0.63554716, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65659833, + "num_input_tokens_seen": 233316125, + "step": 10809, + "time_per_iteration": 2.7032310962677 + }, + { + "auxiliary_loss_clip": 0.01088, + "auxiliary_loss_mlp": 0.00749357, + "balance_loss_clip": 1.03407896, + "balance_loss_mlp": 1.00033379, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.6114175147161192, + "language_loss": 0.81669623, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83506978, + "num_input_tokens_seen": 233336140, + "step": 10810, + "time_per_iteration": 2.6028151512145996 + }, + { + "auxiliary_loss_clip": 0.01059081, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.0336715, + "balance_loss_mlp": 1.0200671, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.5495177318118882, + "language_loss": 0.71447712, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.73537177, + "num_input_tokens_seen": 233356095, + "step": 10811, + "time_per_iteration": 2.6395633220672607 + }, + { + "auxiliary_loss_clip": 0.01041551, + "auxiliary_loss_mlp": 0.01028307, + "balance_loss_clip": 1.03637373, + "balance_loss_mlp": 1.01735139, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.726392421954938, + "language_loss": 0.77880675, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79950535, + "num_input_tokens_seen": 233376830, + "step": 10812, + "time_per_iteration": 2.699537992477417 + }, + { + "auxiliary_loss_clip": 0.01083768, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.03281784, + "balance_loss_mlp": 1.02050674, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.7218441153862523, + "language_loss": 0.8515057, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87266684, + "num_input_tokens_seen": 233395275, + "step": 10813, + "time_per_iteration": 2.58640193939209 + }, + { + "auxiliary_loss_clip": 0.01054205, + "auxiliary_loss_mlp": 0.01027079, + "balance_loss_clip": 1.03142703, + "balance_loss_mlp": 1.01474094, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.9120998017196789, + "language_loss": 0.79505992, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.81587279, + "num_input_tokens_seen": 233413345, + "step": 10814, + "time_per_iteration": 2.5944621562957764 + }, + { + "auxiliary_loss_clip": 0.01048936, + "auxiliary_loss_mlp": 0.00749895, + "balance_loss_clip": 1.03339982, + "balance_loss_mlp": 1.00042188, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.5113569653526588, + "language_loss": 0.65711439, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67510271, + "num_input_tokens_seen": 233436105, + "step": 10815, + "time_per_iteration": 2.8433375358581543 + }, + { + "auxiliary_loss_clip": 0.01104101, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.0350163, + "balance_loss_mlp": 1.01771712, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.081117329379102, + "language_loss": 0.7489413, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77029657, + "num_input_tokens_seen": 233452320, + "step": 10816, + "time_per_iteration": 2.5203731060028076 + }, + { + "auxiliary_loss_clip": 0.01069586, + "auxiliary_loss_mlp": 0.01025499, + "balance_loss_clip": 1.03258419, + "balance_loss_mlp": 1.01360834, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.715664556628616, + "language_loss": 0.73055053, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75150138, + "num_input_tokens_seen": 233469920, + "step": 10817, + "time_per_iteration": 2.6046321392059326 + }, + { + "auxiliary_loss_clip": 0.0105186, + "auxiliary_loss_mlp": 0.01036591, + "balance_loss_clip": 1.02863121, + "balance_loss_mlp": 1.022524, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.4446132930627658, + "language_loss": 0.72028154, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74116611, + "num_input_tokens_seen": 233499780, + "step": 10818, + "time_per_iteration": 3.016122579574585 + }, + { + "auxiliary_loss_clip": 0.01071305, + "auxiliary_loss_mlp": 0.01027977, + "balance_loss_clip": 1.03472328, + "balance_loss_mlp": 1.01634812, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 2.061177651336047, + "language_loss": 0.6523807, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67337346, + "num_input_tokens_seen": 233518235, + "step": 10819, + "time_per_iteration": 2.613438844680786 + }, + { + "auxiliary_loss_clip": 0.01063911, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.03182721, + "balance_loss_mlp": 1.01780069, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 2.3433608383718036, + "language_loss": 0.83772051, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85866255, + "num_input_tokens_seen": 233535215, + "step": 10820, + "time_per_iteration": 2.666090965270996 + }, + { + "auxiliary_loss_clip": 0.01095282, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.03560305, + "balance_loss_mlp": 1.01629305, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.2193183921959085, + "language_loss": 0.78024524, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80148584, + "num_input_tokens_seen": 233552775, + "step": 10821, + "time_per_iteration": 2.5660104751586914 + }, + { + "auxiliary_loss_clip": 0.01065813, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.03363872, + "balance_loss_mlp": 1.01845884, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.4149895449999248, + "language_loss": 0.80042344, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.8213734, + "num_input_tokens_seen": 233572080, + "step": 10822, + "time_per_iteration": 4.114445686340332 + }, + { + "auxiliary_loss_clip": 0.01072833, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.03623378, + "balance_loss_mlp": 1.01841378, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 2.3360948870565683, + "language_loss": 0.87340522, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89443946, + "num_input_tokens_seen": 233589155, + "step": 10823, + "time_per_iteration": 2.628422737121582 + }, + { + "auxiliary_loss_clip": 0.01101315, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.03424144, + "balance_loss_mlp": 1.02048361, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.8178131694190327, + "language_loss": 0.66260976, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68394458, + "num_input_tokens_seen": 233608180, + "step": 10824, + "time_per_iteration": 2.543562650680542 + }, + { + "auxiliary_loss_clip": 0.01082736, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.03388679, + "balance_loss_mlp": 1.01577592, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 2.6358760629657287, + "language_loss": 0.87373465, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89484072, + "num_input_tokens_seen": 233625750, + "step": 10825, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.0107909, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.03137922, + "balance_loss_mlp": 1.01664042, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 3.159904319393176, + "language_loss": 0.72777802, + "learning_rate": 1.147778970474885e-06, + "loss": 0.7488662, + "num_input_tokens_seen": 233644235, + "step": 10826, + "time_per_iteration": 2.543999433517456 + }, + { + "auxiliary_loss_clip": 0.01090582, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.03477073, + "balance_loss_mlp": 1.01697874, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 1.8347771294264494, + "language_loss": 0.69124186, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71242809, + "num_input_tokens_seen": 233662845, + "step": 10827, + "time_per_iteration": 2.5529212951660156 + }, + { + "auxiliary_loss_clip": 0.01076193, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.03167021, + "balance_loss_mlp": 1.02034855, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.755507643182253, + "language_loss": 0.76897049, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.79004627, + "num_input_tokens_seen": 233681990, + "step": 10828, + "time_per_iteration": 2.616424560546875 + }, + { + "auxiliary_loss_clip": 0.01088098, + "auxiliary_loss_mlp": 0.01026178, + "balance_loss_clip": 1.03442883, + "balance_loss_mlp": 1.01543117, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.8294268748025317, + "language_loss": 0.89638221, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91752493, + "num_input_tokens_seen": 233698930, + "step": 10829, + "time_per_iteration": 2.5952837467193604 + }, + { + "auxiliary_loss_clip": 0.01024793, + "auxiliary_loss_mlp": 0.00998028, + "balance_loss_clip": 1.00419855, + "balance_loss_mlp": 0.9970383, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6386433688992409, + "language_loss": 0.55446827, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57469642, + "num_input_tokens_seen": 233769825, + "step": 10830, + "time_per_iteration": 3.1883387565612793 + }, + { + "auxiliary_loss_clip": 0.01066944, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.03261232, + "balance_loss_mlp": 1.01951313, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 2.0001703625548317, + "language_loss": 0.74868983, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76967406, + "num_input_tokens_seen": 233787095, + "step": 10831, + "time_per_iteration": 2.63954758644104 + }, + { + "auxiliary_loss_clip": 0.01006436, + "auxiliary_loss_mlp": 0.00998946, + "balance_loss_clip": 1.005795, + "balance_loss_mlp": 0.99756354, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6426781989482068, + "language_loss": 0.51034331, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53039718, + "num_input_tokens_seen": 233853050, + "step": 10832, + "time_per_iteration": 3.2104625701904297 + }, + { + "auxiliary_loss_clip": 0.01082277, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03308916, + "balance_loss_mlp": 1.02025437, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 3.518093703314096, + "language_loss": 0.83214855, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85329783, + "num_input_tokens_seen": 233871385, + "step": 10833, + "time_per_iteration": 4.113900423049927 + }, + { + "auxiliary_loss_clip": 0.01081267, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.03592837, + "balance_loss_mlp": 1.02051795, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.016493145930655, + "language_loss": 0.83353841, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85467279, + "num_input_tokens_seen": 233888175, + "step": 10834, + "time_per_iteration": 2.5570642948150635 + }, + { + "auxiliary_loss_clip": 0.01085427, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.03310323, + "balance_loss_mlp": 1.02489674, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.6541440270034145, + "language_loss": 0.77334154, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79455948, + "num_input_tokens_seen": 233911470, + "step": 10835, + "time_per_iteration": 2.592968463897705 + }, + { + "auxiliary_loss_clip": 0.01073513, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.03388357, + "balance_loss_mlp": 1.02558112, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.3746975027782216, + "language_loss": 0.77240914, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79352021, + "num_input_tokens_seen": 233932135, + "step": 10836, + "time_per_iteration": 2.5994865894317627 + }, + { + "auxiliary_loss_clip": 0.01062498, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.03353834, + "balance_loss_mlp": 1.02298903, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 2.2127848556969547, + "language_loss": 0.82283705, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84381425, + "num_input_tokens_seen": 233947880, + "step": 10837, + "time_per_iteration": 4.073650360107422 + }, + { + "auxiliary_loss_clip": 0.0106288, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.03465962, + "balance_loss_mlp": 1.02160406, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.8075872219254379, + "language_loss": 0.58777773, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.60874951, + "num_input_tokens_seen": 233971475, + "step": 10838, + "time_per_iteration": 2.8298611640930176 + }, + { + "auxiliary_loss_clip": 0.01025475, + "auxiliary_loss_mlp": 0.01008028, + "balance_loss_clip": 1.00501132, + "balance_loss_mlp": 1.00702035, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7312189952588494, + "language_loss": 0.6085428, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62887788, + "num_input_tokens_seen": 234030690, + "step": 10839, + "time_per_iteration": 3.130664587020874 + }, + { + "auxiliary_loss_clip": 0.01078626, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.03450298, + "balance_loss_mlp": 1.01951933, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.591114293019107, + "language_loss": 0.67789239, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.69897902, + "num_input_tokens_seen": 234052470, + "step": 10840, + "time_per_iteration": 2.722080945968628 + }, + { + "auxiliary_loss_clip": 0.01057563, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.03411424, + "balance_loss_mlp": 1.01873565, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.295028523269703, + "language_loss": 0.74103892, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.76190746, + "num_input_tokens_seen": 234071495, + "step": 10841, + "time_per_iteration": 2.746020555496216 + }, + { + "auxiliary_loss_clip": 0.01100969, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.03410125, + "balance_loss_mlp": 1.02113378, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.4560071110330077, + "language_loss": 0.6246286, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64596301, + "num_input_tokens_seen": 234092325, + "step": 10842, + "time_per_iteration": 2.605727195739746 + }, + { + "auxiliary_loss_clip": 0.01017482, + "auxiliary_loss_mlp": 0.01002572, + "balance_loss_clip": 1.00638103, + "balance_loss_mlp": 1.0015887, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8130349468769614, + "language_loss": 0.56143486, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58163536, + "num_input_tokens_seen": 234148005, + "step": 10843, + "time_per_iteration": 2.9709842205047607 + }, + { + "auxiliary_loss_clip": 0.01093818, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.03437686, + "balance_loss_mlp": 1.02370679, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 3.543526732811809, + "language_loss": 0.83043849, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.85174268, + "num_input_tokens_seen": 234164280, + "step": 10844, + "time_per_iteration": 2.5931556224823 + }, + { + "auxiliary_loss_clip": 0.01091478, + "auxiliary_loss_mlp": 0.01030024, + "balance_loss_clip": 1.03493094, + "balance_loss_mlp": 1.01771569, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 1.7846769768328783, + "language_loss": 0.60279012, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62400514, + "num_input_tokens_seen": 234185090, + "step": 10845, + "time_per_iteration": 2.6228017807006836 + }, + { + "auxiliary_loss_clip": 0.01091441, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 1.01846695, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 2.0699922251179093, + "language_loss": 0.79640543, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81762177, + "num_input_tokens_seen": 234204050, + "step": 10846, + "time_per_iteration": 2.619631290435791 + }, + { + "auxiliary_loss_clip": 0.01015845, + "auxiliary_loss_mlp": 0.01014128, + "balance_loss_clip": 1.00448465, + "balance_loss_mlp": 1.01305521, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.709621159401723, + "language_loss": 0.60286105, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62316084, + "num_input_tokens_seen": 234269790, + "step": 10847, + "time_per_iteration": 4.737070322036743 + }, + { + "auxiliary_loss_clip": 0.01104387, + "auxiliary_loss_mlp": 0.01042193, + "balance_loss_clip": 1.03641653, + "balance_loss_mlp": 1.03015912, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.6523570521141053, + "language_loss": 0.80936587, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83083177, + "num_input_tokens_seen": 234290135, + "step": 10848, + "time_per_iteration": 2.58074688911438 + }, + { + "auxiliary_loss_clip": 0.01070211, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.03238487, + "balance_loss_mlp": 1.02213264, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.0783683255780274, + "language_loss": 0.74472904, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.76576555, + "num_input_tokens_seen": 234309535, + "step": 10849, + "time_per_iteration": 2.6221513748168945 + }, + { + "auxiliary_loss_clip": 0.01050107, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.03320205, + "balance_loss_mlp": 1.01830375, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 3.40801933651124, + "language_loss": 0.68007064, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70087045, + "num_input_tokens_seen": 234328755, + "step": 10850, + "time_per_iteration": 2.6931002140045166 + }, + { + "auxiliary_loss_clip": 0.01077298, + "auxiliary_loss_mlp": 0.00749488, + "balance_loss_clip": 1.03458452, + "balance_loss_mlp": 1.0003581, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 1.8378106825712912, + "language_loss": 0.66548592, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68375379, + "num_input_tokens_seen": 234348655, + "step": 10851, + "time_per_iteration": 2.6622986793518066 + }, + { + "auxiliary_loss_clip": 0.01078235, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.03320825, + "balance_loss_mlp": 1.01755047, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.9784776622219273, + "language_loss": 0.73544109, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.75651348, + "num_input_tokens_seen": 234367445, + "step": 10852, + "time_per_iteration": 2.5918262004852295 + }, + { + "auxiliary_loss_clip": 0.01080048, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03706145, + "balance_loss_mlp": 1.01906204, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 9.088233264079307, + "language_loss": 0.66522801, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68634105, + "num_input_tokens_seen": 234384825, + "step": 10853, + "time_per_iteration": 2.5688211917877197 + }, + { + "auxiliary_loss_clip": 0.00997048, + "auxiliary_loss_mlp": 0.00999152, + "balance_loss_clip": 1.00971174, + "balance_loss_mlp": 0.99811506, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.740738945390354, + "language_loss": 0.62994802, + "learning_rate": 1.137926314758634e-06, + "loss": 0.64991009, + "num_input_tokens_seen": 234450630, + "step": 10854, + "time_per_iteration": 3.287945032119751 + }, + { + "auxiliary_loss_clip": 0.01083293, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.0324347, + "balance_loss_mlp": 1.02274394, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.5842216489960106, + "language_loss": 0.77804816, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.7992506, + "num_input_tokens_seen": 234473505, + "step": 10855, + "time_per_iteration": 2.6509876251220703 + }, + { + "auxiliary_loss_clip": 0.0105778, + "auxiliary_loss_mlp": 0.01023104, + "balance_loss_clip": 1.02915287, + "balance_loss_mlp": 1.01178551, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.752773315936282, + "language_loss": 0.7883606, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.80916941, + "num_input_tokens_seen": 234492485, + "step": 10856, + "time_per_iteration": 2.6152496337890625 + }, + { + "auxiliary_loss_clip": 0.01099721, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.03368545, + "balance_loss_mlp": 1.01667178, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.8881621674585702, + "language_loss": 0.73703039, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75832009, + "num_input_tokens_seen": 234512645, + "step": 10857, + "time_per_iteration": 2.5811548233032227 + }, + { + "auxiliary_loss_clip": 0.01075622, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.03160405, + "balance_loss_mlp": 1.01899385, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.063967328999469, + "language_loss": 0.6305455, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.6515981, + "num_input_tokens_seen": 234529310, + "step": 10858, + "time_per_iteration": 2.6135308742523193 + }, + { + "auxiliary_loss_clip": 0.01097511, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.03310156, + "balance_loss_mlp": 1.02031171, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.6857969069693703, + "language_loss": 0.78441, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80570066, + "num_input_tokens_seen": 234546685, + "step": 10859, + "time_per_iteration": 2.54608416557312 + }, + { + "auxiliary_loss_clip": 0.01091676, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.03354859, + "balance_loss_mlp": 1.01627755, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.5701005666457524, + "language_loss": 0.67647815, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.69767654, + "num_input_tokens_seen": 234566255, + "step": 10860, + "time_per_iteration": 2.5359625816345215 + }, + { + "auxiliary_loss_clip": 0.01092293, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.03601193, + "balance_loss_mlp": 1.01595807, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 1.827613571353332, + "language_loss": 0.66236109, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68355966, + "num_input_tokens_seen": 234585405, + "step": 10861, + "time_per_iteration": 2.5431339740753174 + }, + { + "auxiliary_loss_clip": 0.01079771, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.03379035, + "balance_loss_mlp": 1.01746809, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.680313689252, + "language_loss": 0.65049446, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67159837, + "num_input_tokens_seen": 234608095, + "step": 10862, + "time_per_iteration": 2.688624620437622 + }, + { + "auxiliary_loss_clip": 0.01075719, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.03149199, + "balance_loss_mlp": 1.01676011, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.5293637598223913, + "language_loss": 0.77028716, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79132533, + "num_input_tokens_seen": 234627335, + "step": 10863, + "time_per_iteration": 4.021455764770508 + }, + { + "auxiliary_loss_clip": 0.01071455, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.0330019, + "balance_loss_mlp": 1.01966143, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 2.464447858748427, + "language_loss": 0.74985111, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.77087355, + "num_input_tokens_seen": 234646540, + "step": 10864, + "time_per_iteration": 2.585043430328369 + }, + { + "auxiliary_loss_clip": 0.01087159, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.03327703, + "balance_loss_mlp": 1.02284527, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.8667763670801023, + "language_loss": 0.86115867, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88236779, + "num_input_tokens_seen": 234665470, + "step": 10865, + "time_per_iteration": 2.552764415740967 + }, + { + "auxiliary_loss_clip": 0.01070042, + "auxiliary_loss_mlp": 0.00749662, + "balance_loss_clip": 1.03232431, + "balance_loss_mlp": 1.00044894, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 1.5357897837342516, + "language_loss": 0.81789482, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83609188, + "num_input_tokens_seen": 234683955, + "step": 10866, + "time_per_iteration": 2.712831497192383 + }, + { + "auxiliary_loss_clip": 0.01079849, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.03582144, + "balance_loss_mlp": 1.01449084, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.4796646462519512, + "language_loss": 0.82202578, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84308243, + "num_input_tokens_seen": 234704595, + "step": 10867, + "time_per_iteration": 2.599824905395508 + }, + { + "auxiliary_loss_clip": 0.01072737, + "auxiliary_loss_mlp": 0.01025989, + "balance_loss_clip": 1.03386545, + "balance_loss_mlp": 1.0148077, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 2.4161680605559637, + "language_loss": 0.81102335, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.83201063, + "num_input_tokens_seen": 234724090, + "step": 10868, + "time_per_iteration": 2.5826094150543213 + }, + { + "auxiliary_loss_clip": 0.01071273, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.03512025, + "balance_loss_mlp": 1.01715887, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 2.0391332760716017, + "language_loss": 0.79688346, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81789172, + "num_input_tokens_seen": 234742560, + "step": 10869, + "time_per_iteration": 2.6481003761291504 + }, + { + "auxiliary_loss_clip": 0.01092471, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.036448, + "balance_loss_mlp": 1.02235413, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 2.2181251089846707, + "language_loss": 0.72359848, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.74486327, + "num_input_tokens_seen": 234762315, + "step": 10870, + "time_per_iteration": 2.632819890975952 + }, + { + "auxiliary_loss_clip": 0.01060126, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.03348565, + "balance_loss_mlp": 1.02377224, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.263613832113177, + "language_loss": 0.74435771, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.76531547, + "num_input_tokens_seen": 234781300, + "step": 10871, + "time_per_iteration": 2.6651687622070312 + }, + { + "auxiliary_loss_clip": 0.01079161, + "auxiliary_loss_mlp": 0.00749344, + "balance_loss_clip": 1.03376985, + "balance_loss_mlp": 1.00039196, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.8483838367756982, + "language_loss": 0.55656326, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57484829, + "num_input_tokens_seen": 234801040, + "step": 10872, + "time_per_iteration": 2.6184728145599365 + }, + { + "auxiliary_loss_clip": 0.01074941, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.03267276, + "balance_loss_mlp": 1.01943099, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.6449228212097373, + "language_loss": 0.74931705, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77036965, + "num_input_tokens_seen": 234821415, + "step": 10873, + "time_per_iteration": 4.052700996398926 + }, + { + "auxiliary_loss_clip": 0.01090576, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.03526306, + "balance_loss_mlp": 1.01858032, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.7634622309178027, + "language_loss": 0.75200498, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77321798, + "num_input_tokens_seen": 234843795, + "step": 10874, + "time_per_iteration": 2.574176549911499 + }, + { + "auxiliary_loss_clip": 0.01055911, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.03361201, + "balance_loss_mlp": 1.02082419, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.8358205025470102, + "language_loss": 0.81713492, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83802295, + "num_input_tokens_seen": 234862350, + "step": 10875, + "time_per_iteration": 2.681356906890869 + }, + { + "auxiliary_loss_clip": 0.01100097, + "auxiliary_loss_mlp": 0.01037523, + "balance_loss_clip": 1.03394508, + "balance_loss_mlp": 1.02651358, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.665705579761582, + "language_loss": 0.70068133, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72205746, + "num_input_tokens_seen": 234881790, + "step": 10876, + "time_per_iteration": 2.5122032165527344 + }, + { + "auxiliary_loss_clip": 0.01003774, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.03032935, + "balance_loss_mlp": 1.02397227, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 1.9453088623223265, + "language_loss": 0.79792255, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81831914, + "num_input_tokens_seen": 234897775, + "step": 10877, + "time_per_iteration": 2.820390224456787 + }, + { + "auxiliary_loss_clip": 0.01081333, + "auxiliary_loss_mlp": 0.00749545, + "balance_loss_clip": 1.03433371, + "balance_loss_mlp": 1.00052524, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 2.768428578719125, + "language_loss": 0.79234236, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.81065112, + "num_input_tokens_seen": 234918395, + "step": 10878, + "time_per_iteration": 4.159595966339111 + }, + { + "auxiliary_loss_clip": 0.01075841, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.03254867, + "balance_loss_mlp": 1.01624882, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 2.3613753916627687, + "language_loss": 0.84135687, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86240268, + "num_input_tokens_seen": 234936260, + "step": 10879, + "time_per_iteration": 2.5701684951782227 + }, + { + "auxiliary_loss_clip": 0.01080368, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.03330612, + "balance_loss_mlp": 1.01803303, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.1471863219399636, + "language_loss": 0.71184075, + "learning_rate": 1.128800362199601e-06, + "loss": 0.73294479, + "num_input_tokens_seen": 234952110, + "step": 10880, + "time_per_iteration": 2.605651378631592 + }, + { + "auxiliary_loss_clip": 0.01057803, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.03144193, + "balance_loss_mlp": 1.02018058, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 2.24659938307387, + "language_loss": 0.84443331, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86533034, + "num_input_tokens_seen": 234970810, + "step": 10881, + "time_per_iteration": 2.60295033454895 + }, + { + "auxiliary_loss_clip": 0.01065886, + "auxiliary_loss_mlp": 0.01033154, + "balance_loss_clip": 1.03217673, + "balance_loss_mlp": 1.02035689, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 2.0018035585396152, + "language_loss": 0.77753234, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.79852271, + "num_input_tokens_seen": 234989565, + "step": 10882, + "time_per_iteration": 2.6258461475372314 + }, + { + "auxiliary_loss_clip": 0.01103662, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.03597021, + "balance_loss_mlp": 1.02031887, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 1.8819731915764297, + "language_loss": 0.82379067, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84515512, + "num_input_tokens_seen": 235007955, + "step": 10883, + "time_per_iteration": 2.5334243774414062 + }, + { + "auxiliary_loss_clip": 0.01048342, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.03376913, + "balance_loss_mlp": 1.02145529, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.3338115317082804, + "language_loss": 0.85039318, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87121391, + "num_input_tokens_seen": 235024860, + "step": 10884, + "time_per_iteration": 2.7011537551879883 + }, + { + "auxiliary_loss_clip": 0.01076658, + "auxiliary_loss_mlp": 0.0103574, + "balance_loss_clip": 1.03234696, + "balance_loss_mlp": 1.02394986, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 2.2479524390657497, + "language_loss": 0.79937881, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82050276, + "num_input_tokens_seen": 235043815, + "step": 10885, + "time_per_iteration": 2.633305549621582 + }, + { + "auxiliary_loss_clip": 0.01060227, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.03170967, + "balance_loss_mlp": 1.01398849, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.8516193106702767, + "language_loss": 0.71757627, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.73843813, + "num_input_tokens_seen": 235062985, + "step": 10886, + "time_per_iteration": 2.731149673461914 + }, + { + "auxiliary_loss_clip": 0.01082749, + "auxiliary_loss_mlp": 0.010301, + "balance_loss_clip": 1.03363621, + "balance_loss_mlp": 1.0189836, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 2.0905413319106576, + "language_loss": 0.78315759, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80428612, + "num_input_tokens_seen": 235081670, + "step": 10887, + "time_per_iteration": 4.171336650848389 + }, + { + "auxiliary_loss_clip": 0.01072176, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.0341301, + "balance_loss_mlp": 1.02088046, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 2.8040272659685628, + "language_loss": 0.7945503, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81559902, + "num_input_tokens_seen": 235098510, + "step": 10888, + "time_per_iteration": 2.5956406593322754 + }, + { + "auxiliary_loss_clip": 0.01087128, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.03328967, + "balance_loss_mlp": 1.01910996, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.5172648690859831, + "language_loss": 0.6665771, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.6877467, + "num_input_tokens_seen": 235119990, + "step": 10889, + "time_per_iteration": 2.8050413131713867 + }, + { + "auxiliary_loss_clip": 0.01071803, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.03176725, + "balance_loss_mlp": 1.02338207, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.4519849923344628, + "language_loss": 0.79854894, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.81964564, + "num_input_tokens_seen": 235139255, + "step": 10890, + "time_per_iteration": 2.631528854370117 + }, + { + "auxiliary_loss_clip": 0.01089475, + "auxiliary_loss_mlp": 0.00749635, + "balance_loss_clip": 1.03270173, + "balance_loss_mlp": 1.0005002, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 1.9992340865175504, + "language_loss": 0.65360487, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.671996, + "num_input_tokens_seen": 235158455, + "step": 10891, + "time_per_iteration": 2.637799024581909 + }, + { + "auxiliary_loss_clip": 0.01083604, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.03191853, + "balance_loss_mlp": 1.02347612, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.908000888565182, + "language_loss": 0.79734045, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81852305, + "num_input_tokens_seen": 235177350, + "step": 10892, + "time_per_iteration": 2.5221168994903564 + }, + { + "auxiliary_loss_clip": 0.0109499, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.03653097, + "balance_loss_mlp": 1.02243114, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 3.300877712133125, + "language_loss": 0.77881569, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80010307, + "num_input_tokens_seen": 235196435, + "step": 10893, + "time_per_iteration": 2.5439579486846924 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.03580451, + "balance_loss_mlp": 1.02062178, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.5334986293012953, + "language_loss": 0.70247781, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72385931, + "num_input_tokens_seen": 235215430, + "step": 10894, + "time_per_iteration": 2.499314069747925 + }, + { + "auxiliary_loss_clip": 0.01093748, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.03474188, + "balance_loss_mlp": 1.01950634, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 2.7606182242921764, + "language_loss": 0.63013732, + "learning_rate": 1.123545533127549e-06, + "loss": 0.65139174, + "num_input_tokens_seen": 235232015, + "step": 10895, + "time_per_iteration": 2.5059597492218018 + }, + { + "auxiliary_loss_clip": 0.010863, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.03255272, + "balance_loss_mlp": 1.02774286, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 2.0860879478152783, + "language_loss": 0.78760624, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.80886215, + "num_input_tokens_seen": 235248115, + "step": 10896, + "time_per_iteration": 2.482374906539917 + }, + { + "auxiliary_loss_clip": 0.01078374, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.0358932, + "balance_loss_mlp": 1.01938868, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.3372280912635421, + "language_loss": 0.70766342, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.7287516, + "num_input_tokens_seen": 235270785, + "step": 10897, + "time_per_iteration": 2.649737596511841 + }, + { + "auxiliary_loss_clip": 0.01101971, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.03425252, + "balance_loss_mlp": 1.01870918, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.8247956323496899, + "language_loss": 0.75452614, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77584839, + "num_input_tokens_seen": 235287905, + "step": 10898, + "time_per_iteration": 2.4701895713806152 + }, + { + "auxiliary_loss_clip": 0.01080795, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.03529572, + "balance_loss_mlp": 1.02169251, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.7200964684232156, + "language_loss": 0.73540425, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75654513, + "num_input_tokens_seen": 235305525, + "step": 10899, + "time_per_iteration": 2.5686686038970947 + }, + { + "auxiliary_loss_clip": 0.01079229, + "auxiliary_loss_mlp": 0.01025914, + "balance_loss_clip": 1.03469622, + "balance_loss_mlp": 1.01479805, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.1367760184098783, + "language_loss": 0.55627936, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.57733083, + "num_input_tokens_seen": 235324415, + "step": 10900, + "time_per_iteration": 2.6464173793792725 + }, + { + "auxiliary_loss_clip": 0.01092937, + "auxiliary_loss_mlp": 0.01034577, + "balance_loss_clip": 1.03854752, + "balance_loss_mlp": 1.02198851, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.7684587617730534, + "language_loss": 0.7671895, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.78846467, + "num_input_tokens_seen": 235341595, + "step": 10901, + "time_per_iteration": 2.5626220703125 + }, + { + "auxiliary_loss_clip": 0.01100427, + "auxiliary_loss_mlp": 0.01027553, + "balance_loss_clip": 1.03509188, + "balance_loss_mlp": 1.01532233, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 2.6579403009419225, + "language_loss": 0.73350954, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75478929, + "num_input_tokens_seen": 235361700, + "step": 10902, + "time_per_iteration": 2.4981961250305176 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.010335, + "balance_loss_clip": 1.03695989, + "balance_loss_mlp": 1.0217104, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.6506793584869963, + "language_loss": 0.67826843, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.69962001, + "num_input_tokens_seen": 235382065, + "step": 10903, + "time_per_iteration": 4.018261432647705 + }, + { + "auxiliary_loss_clip": 0.01079673, + "auxiliary_loss_mlp": 0.00749827, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.00043142, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 7.166687045531843, + "language_loss": 0.66714722, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.68544221, + "num_input_tokens_seen": 235402130, + "step": 10904, + "time_per_iteration": 2.664257764816284 + }, + { + "auxiliary_loss_clip": 0.01091738, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.03341794, + "balance_loss_mlp": 1.02589834, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.859959736666034, + "language_loss": 0.9105953, + "learning_rate": 1.120046465383464e-06, + "loss": 0.93190104, + "num_input_tokens_seen": 235420435, + "step": 10905, + "time_per_iteration": 2.5609302520751953 + }, + { + "auxiliary_loss_clip": 0.01086353, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.03239822, + "balance_loss_mlp": 1.02089179, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 2.0638743219842004, + "language_loss": 0.75302708, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77421546, + "num_input_tokens_seen": 235439960, + "step": 10906, + "time_per_iteration": 2.5577285289764404 + }, + { + "auxiliary_loss_clip": 0.01105969, + "auxiliary_loss_mlp": 0.01037292, + "balance_loss_clip": 1.03736424, + "balance_loss_mlp": 1.02465558, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 3.5546016453995533, + "language_loss": 0.74224317, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76367581, + "num_input_tokens_seen": 235457495, + "step": 10907, + "time_per_iteration": 2.449544906616211 + }, + { + "auxiliary_loss_clip": 0.01058976, + "auxiliary_loss_mlp": 0.01029498, + "balance_loss_clip": 1.03106618, + "balance_loss_mlp": 1.01628947, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.4272393936059946, + "language_loss": 0.72391623, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74480093, + "num_input_tokens_seen": 235479525, + "step": 10908, + "time_per_iteration": 2.66083025932312 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.03649366, + "balance_loss_mlp": 1.02316904, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.0558562782194025, + "language_loss": 0.80915564, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83055127, + "num_input_tokens_seen": 235496305, + "step": 10909, + "time_per_iteration": 2.4502458572387695 + }, + { + "auxiliary_loss_clip": 0.01103212, + "auxiliary_loss_mlp": 0.01037001, + "balance_loss_clip": 1.03553712, + "balance_loss_mlp": 1.02403164, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.5576730697326373, + "language_loss": 0.63651407, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.65791619, + "num_input_tokens_seen": 235512545, + "step": 10910, + "time_per_iteration": 2.4794976711273193 + }, + { + "auxiliary_loss_clip": 0.01076953, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.03351474, + "balance_loss_mlp": 1.02209759, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 6.022341114212718, + "language_loss": 0.75849485, + "learning_rate": 1.117948625548313e-06, + "loss": 0.7796303, + "num_input_tokens_seen": 235526045, + "step": 10911, + "time_per_iteration": 2.56296968460083 + }, + { + "auxiliary_loss_clip": 0.01096318, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.03271317, + "balance_loss_mlp": 1.01681209, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 1.5996358205362544, + "language_loss": 0.75603294, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77727377, + "num_input_tokens_seen": 235545285, + "step": 10912, + "time_per_iteration": 2.5863113403320312 + }, + { + "auxiliary_loss_clip": 0.01064593, + "auxiliary_loss_mlp": 0.00749793, + "balance_loss_clip": 1.03488839, + "balance_loss_mlp": 1.00042868, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.606398398681087, + "language_loss": 0.77601087, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79415464, + "num_input_tokens_seen": 235563150, + "step": 10913, + "time_per_iteration": 4.02839207649231 + }, + { + "auxiliary_loss_clip": 0.01066753, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.03058374, + "balance_loss_mlp": 1.01775277, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.750545048433745, + "language_loss": 0.70807767, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.72903615, + "num_input_tokens_seen": 235582535, + "step": 10914, + "time_per_iteration": 2.573399782180786 + }, + { + "auxiliary_loss_clip": 0.01070119, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.03455174, + "balance_loss_mlp": 1.01814818, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.9755982275459856, + "language_loss": 0.74495196, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76595485, + "num_input_tokens_seen": 235601490, + "step": 10915, + "time_per_iteration": 2.684281587600708 + }, + { + "auxiliary_loss_clip": 0.01056488, + "auxiliary_loss_mlp": 0.01028333, + "balance_loss_clip": 1.03049684, + "balance_loss_mlp": 1.01587009, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 2.032302728260625, + "language_loss": 0.79646015, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81730843, + "num_input_tokens_seen": 235619165, + "step": 10916, + "time_per_iteration": 2.685729742050171 + }, + { + "auxiliary_loss_clip": 0.01068724, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.0308845, + "balance_loss_mlp": 1.01963544, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 2.2914171635584726, + "language_loss": 0.76441473, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78541112, + "num_input_tokens_seen": 235637115, + "step": 10917, + "time_per_iteration": 4.10782527923584 + }, + { + "auxiliary_loss_clip": 0.01100616, + "auxiliary_loss_mlp": 0.00749596, + "balance_loss_clip": 1.03492868, + "balance_loss_mlp": 1.00039387, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 1.8651525650503358, + "language_loss": 0.69674331, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.71524543, + "num_input_tokens_seen": 235656330, + "step": 10918, + "time_per_iteration": 2.5565643310546875 + }, + { + "auxiliary_loss_clip": 0.0106819, + "auxiliary_loss_mlp": 0.01039274, + "balance_loss_clip": 1.03487194, + "balance_loss_mlp": 1.0279969, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.8838231703189614, + "language_loss": 0.76497436, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78604901, + "num_input_tokens_seen": 235674510, + "step": 10919, + "time_per_iteration": 2.6164047718048096 + }, + { + "auxiliary_loss_clip": 0.01015653, + "auxiliary_loss_mlp": 0.00746959, + "balance_loss_clip": 1.00513482, + "balance_loss_mlp": 1.00011146, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7159026308421131, + "language_loss": 0.53028059, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.54790664, + "num_input_tokens_seen": 235735050, + "step": 10920, + "time_per_iteration": 3.1478874683380127 + }, + { + "auxiliary_loss_clip": 0.010895, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.03463149, + "balance_loss_mlp": 1.01821113, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 2.0025747221245345, + "language_loss": 0.65068549, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67188609, + "num_input_tokens_seen": 235757545, + "step": 10921, + "time_per_iteration": 2.5884273052215576 + }, + { + "auxiliary_loss_clip": 0.01071828, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.03125608, + "balance_loss_mlp": 1.02278519, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.737026241046225, + "language_loss": 0.81353015, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83462638, + "num_input_tokens_seen": 235777265, + "step": 10922, + "time_per_iteration": 2.711329936981201 + }, + { + "auxiliary_loss_clip": 0.01040238, + "auxiliary_loss_mlp": 0.00750052, + "balance_loss_clip": 1.03091741, + "balance_loss_mlp": 1.00039625, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 1.9942163979237375, + "language_loss": 0.71128607, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.72918892, + "num_input_tokens_seen": 235796565, + "step": 10923, + "time_per_iteration": 2.694244861602783 + }, + { + "auxiliary_loss_clip": 0.010669, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.03562903, + "balance_loss_mlp": 1.02200294, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 1.9017684813588513, + "language_loss": 0.80562758, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82663405, + "num_input_tokens_seen": 235814805, + "step": 10924, + "time_per_iteration": 2.5739901065826416 + }, + { + "auxiliary_loss_clip": 0.01084933, + "auxiliary_loss_mlp": 0.0103041, + "balance_loss_clip": 1.03410339, + "balance_loss_mlp": 1.01865554, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.4759275332206923, + "language_loss": 0.72334194, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74449539, + "num_input_tokens_seen": 235833405, + "step": 10925, + "time_per_iteration": 2.5528366565704346 + }, + { + "auxiliary_loss_clip": 0.0108715, + "auxiliary_loss_mlp": 0.01028935, + "balance_loss_clip": 1.03334236, + "balance_loss_mlp": 1.01707351, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 1.9145659372576416, + "language_loss": 0.72570944, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74687028, + "num_input_tokens_seen": 235848530, + "step": 10926, + "time_per_iteration": 2.5505871772766113 + }, + { + "auxiliary_loss_clip": 0.01056872, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.03372741, + "balance_loss_mlp": 1.0190345, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.6996045925185614, + "language_loss": 0.72816509, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74905193, + "num_input_tokens_seen": 235867225, + "step": 10927, + "time_per_iteration": 4.1982951164245605 + }, + { + "auxiliary_loss_clip": 0.00995817, + "auxiliary_loss_mlp": 0.00999997, + "balance_loss_clip": 1.00544453, + "balance_loss_mlp": 0.99885887, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7399670525504434, + "language_loss": 0.64459896, + "learning_rate": 1.112011294493775e-06, + "loss": 0.6645571, + "num_input_tokens_seen": 235932925, + "step": 10928, + "time_per_iteration": 3.1586952209472656 + }, + { + "auxiliary_loss_clip": 0.01088398, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.03274345, + "balance_loss_mlp": 1.02065921, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.6922424021906408, + "language_loss": 0.77588594, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.79710495, + "num_input_tokens_seen": 235952680, + "step": 10929, + "time_per_iteration": 2.6207919120788574 + }, + { + "auxiliary_loss_clip": 0.01065396, + "auxiliary_loss_mlp": 0.01030797, + "balance_loss_clip": 1.03239679, + "balance_loss_mlp": 1.01909637, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.9240919149593156, + "language_loss": 0.65586531, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67682725, + "num_input_tokens_seen": 235972075, + "step": 10930, + "time_per_iteration": 2.6350040435791016 + }, + { + "auxiliary_loss_clip": 0.01049574, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.03011513, + "balance_loss_mlp": 1.01833034, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.531742405612682, + "language_loss": 0.70551777, + "learning_rate": 1.110964538515258e-06, + "loss": 0.72632551, + "num_input_tokens_seen": 235990340, + "step": 10931, + "time_per_iteration": 2.6791439056396484 + }, + { + "auxiliary_loss_clip": 0.01056465, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.03241229, + "balance_loss_mlp": 1.02193356, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.63648282946849, + "language_loss": 0.6823982, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70329952, + "num_input_tokens_seen": 236007470, + "step": 10932, + "time_per_iteration": 2.6536171436309814 + }, + { + "auxiliary_loss_clip": 0.01076087, + "auxiliary_loss_mlp": 0.00749616, + "balance_loss_clip": 1.03166795, + "balance_loss_mlp": 1.00042379, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 2.19293611879136, + "language_loss": 0.8028335, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82109058, + "num_input_tokens_seen": 236029030, + "step": 10933, + "time_per_iteration": 2.9066998958587646 + }, + { + "auxiliary_loss_clip": 0.01052549, + "auxiliary_loss_mlp": 0.01038879, + "balance_loss_clip": 1.03388309, + "balance_loss_mlp": 1.02580726, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.7339370677599582, + "language_loss": 0.73719668, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75811094, + "num_input_tokens_seen": 236047160, + "step": 10934, + "time_per_iteration": 2.7561023235321045 + }, + { + "auxiliary_loss_clip": 0.01081168, + "auxiliary_loss_mlp": 0.0104005, + "balance_loss_clip": 1.03232431, + "balance_loss_mlp": 1.02668071, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.5697866617099836, + "language_loss": 0.76425439, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78546655, + "num_input_tokens_seen": 236069215, + "step": 10935, + "time_per_iteration": 2.7193541526794434 + }, + { + "auxiliary_loss_clip": 0.01060078, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.03231978, + "balance_loss_mlp": 1.02297974, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.6899033499682041, + "language_loss": 0.7810204, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.80198711, + "num_input_tokens_seen": 236088335, + "step": 10936, + "time_per_iteration": 2.6320691108703613 + }, + { + "auxiliary_loss_clip": 0.01050725, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.03203177, + "balance_loss_mlp": 1.02069533, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 2.1233128738437226, + "language_loss": 0.69068146, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71151054, + "num_input_tokens_seen": 236108540, + "step": 10937, + "time_per_iteration": 2.695108652114868 + }, + { + "auxiliary_loss_clip": 0.01075478, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.03396249, + "balance_loss_mlp": 1.01749837, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.4924265676102, + "language_loss": 0.68813503, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70918775, + "num_input_tokens_seen": 236124495, + "step": 10938, + "time_per_iteration": 2.568021059036255 + }, + { + "auxiliary_loss_clip": 0.01071438, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03334606, + "balance_loss_mlp": 1.01993823, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 2.1629363225448577, + "language_loss": 0.71621037, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73724812, + "num_input_tokens_seen": 236142550, + "step": 10939, + "time_per_iteration": 2.554252862930298 + }, + { + "auxiliary_loss_clip": 0.01079566, + "auxiliary_loss_mlp": 0.00749579, + "balance_loss_clip": 1.03402221, + "balance_loss_mlp": 1.00044394, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 3.075806303416346, + "language_loss": 0.77452701, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79281849, + "num_input_tokens_seen": 236156620, + "step": 10940, + "time_per_iteration": 2.562871217727661 + }, + { + "auxiliary_loss_clip": 0.01055524, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.0314436, + "balance_loss_mlp": 1.02165842, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 2.744661680242127, + "language_loss": 0.68415582, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70505488, + "num_input_tokens_seen": 236177095, + "step": 10941, + "time_per_iteration": 2.7893428802490234 + }, + { + "auxiliary_loss_clip": 0.01082458, + "auxiliary_loss_mlp": 0.00749551, + "balance_loss_clip": 1.03019691, + "balance_loss_mlp": 1.0003767, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 2.0181564983791227, + "language_loss": 0.68376637, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70208645, + "num_input_tokens_seen": 236194695, + "step": 10942, + "time_per_iteration": 2.5654075145721436 + }, + { + "auxiliary_loss_clip": 0.01066574, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.03332889, + "balance_loss_mlp": 1.01875997, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 2.1590236954662765, + "language_loss": 0.71382844, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73481691, + "num_input_tokens_seen": 236213885, + "step": 10943, + "time_per_iteration": 4.1466004848480225 + }, + { + "auxiliary_loss_clip": 0.010541, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.03145814, + "balance_loss_mlp": 1.02160335, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.6887972686250488, + "language_loss": 0.5946666, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61554825, + "num_input_tokens_seen": 236237315, + "step": 10944, + "time_per_iteration": 2.8584234714508057 + }, + { + "auxiliary_loss_clip": 0.0109359, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.03484154, + "balance_loss_mlp": 1.01885033, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.8085326338610683, + "language_loss": 0.72430491, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74555486, + "num_input_tokens_seen": 236256345, + "step": 10945, + "time_per_iteration": 2.808659315109253 + }, + { + "auxiliary_loss_clip": 0.01079053, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.03445256, + "balance_loss_mlp": 1.01634717, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.5824128023515516, + "language_loss": 0.70517838, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72624409, + "num_input_tokens_seen": 236281890, + "step": 10946, + "time_per_iteration": 2.9645893573760986 + }, + { + "auxiliary_loss_clip": 0.01091479, + "auxiliary_loss_mlp": 0.01031307, + "balance_loss_clip": 1.03595448, + "balance_loss_mlp": 1.01949334, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.9533581912829696, + "language_loss": 0.82245672, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84368455, + "num_input_tokens_seen": 236298370, + "step": 10947, + "time_per_iteration": 2.8053293228149414 + }, + { + "auxiliary_loss_clip": 0.01041796, + "auxiliary_loss_mlp": 0.00749465, + "balance_loss_clip": 1.030689, + "balance_loss_mlp": 1.00040686, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.7013793259111818, + "language_loss": 0.77238727, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79029989, + "num_input_tokens_seen": 236317380, + "step": 10948, + "time_per_iteration": 2.9629032611846924 + }, + { + "auxiliary_loss_clip": 0.01090928, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.03578973, + "balance_loss_mlp": 1.01726937, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.6526253956098922, + "language_loss": 0.79408824, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81528294, + "num_input_tokens_seen": 236336210, + "step": 10949, + "time_per_iteration": 2.7306230068206787 + }, + { + "auxiliary_loss_clip": 0.01015357, + "auxiliary_loss_mlp": 0.0100007, + "balance_loss_clip": 1.00482988, + "balance_loss_mlp": 0.99906904, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.743943784883188, + "language_loss": 0.61827922, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63843352, + "num_input_tokens_seen": 236403090, + "step": 10950, + "time_per_iteration": 3.367069959640503 + }, + { + "auxiliary_loss_clip": 0.01088036, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.03359461, + "balance_loss_mlp": 1.02352297, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 2.2272543205738446, + "language_loss": 0.66629225, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.68751776, + "num_input_tokens_seen": 236420475, + "step": 10951, + "time_per_iteration": 2.8893685340881348 + }, + { + "auxiliary_loss_clip": 0.01084623, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.03303194, + "balance_loss_mlp": 1.02103257, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.5373035667604262, + "language_loss": 0.76626116, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.78743011, + "num_input_tokens_seen": 236441915, + "step": 10952, + "time_per_iteration": 2.9034903049468994 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.03620577, + "balance_loss_mlp": 1.01937389, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.844155573361939, + "language_loss": 0.73394537, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75525564, + "num_input_tokens_seen": 236460340, + "step": 10953, + "time_per_iteration": 4.359909296035767 + }, + { + "auxiliary_loss_clip": 0.01067532, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.03362906, + "balance_loss_mlp": 1.02488852, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 1.956896131386507, + "language_loss": 0.7846489, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80569082, + "num_input_tokens_seen": 236478280, + "step": 10954, + "time_per_iteration": 2.8720405101776123 + }, + { + "auxiliary_loss_clip": 0.01073615, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.0310837, + "balance_loss_mlp": 1.02423692, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 2.177420438404367, + "language_loss": 0.69690895, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71800721, + "num_input_tokens_seen": 236493225, + "step": 10955, + "time_per_iteration": 2.8868489265441895 + }, + { + "auxiliary_loss_clip": 0.01067433, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.03164268, + "balance_loss_mlp": 1.02132416, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 2.320906272241015, + "language_loss": 0.81020933, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.8312043, + "num_input_tokens_seen": 236514420, + "step": 10956, + "time_per_iteration": 2.9351298809051514 + }, + { + "auxiliary_loss_clip": 0.0109057, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.03726721, + "balance_loss_mlp": 1.0225184, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.177718286706582, + "language_loss": 0.81111634, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83236247, + "num_input_tokens_seen": 236532785, + "step": 10957, + "time_per_iteration": 4.351170539855957 + }, + { + "auxiliary_loss_clip": 0.01077344, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.03453588, + "balance_loss_mlp": 1.02024055, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.576187733038611, + "language_loss": 0.76110673, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78218281, + "num_input_tokens_seen": 236553330, + "step": 10958, + "time_per_iteration": 2.9717137813568115 + }, + { + "auxiliary_loss_clip": 0.01053492, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.03143454, + "balance_loss_mlp": 1.02847719, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.651334742893951, + "language_loss": 0.74912363, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77007437, + "num_input_tokens_seen": 236572960, + "step": 10959, + "time_per_iteration": 2.8610634803771973 + }, + { + "auxiliary_loss_clip": 0.01089325, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.03444719, + "balance_loss_mlp": 1.01806819, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 2.599647422058468, + "language_loss": 0.6477319, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.66892076, + "num_input_tokens_seen": 236594090, + "step": 10960, + "time_per_iteration": 2.793278217315674 + }, + { + "auxiliary_loss_clip": 0.01103778, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03512907, + "balance_loss_mlp": 1.01993847, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.2636843422685184, + "language_loss": 0.82570088, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.84705961, + "num_input_tokens_seen": 236610190, + "step": 10961, + "time_per_iteration": 2.929723024368286 + }, + { + "auxiliary_loss_clip": 0.01060133, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.03281236, + "balance_loss_mlp": 1.01855874, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 2.318128103260024, + "language_loss": 0.73384666, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75474679, + "num_input_tokens_seen": 236631575, + "step": 10962, + "time_per_iteration": 2.922531843185425 + }, + { + "auxiliary_loss_clip": 0.01081634, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.03232872, + "balance_loss_mlp": 1.01955497, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 1.9603988778630332, + "language_loss": 0.80030996, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.82143724, + "num_input_tokens_seen": 236649815, + "step": 10963, + "time_per_iteration": 2.875108242034912 + }, + { + "auxiliary_loss_clip": 0.01048684, + "auxiliary_loss_mlp": 0.00749199, + "balance_loss_clip": 1.03363144, + "balance_loss_mlp": 1.00035787, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.8342549243966362, + "language_loss": 0.7822026, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80018139, + "num_input_tokens_seen": 236668335, + "step": 10964, + "time_per_iteration": 3.118072748184204 + }, + { + "auxiliary_loss_clip": 0.01057438, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.02873755, + "balance_loss_mlp": 1.02358317, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 3.657898300153739, + "language_loss": 0.73870552, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.75963253, + "num_input_tokens_seen": 236688945, + "step": 10965, + "time_per_iteration": 2.8968966007232666 + }, + { + "auxiliary_loss_clip": 0.01064877, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.03245807, + "balance_loss_mlp": 1.02545869, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 1.9213947755724816, + "language_loss": 0.73323274, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75426662, + "num_input_tokens_seen": 236707055, + "step": 10966, + "time_per_iteration": 2.874098539352417 + }, + { + "auxiliary_loss_clip": 0.01088331, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.03200626, + "balance_loss_mlp": 1.02002907, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.4863135940065302, + "language_loss": 0.77051449, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79171777, + "num_input_tokens_seen": 236725900, + "step": 10967, + "time_per_iteration": 4.424032688140869 + }, + { + "auxiliary_loss_clip": 0.01015995, + "auxiliary_loss_mlp": 0.01007689, + "balance_loss_clip": 1.00490761, + "balance_loss_mlp": 1.0066576, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 1.8486919066736403, + "language_loss": 0.4847641, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50500095, + "num_input_tokens_seen": 236788415, + "step": 10968, + "time_per_iteration": 3.314338207244873 + }, + { + "auxiliary_loss_clip": 0.01047895, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.02952957, + "balance_loss_mlp": 1.02698088, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.7402842053338237, + "language_loss": 0.79420865, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.8150931, + "num_input_tokens_seen": 236805155, + "step": 10969, + "time_per_iteration": 2.944518804550171 + }, + { + "auxiliary_loss_clip": 0.01090464, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.03428912, + "balance_loss_mlp": 1.01945162, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 2.1421111269316024, + "language_loss": 0.65933609, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.68054545, + "num_input_tokens_seen": 236824360, + "step": 10970, + "time_per_iteration": 2.823240280151367 + }, + { + "auxiliary_loss_clip": 0.01083136, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.03107905, + "balance_loss_mlp": 1.01697516, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.8922226130555027, + "language_loss": 0.7644099, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78552657, + "num_input_tokens_seen": 236844640, + "step": 10971, + "time_per_iteration": 2.759937286376953 + }, + { + "auxiliary_loss_clip": 0.01022065, + "auxiliary_loss_mlp": 0.01052515, + "balance_loss_clip": 1.0256443, + "balance_loss_mlp": 1.03819835, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.2814821079436713, + "language_loss": 0.70328486, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72403073, + "num_input_tokens_seen": 236861160, + "step": 10972, + "time_per_iteration": 2.880892038345337 + }, + { + "auxiliary_loss_clip": 0.01088146, + "auxiliary_loss_mlp": 0.01024945, + "balance_loss_clip": 1.03428757, + "balance_loss_mlp": 1.01326883, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 1.7477003510636113, + "language_loss": 0.55742764, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.57855856, + "num_input_tokens_seen": 236880465, + "step": 10973, + "time_per_iteration": 2.799471378326416 + }, + { + "auxiliary_loss_clip": 0.01085934, + "auxiliary_loss_mlp": 0.01036369, + "balance_loss_clip": 1.03735375, + "balance_loss_mlp": 1.02463281, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 2.052157991635549, + "language_loss": 0.78441411, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.80563712, + "num_input_tokens_seen": 236897730, + "step": 10974, + "time_per_iteration": 2.7703139781951904 + }, + { + "auxiliary_loss_clip": 0.01087951, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.03556514, + "balance_loss_mlp": 1.02185917, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.5738431780488877, + "language_loss": 0.68945342, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.7106657, + "num_input_tokens_seen": 236917300, + "step": 10975, + "time_per_iteration": 2.752258539199829 + }, + { + "auxiliary_loss_clip": 0.01082665, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.03360283, + "balance_loss_mlp": 1.01771331, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.6188604963821116, + "language_loss": 0.70521975, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.72633559, + "num_input_tokens_seen": 236935590, + "step": 10976, + "time_per_iteration": 2.7695462703704834 + }, + { + "auxiliary_loss_clip": 0.01071368, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.03223336, + "balance_loss_mlp": 1.0200305, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.5688585473633756, + "language_loss": 0.676404, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69743669, + "num_input_tokens_seen": 236952830, + "step": 10977, + "time_per_iteration": 2.7861645221710205 + }, + { + "auxiliary_loss_clip": 0.01063542, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03293121, + "balance_loss_mlp": 1.01895595, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 1.8455789303595234, + "language_loss": 0.81203061, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83298182, + "num_input_tokens_seen": 236971930, + "step": 10978, + "time_per_iteration": 2.8120603561401367 + }, + { + "auxiliary_loss_clip": 0.01070614, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.03393567, + "balance_loss_mlp": 1.02529538, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 2.230903464215075, + "language_loss": 0.67447138, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69554842, + "num_input_tokens_seen": 236989920, + "step": 10979, + "time_per_iteration": 2.971393585205078 + }, + { + "auxiliary_loss_clip": 0.01065645, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.03210187, + "balance_loss_mlp": 1.01619947, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 3.3463717373009945, + "language_loss": 0.73097849, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75192165, + "num_input_tokens_seen": 237006570, + "step": 10980, + "time_per_iteration": 2.7743730545043945 + }, + { + "auxiliary_loss_clip": 0.01059804, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.03229642, + "balance_loss_mlp": 1.01838005, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.6544562391296938, + "language_loss": 0.72825509, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.74914134, + "num_input_tokens_seen": 237028415, + "step": 10981, + "time_per_iteration": 2.9660391807556152 + }, + { + "auxiliary_loss_clip": 0.01042885, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.03117549, + "balance_loss_mlp": 1.02252078, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.156098979821819, + "language_loss": 0.68998921, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.71075928, + "num_input_tokens_seen": 237046595, + "step": 10982, + "time_per_iteration": 2.928725242614746 + }, + { + "auxiliary_loss_clip": 0.01088803, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.03497243, + "balance_loss_mlp": 1.02061224, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.5142872139041132, + "language_loss": 0.70008671, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.7212925, + "num_input_tokens_seen": 237066150, + "step": 10983, + "time_per_iteration": 4.276463031768799 + }, + { + "auxiliary_loss_clip": 0.01088078, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.03220081, + "balance_loss_mlp": 1.01749182, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.7168616230596534, + "language_loss": 0.70523053, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72640145, + "num_input_tokens_seen": 237087060, + "step": 10984, + "time_per_iteration": 2.846172332763672 + }, + { + "auxiliary_loss_clip": 0.01063608, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.03108048, + "balance_loss_mlp": 1.01897025, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.5043992311880006, + "language_loss": 0.83895147, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85988796, + "num_input_tokens_seen": 237103825, + "step": 10985, + "time_per_iteration": 2.817690134048462 + }, + { + "auxiliary_loss_clip": 0.01089913, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.03429568, + "balance_loss_mlp": 1.01849854, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.3747410476295974, + "language_loss": 0.73984355, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76104891, + "num_input_tokens_seen": 237121740, + "step": 10986, + "time_per_iteration": 2.7705953121185303 + }, + { + "auxiliary_loss_clip": 0.01086371, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.03287697, + "balance_loss_mlp": 1.01807392, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 2.398980131943702, + "language_loss": 0.79568005, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.81683874, + "num_input_tokens_seen": 237139565, + "step": 10987, + "time_per_iteration": 2.780688524246216 + }, + { + "auxiliary_loss_clip": 0.01006474, + "auxiliary_loss_mlp": 0.01001349, + "balance_loss_clip": 1.00822353, + "balance_loss_mlp": 1.00026417, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8252314063367221, + "language_loss": 0.54144192, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.5615201, + "num_input_tokens_seen": 237201055, + "step": 10988, + "time_per_iteration": 3.3419744968414307 + }, + { + "auxiliary_loss_clip": 0.01038839, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.03269339, + "balance_loss_mlp": 1.02327061, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 2.146634737577804, + "language_loss": 0.77779102, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79851985, + "num_input_tokens_seen": 237221805, + "step": 10989, + "time_per_iteration": 2.9082164764404297 + }, + { + "auxiliary_loss_clip": 0.01078786, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.03626561, + "balance_loss_mlp": 1.02094507, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 2.0089086883959775, + "language_loss": 0.77138436, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7924931, + "num_input_tokens_seen": 237238270, + "step": 10990, + "time_per_iteration": 2.7736923694610596 + }, + { + "auxiliary_loss_clip": 0.01101908, + "auxiliary_loss_mlp": 0.01030549, + "balance_loss_clip": 1.0349884, + "balance_loss_mlp": 1.01892614, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 1.9851037473016777, + "language_loss": 0.60536695, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62669158, + "num_input_tokens_seen": 237255400, + "step": 10991, + "time_per_iteration": 2.8851146697998047 + }, + { + "auxiliary_loss_clip": 0.01072791, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.03294182, + "balance_loss_mlp": 1.02127957, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.5479744161229463, + "language_loss": 0.6887176, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70978105, + "num_input_tokens_seen": 237273105, + "step": 10992, + "time_per_iteration": 2.806891918182373 + }, + { + "auxiliary_loss_clip": 0.01088555, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.03366327, + "balance_loss_mlp": 1.01740217, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 1.714994353263547, + "language_loss": 0.8749361, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.89611208, + "num_input_tokens_seen": 237292650, + "step": 10993, + "time_per_iteration": 4.292592287063599 + }, + { + "auxiliary_loss_clip": 0.0109747, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.03623688, + "balance_loss_mlp": 1.02014995, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.6521088544592728, + "language_loss": 0.67062157, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69192696, + "num_input_tokens_seen": 237312865, + "step": 10994, + "time_per_iteration": 2.763646364212036 + }, + { + "auxiliary_loss_clip": 0.01063029, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.033867, + "balance_loss_mlp": 1.02130532, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.84257837374288, + "language_loss": 0.76929998, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.79026806, + "num_input_tokens_seen": 237331210, + "step": 10995, + "time_per_iteration": 2.7275350093841553 + }, + { + "auxiliary_loss_clip": 0.01080229, + "auxiliary_loss_mlp": 0.01027816, + "balance_loss_clip": 1.03439784, + "balance_loss_mlp": 1.01743865, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.8324139590548354, + "language_loss": 0.74499017, + "learning_rate": 1.088359933123053e-06, + "loss": 0.7660706, + "num_input_tokens_seen": 237349455, + "step": 10996, + "time_per_iteration": 2.8112869262695312 + }, + { + "auxiliary_loss_clip": 0.01100618, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.03595304, + "balance_loss_mlp": 1.02167845, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.9156946846867342, + "language_loss": 0.68759418, + "learning_rate": 1.088013301487126e-06, + "loss": 0.70893061, + "num_input_tokens_seen": 237367100, + "step": 10997, + "time_per_iteration": 4.183530330657959 + }, + { + "auxiliary_loss_clip": 0.01081405, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.03373861, + "balance_loss_mlp": 1.01856816, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 2.3019892019691706, + "language_loss": 0.6880917, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70920414, + "num_input_tokens_seen": 237384840, + "step": 10998, + "time_per_iteration": 2.8147380352020264 + }, + { + "auxiliary_loss_clip": 0.01015437, + "auxiliary_loss_mlp": 0.01006891, + "balance_loss_clip": 1.00436616, + "balance_loss_mlp": 1.00593746, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.9525831570638023, + "language_loss": 0.5113889, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53161216, + "num_input_tokens_seen": 237443355, + "step": 10999, + "time_per_iteration": 3.187746047973633 + }, + { + "auxiliary_loss_clip": 0.01103945, + "auxiliary_loss_mlp": 0.00749455, + "balance_loss_clip": 1.03614688, + "balance_loss_mlp": 1.00028515, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.370962001521193, + "language_loss": 0.7025044, + "learning_rate": 1.086973614127679e-06, + "loss": 0.72103834, + "num_input_tokens_seen": 237459205, + "step": 11000, + "time_per_iteration": 2.681373357772827 + }, + { + "auxiliary_loss_clip": 0.01069783, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.03397369, + "balance_loss_mlp": 1.02203918, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.8216195081655593, + "language_loss": 0.65189171, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67291605, + "num_input_tokens_seen": 237483580, + "step": 11001, + "time_per_iteration": 2.8966076374053955 + }, + { + "auxiliary_loss_clip": 0.01097894, + "auxiliary_loss_mlp": 0.01026425, + "balance_loss_clip": 1.03305483, + "balance_loss_mlp": 1.01546359, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.6918626069433031, + "language_loss": 0.7287873, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75003046, + "num_input_tokens_seen": 237502860, + "step": 11002, + "time_per_iteration": 2.711400032043457 + }, + { + "auxiliary_loss_clip": 0.01081587, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.03200316, + "balance_loss_mlp": 1.0221386, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 1.9342656688554822, + "language_loss": 0.78863358, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.8097856, + "num_input_tokens_seen": 237521030, + "step": 11003, + "time_per_iteration": 2.746427297592163 + }, + { + "auxiliary_loss_clip": 0.01089766, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.03428757, + "balance_loss_mlp": 1.02149165, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.9092961776354644, + "language_loss": 0.6874969, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.70873547, + "num_input_tokens_seen": 237539585, + "step": 11004, + "time_per_iteration": 2.780343770980835 + }, + { + "auxiliary_loss_clip": 0.01090464, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.03346348, + "balance_loss_mlp": 1.02065063, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.0818897854190963, + "language_loss": 0.69348907, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71472341, + "num_input_tokens_seen": 237557655, + "step": 11005, + "time_per_iteration": 2.7301621437072754 + }, + { + "auxiliary_loss_clip": 0.01078059, + "auxiliary_loss_mlp": 0.01029468, + "balance_loss_clip": 1.03344822, + "balance_loss_mlp": 1.01857233, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.6146652542600322, + "language_loss": 0.78245699, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80353224, + "num_input_tokens_seen": 237577000, + "step": 11006, + "time_per_iteration": 2.797396183013916 + }, + { + "auxiliary_loss_clip": 0.01087968, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.03415179, + "balance_loss_mlp": 1.02007174, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.5033724141355624, + "language_loss": 0.76390165, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78509951, + "num_input_tokens_seen": 237597960, + "step": 11007, + "time_per_iteration": 2.859299421310425 + }, + { + "auxiliary_loss_clip": 0.01088551, + "auxiliary_loss_mlp": 0.01028287, + "balance_loss_clip": 1.03676796, + "balance_loss_mlp": 1.01712275, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.5671351055068958, + "language_loss": 0.78643811, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80760646, + "num_input_tokens_seen": 237616385, + "step": 11008, + "time_per_iteration": 4.284212827682495 + }, + { + "auxiliary_loss_clip": 0.01102911, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.03439689, + "balance_loss_mlp": 1.01707625, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.7295523520169045, + "language_loss": 0.81963485, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.84095502, + "num_input_tokens_seen": 237634930, + "step": 11009, + "time_per_iteration": 2.720134973526001 + }, + { + "auxiliary_loss_clip": 0.00996209, + "auxiliary_loss_mlp": 0.01008065, + "balance_loss_clip": 1.01235247, + "balance_loss_mlp": 1.0070641, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9799132548218368, + "language_loss": 0.67438984, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.6944325, + "num_input_tokens_seen": 237693175, + "step": 11010, + "time_per_iteration": 3.2187867164611816 + }, + { + "auxiliary_loss_clip": 0.01089025, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.03287077, + "balance_loss_mlp": 1.01765001, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.8245475565279792, + "language_loss": 0.71124876, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73243141, + "num_input_tokens_seen": 237713160, + "step": 11011, + "time_per_iteration": 2.741597890853882 + }, + { + "auxiliary_loss_clip": 0.01091444, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.0371232, + "balance_loss_mlp": 1.01827192, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.4555652252738103, + "language_loss": 0.72265804, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74386466, + "num_input_tokens_seen": 237733600, + "step": 11012, + "time_per_iteration": 2.8044912815093994 + }, + { + "auxiliary_loss_clip": 0.01082197, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.03294826, + "balance_loss_mlp": 1.01939082, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.7030173950919947, + "language_loss": 0.79116368, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81228089, + "num_input_tokens_seen": 237752135, + "step": 11013, + "time_per_iteration": 2.7231101989746094 + }, + { + "auxiliary_loss_clip": 0.01076794, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.0339061, + "balance_loss_mlp": 1.01600528, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 2.4928678864241287, + "language_loss": 0.70112467, + "learning_rate": 1.082125865538971e-06, + "loss": 0.7221694, + "num_input_tokens_seen": 237770735, + "step": 11014, + "time_per_iteration": 2.6816914081573486 + }, + { + "auxiliary_loss_clip": 0.01069986, + "auxiliary_loss_mlp": 0.00749088, + "balance_loss_clip": 1.03374827, + "balance_loss_mlp": 1.00023806, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 1.8869272766832514, + "language_loss": 0.77228224, + "learning_rate": 1.081779858400137e-06, + "loss": 0.79047298, + "num_input_tokens_seen": 237789005, + "step": 11015, + "time_per_iteration": 2.7718899250030518 + }, + { + "auxiliary_loss_clip": 0.01089897, + "auxiliary_loss_mlp": 0.00749185, + "balance_loss_clip": 1.03513169, + "balance_loss_mlp": 1.00026405, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.6643362864980789, + "language_loss": 0.82392174, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.84231257, + "num_input_tokens_seen": 237807740, + "step": 11016, + "time_per_iteration": 2.6610517501831055 + }, + { + "auxiliary_loss_clip": 0.01083202, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.03231847, + "balance_loss_mlp": 1.01946294, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 1.9514919199851664, + "language_loss": 0.69603431, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71717238, + "num_input_tokens_seen": 237826340, + "step": 11017, + "time_per_iteration": 2.620737075805664 + }, + { + "auxiliary_loss_clip": 0.01067575, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.03184009, + "balance_loss_mlp": 1.02266347, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.768251201648026, + "language_loss": 0.77232099, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.7933414, + "num_input_tokens_seen": 237848305, + "step": 11018, + "time_per_iteration": 2.9434449672698975 + }, + { + "auxiliary_loss_clip": 0.01076439, + "auxiliary_loss_mlp": 0.01037013, + "balance_loss_clip": 1.0324285, + "balance_loss_mlp": 1.02552724, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 2.057741487675976, + "language_loss": 0.82989722, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85103172, + "num_input_tokens_seen": 237867020, + "step": 11019, + "time_per_iteration": 2.6054861545562744 + }, + { + "auxiliary_loss_clip": 0.01079054, + "auxiliary_loss_mlp": 0.00749514, + "balance_loss_clip": 1.03139663, + "balance_loss_mlp": 1.00027478, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.6138931572033886, + "language_loss": 0.71776038, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73604608, + "num_input_tokens_seen": 237886710, + "step": 11020, + "time_per_iteration": 2.644069194793701 + }, + { + "auxiliary_loss_clip": 0.01075967, + "auxiliary_loss_mlp": 0.01028297, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.01523185, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.755331209751754, + "language_loss": 0.72536218, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74640483, + "num_input_tokens_seen": 237904795, + "step": 11021, + "time_per_iteration": 2.666571617126465 + }, + { + "auxiliary_loss_clip": 0.01070712, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.03193378, + "balance_loss_mlp": 1.02059567, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.2890153937060345, + "language_loss": 0.83331954, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85434723, + "num_input_tokens_seen": 237921320, + "step": 11022, + "time_per_iteration": 2.6548397541046143 + }, + { + "auxiliary_loss_clip": 0.01085044, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.03514373, + "balance_loss_mlp": 1.0184164, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 5.198787426410582, + "language_loss": 0.728387, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74954927, + "num_input_tokens_seen": 237933525, + "step": 11023, + "time_per_iteration": 4.115620374679565 + }, + { + "auxiliary_loss_clip": 0.01060188, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.02984691, + "balance_loss_mlp": 1.02004218, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 1.8484698982913774, + "language_loss": 0.74782777, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.76874483, + "num_input_tokens_seen": 237953395, + "step": 11024, + "time_per_iteration": 2.7358224391937256 + }, + { + "auxiliary_loss_clip": 0.01067737, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.0327518, + "balance_loss_mlp": 1.01829433, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.6075101759471915, + "language_loss": 0.69710457, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71808147, + "num_input_tokens_seen": 237971445, + "step": 11025, + "time_per_iteration": 2.782149314880371 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.03663838, + "balance_loss_mlp": 1.02228892, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.5522389004304789, + "language_loss": 0.78826976, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.80961955, + "num_input_tokens_seen": 237989965, + "step": 11026, + "time_per_iteration": 2.756774663925171 + }, + { + "auxiliary_loss_clip": 0.01088233, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.03504288, + "balance_loss_mlp": 1.02242112, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.6087663108424632, + "language_loss": 0.75787425, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.77908289, + "num_input_tokens_seen": 238006820, + "step": 11027, + "time_per_iteration": 2.6741273403167725 + }, + { + "auxiliary_loss_clip": 0.01070272, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.03226089, + "balance_loss_mlp": 1.02099204, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.3598818711003533, + "language_loss": 0.7038765, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72491193, + "num_input_tokens_seen": 238022560, + "step": 11028, + "time_per_iteration": 2.7198431491851807 + }, + { + "auxiliary_loss_clip": 0.01088466, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.03379726, + "balance_loss_mlp": 1.02258658, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 2.348498946330985, + "language_loss": 0.79457629, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.81578922, + "num_input_tokens_seen": 238041895, + "step": 11029, + "time_per_iteration": 2.6993210315704346 + }, + { + "auxiliary_loss_clip": 0.01100621, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.03374219, + "balance_loss_mlp": 1.01742435, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.166981350994085, + "language_loss": 0.76636195, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.7876637, + "num_input_tokens_seen": 238060445, + "step": 11030, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.01097139, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.0378511, + "balance_loss_mlp": 1.01730204, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 3.1494506582912773, + "language_loss": 0.74986678, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.77112901, + "num_input_tokens_seen": 238077080, + "step": 11031, + "time_per_iteration": 2.679783344268799 + }, + { + "auxiliary_loss_clip": 0.01089666, + "auxiliary_loss_mlp": 0.01030675, + "balance_loss_clip": 1.03492105, + "balance_loss_mlp": 1.0189631, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 5.29509015309874, + "language_loss": 0.74737477, + "learning_rate": 1.075903075048228e-06, + "loss": 0.76857823, + "num_input_tokens_seen": 238091045, + "step": 11032, + "time_per_iteration": 2.719728469848633 + }, + { + "auxiliary_loss_clip": 0.01056186, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.03148556, + "balance_loss_mlp": 1.02155924, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.7681020212924987, + "language_loss": 0.8028667, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82375228, + "num_input_tokens_seen": 238110220, + "step": 11033, + "time_per_iteration": 4.374698162078857 + }, + { + "auxiliary_loss_clip": 0.01081015, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.03456378, + "balance_loss_mlp": 1.01758957, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.7313484151815015, + "language_loss": 0.80098611, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82208878, + "num_input_tokens_seen": 238130400, + "step": 11034, + "time_per_iteration": 2.7929835319519043 + }, + { + "auxiliary_loss_clip": 0.01085184, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.03495383, + "balance_loss_mlp": 1.01678467, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.8417269002036338, + "language_loss": 0.75800246, + "learning_rate": 1.074867045054166e-06, + "loss": 0.77912933, + "num_input_tokens_seen": 238148165, + "step": 11035, + "time_per_iteration": 2.695683479309082 + }, + { + "auxiliary_loss_clip": 0.01067933, + "auxiliary_loss_mlp": 0.01021301, + "balance_loss_clip": 1.03382611, + "balance_loss_mlp": 1.01003528, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 2.0598230562386255, + "language_loss": 0.82893896, + "learning_rate": 1.074521771867622e-06, + "loss": 0.84983134, + "num_input_tokens_seen": 238166360, + "step": 11036, + "time_per_iteration": 2.829566717147827 + }, + { + "auxiliary_loss_clip": 0.01025101, + "auxiliary_loss_mlp": 0.01002704, + "balance_loss_clip": 1.00462675, + "balance_loss_mlp": 1.00179791, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.775459771258913, + "language_loss": 0.52344161, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54371965, + "num_input_tokens_seen": 238227630, + "step": 11037, + "time_per_iteration": 4.606448411941528 + }, + { + "auxiliary_loss_clip": 0.0104464, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.03379297, + "balance_loss_mlp": 1.02281964, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.8126488486128773, + "language_loss": 0.78767431, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.80846304, + "num_input_tokens_seen": 238248435, + "step": 11038, + "time_per_iteration": 2.876702070236206 + }, + { + "auxiliary_loss_clip": 0.0106821, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.03340042, + "balance_loss_mlp": 1.02777767, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 1.8950666168120607, + "language_loss": 0.63756335, + "learning_rate": 1.073486162925716e-06, + "loss": 0.65864623, + "num_input_tokens_seen": 238268755, + "step": 11039, + "time_per_iteration": 2.9593636989593506 + }, + { + "auxiliary_loss_clip": 0.01059919, + "auxiliary_loss_mlp": 0.0102702, + "balance_loss_clip": 1.03264391, + "balance_loss_mlp": 1.01574278, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 2.0581131868568945, + "language_loss": 0.63475615, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.65562552, + "num_input_tokens_seen": 238290120, + "step": 11040, + "time_per_iteration": 2.930204153060913 + }, + { + "auxiliary_loss_clip": 0.01054865, + "auxiliary_loss_mlp": 0.01035495, + "balance_loss_clip": 1.02777982, + "balance_loss_mlp": 1.02337718, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 2.0692661004319484, + "language_loss": 0.71556234, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73646593, + "num_input_tokens_seen": 238309290, + "step": 11041, + "time_per_iteration": 2.7987401485443115 + }, + { + "auxiliary_loss_clip": 0.01080035, + "auxiliary_loss_mlp": 0.01041245, + "balance_loss_clip": 1.03097975, + "balance_loss_mlp": 1.028144, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.66043920399909, + "language_loss": 0.62052143, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.64173418, + "num_input_tokens_seen": 238327280, + "step": 11042, + "time_per_iteration": 2.8364195823669434 + }, + { + "auxiliary_loss_clip": 0.01092024, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.03317523, + "balance_loss_mlp": 1.015733, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.4489969833717287, + "language_loss": 0.68101716, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.7022199, + "num_input_tokens_seen": 238346330, + "step": 11043, + "time_per_iteration": 2.675340414047241 + }, + { + "auxiliary_loss_clip": 0.01086583, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.03460288, + "balance_loss_mlp": 1.01740384, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.5174325475980825, + "language_loss": 0.8397066, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86084485, + "num_input_tokens_seen": 238364650, + "step": 11044, + "time_per_iteration": 2.788261890411377 + }, + { + "auxiliary_loss_clip": 0.01058934, + "auxiliary_loss_mlp": 0.01029638, + "balance_loss_clip": 1.03071773, + "balance_loss_mlp": 1.01777101, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 2.276109825366186, + "language_loss": 0.69674325, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71762896, + "num_input_tokens_seen": 238381630, + "step": 11045, + "time_per_iteration": 2.72420597076416 + }, + { + "auxiliary_loss_clip": 0.01091048, + "auxiliary_loss_mlp": 0.01024948, + "balance_loss_clip": 1.03645277, + "balance_loss_mlp": 1.01381969, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.5830484310434152, + "language_loss": 0.64479971, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66595972, + "num_input_tokens_seen": 238402595, + "step": 11046, + "time_per_iteration": 2.74701189994812 + }, + { + "auxiliary_loss_clip": 0.01070768, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.03518343, + "balance_loss_mlp": 1.01469111, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.5793823861794865, + "language_loss": 0.71430063, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73526996, + "num_input_tokens_seen": 238426860, + "step": 11047, + "time_per_iteration": 2.9323971271514893 + }, + { + "auxiliary_loss_clip": 0.0103794, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.03681481, + "balance_loss_mlp": 1.02208519, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.85511105884159, + "language_loss": 0.77245981, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79317778, + "num_input_tokens_seen": 238443990, + "step": 11048, + "time_per_iteration": 4.295028448104858 + }, + { + "auxiliary_loss_clip": 0.00999274, + "auxiliary_loss_mlp": 0.00998989, + "balance_loss_clip": 1.00762582, + "balance_loss_mlp": 0.99785072, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7931778675519446, + "language_loss": 0.5502556, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57023823, + "num_input_tokens_seen": 238503045, + "step": 11049, + "time_per_iteration": 3.302079677581787 + }, + { + "auxiliary_loss_clip": 0.01087686, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.03393102, + "balance_loss_mlp": 1.01860404, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.6287314752240247, + "language_loss": 0.640863, + "learning_rate": 1.069691638104648e-06, + "loss": 0.6620298, + "num_input_tokens_seen": 238527320, + "step": 11050, + "time_per_iteration": 2.8565237522125244 + }, + { + "auxiliary_loss_clip": 0.01096104, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.03325927, + "balance_loss_mlp": 1.02063882, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.773049969346766, + "language_loss": 0.78950799, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.81078148, + "num_input_tokens_seen": 238546030, + "step": 11051, + "time_per_iteration": 2.7567250728607178 + }, + { + "auxiliary_loss_clip": 0.01079927, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.03771114, + "balance_loss_mlp": 1.02111793, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 8.003119940506023, + "language_loss": 0.85177743, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87289828, + "num_input_tokens_seen": 238564175, + "step": 11052, + "time_per_iteration": 2.8670265674591064 + }, + { + "auxiliary_loss_clip": 0.01051908, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.03164506, + "balance_loss_mlp": 1.02209449, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.3642559095629476, + "language_loss": 0.74702245, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.76789463, + "num_input_tokens_seen": 238581010, + "step": 11053, + "time_per_iteration": 2.856109380722046 + }, + { + "auxiliary_loss_clip": 0.01067251, + "auxiliary_loss_mlp": 0.01027901, + "balance_loss_clip": 1.03264034, + "balance_loss_mlp": 1.01724935, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 2.0687435769234037, + "language_loss": 0.79321432, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81416583, + "num_input_tokens_seen": 238601365, + "step": 11054, + "time_per_iteration": 2.7639243602752686 + }, + { + "auxiliary_loss_clip": 0.01054161, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.033553, + "balance_loss_mlp": 1.0205307, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.6392142388796944, + "language_loss": 0.74227166, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76312459, + "num_input_tokens_seen": 238619850, + "step": 11055, + "time_per_iteration": 2.8342764377593994 + }, + { + "auxiliary_loss_clip": 0.01063816, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.03225648, + "balance_loss_mlp": 1.02426314, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 2.2325076722648767, + "language_loss": 0.72628719, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.74729055, + "num_input_tokens_seen": 238637635, + "step": 11056, + "time_per_iteration": 2.78950572013855 + }, + { + "auxiliary_loss_clip": 0.01054892, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.03179157, + "balance_loss_mlp": 1.02103376, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 1.940174642279893, + "language_loss": 0.69815308, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71902341, + "num_input_tokens_seen": 238656200, + "step": 11057, + "time_per_iteration": 2.837636947631836 + }, + { + "auxiliary_loss_clip": 0.01089888, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.0351783, + "balance_loss_mlp": 1.01761985, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 3.880432487028144, + "language_loss": 0.80588222, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82706743, + "num_input_tokens_seen": 238675005, + "step": 11058, + "time_per_iteration": 2.8507280349731445 + }, + { + "auxiliary_loss_clip": 0.0099947, + "auxiliary_loss_mlp": 0.00996098, + "balance_loss_clip": 1.00820994, + "balance_loss_mlp": 0.99501956, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.8174298206947519, + "language_loss": 0.62606919, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64602488, + "num_input_tokens_seen": 238731425, + "step": 11059, + "time_per_iteration": 3.1701574325561523 + }, + { + "auxiliary_loss_clip": 0.01085644, + "auxiliary_loss_mlp": 0.0103945, + "balance_loss_clip": 1.03279293, + "balance_loss_mlp": 1.0291326, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 2.7525059401176586, + "language_loss": 0.78777635, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.80902731, + "num_input_tokens_seen": 238752020, + "step": 11060, + "time_per_iteration": 2.770003080368042 + }, + { + "auxiliary_loss_clip": 0.01065973, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.03338945, + "balance_loss_mlp": 1.02200067, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.8008032081953884, + "language_loss": 0.78500557, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.80600321, + "num_input_tokens_seen": 238769665, + "step": 11061, + "time_per_iteration": 2.8059098720550537 + }, + { + "auxiliary_loss_clip": 0.01078077, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.03676057, + "balance_loss_mlp": 1.01792967, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.3927040312485595, + "language_loss": 0.56605649, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.5871228, + "num_input_tokens_seen": 238782180, + "step": 11062, + "time_per_iteration": 4.344768047332764 + }, + { + "auxiliary_loss_clip": 0.01083155, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.03080952, + "balance_loss_mlp": 1.02257347, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.7935335280412512, + "language_loss": 0.76012892, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78132534, + "num_input_tokens_seen": 238800315, + "step": 11063, + "time_per_iteration": 2.7956702709198 + }, + { + "auxiliary_loss_clip": 0.01040167, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.03394294, + "balance_loss_mlp": 1.02202976, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.2962400638186546, + "language_loss": 0.70465642, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72539723, + "num_input_tokens_seen": 238822250, + "step": 11064, + "time_per_iteration": 2.961395025253296 + }, + { + "auxiliary_loss_clip": 0.01024073, + "auxiliary_loss_mlp": 0.01007564, + "balance_loss_clip": 1.00344825, + "balance_loss_mlp": 1.00661647, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8514365875161535, + "language_loss": 0.63025576, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65057212, + "num_input_tokens_seen": 238877190, + "step": 11065, + "time_per_iteration": 3.1265628337860107 + }, + { + "auxiliary_loss_clip": 0.01081385, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.03210378, + "balance_loss_mlp": 1.02462435, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 2.243991002165702, + "language_loss": 0.62537473, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64656258, + "num_input_tokens_seen": 238896010, + "step": 11066, + "time_per_iteration": 2.7733545303344727 + }, + { + "auxiliary_loss_clip": 0.01056268, + "auxiliary_loss_mlp": 0.01036395, + "balance_loss_clip": 1.02843451, + "balance_loss_mlp": 1.02331769, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.601663721091856, + "language_loss": 0.70059627, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72152293, + "num_input_tokens_seen": 238918990, + "step": 11067, + "time_per_iteration": 2.83358097076416 + }, + { + "auxiliary_loss_clip": 0.01006897, + "auxiliary_loss_mlp": 0.01013795, + "balance_loss_clip": 1.00604534, + "balance_loss_mlp": 1.01243651, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.913590238846598, + "language_loss": 0.72087979, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74108672, + "num_input_tokens_seen": 238975735, + "step": 11068, + "time_per_iteration": 3.317080497741699 + }, + { + "auxiliary_loss_clip": 0.00994313, + "auxiliary_loss_mlp": 0.01002037, + "balance_loss_clip": 1.00344193, + "balance_loss_mlp": 1.0009644, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7103131062600727, + "language_loss": 0.57821476, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59817827, + "num_input_tokens_seen": 239042360, + "step": 11069, + "time_per_iteration": 3.386784076690674 + }, + { + "auxiliary_loss_clip": 0.01002958, + "auxiliary_loss_mlp": 0.01001379, + "balance_loss_clip": 1.00367761, + "balance_loss_mlp": 1.00035369, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7493405080692737, + "language_loss": 0.6352843, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65532768, + "num_input_tokens_seen": 239109410, + "step": 11070, + "time_per_iteration": 3.2720203399658203 + }, + { + "auxiliary_loss_clip": 0.01096996, + "auxiliary_loss_mlp": 0.01023874, + "balance_loss_clip": 1.03258133, + "balance_loss_mlp": 1.01300263, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.6850189016797805, + "language_loss": 0.58446038, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60566902, + "num_input_tokens_seen": 239135345, + "step": 11071, + "time_per_iteration": 2.817500352859497 + }, + { + "auxiliary_loss_clip": 0.01091501, + "auxiliary_loss_mlp": 0.01026852, + "balance_loss_clip": 1.03674054, + "balance_loss_mlp": 1.01664782, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 1.9122982249269604, + "language_loss": 0.72932148, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75050509, + "num_input_tokens_seen": 239154340, + "step": 11072, + "time_per_iteration": 2.777944803237915 + }, + { + "auxiliary_loss_clip": 0.01087713, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.03451133, + "balance_loss_mlp": 1.01823831, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 1.910565065580779, + "language_loss": 0.70788687, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72906053, + "num_input_tokens_seen": 239177815, + "step": 11073, + "time_per_iteration": 4.4347944259643555 + }, + { + "auxiliary_loss_clip": 0.01069667, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.03574729, + "balance_loss_mlp": 1.01817143, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 2.0458219978461805, + "language_loss": 0.56424642, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58523625, + "num_input_tokens_seen": 239195735, + "step": 11074, + "time_per_iteration": 2.854903221130371 + }, + { + "auxiliary_loss_clip": 0.01100024, + "auxiliary_loss_mlp": 0.00749282, + "balance_loss_clip": 1.03536201, + "balance_loss_mlp": 1.00030649, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.4329773727324344, + "language_loss": 0.72236741, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74086046, + "num_input_tokens_seen": 239217535, + "step": 11075, + "time_per_iteration": 2.813063383102417 + }, + { + "auxiliary_loss_clip": 0.01085933, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.03368521, + "balance_loss_mlp": 1.01834011, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.6766974491014448, + "language_loss": 0.65772367, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.67887288, + "num_input_tokens_seen": 239241975, + "step": 11076, + "time_per_iteration": 2.870561361312866 + }, + { + "auxiliary_loss_clip": 0.01071176, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.03038073, + "balance_loss_mlp": 1.01911879, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.6678022136453776, + "language_loss": 0.75319135, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77421463, + "num_input_tokens_seen": 239262025, + "step": 11077, + "time_per_iteration": 4.314013481140137 + }, + { + "auxiliary_loss_clip": 0.01076261, + "auxiliary_loss_mlp": 0.01026511, + "balance_loss_clip": 1.03233862, + "balance_loss_mlp": 1.01573455, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.5713957386588489, + "language_loss": 0.66624337, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68727112, + "num_input_tokens_seen": 239282775, + "step": 11078, + "time_per_iteration": 2.83170485496521 + }, + { + "auxiliary_loss_clip": 0.01100071, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.03375864, + "balance_loss_mlp": 1.019117, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 1.9781942430029371, + "language_loss": 0.6950106, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71632069, + "num_input_tokens_seen": 239299775, + "step": 11079, + "time_per_iteration": 2.7682383060455322 + }, + { + "auxiliary_loss_clip": 0.01074531, + "auxiliary_loss_mlp": 0.01022232, + "balance_loss_clip": 1.03162944, + "balance_loss_mlp": 1.01162243, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.5517683258642396, + "language_loss": 0.80338579, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.8243534, + "num_input_tokens_seen": 239319660, + "step": 11080, + "time_per_iteration": 2.8384640216827393 + }, + { + "auxiliary_loss_clip": 0.01059639, + "auxiliary_loss_mlp": 0.01024413, + "balance_loss_clip": 1.03040326, + "balance_loss_mlp": 1.01407754, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.8860933436398415, + "language_loss": 0.7816723, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80251276, + "num_input_tokens_seen": 239339215, + "step": 11081, + "time_per_iteration": 2.8085713386535645 + }, + { + "auxiliary_loss_clip": 0.01057879, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.03082871, + "balance_loss_mlp": 1.02407849, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.9642619322876955, + "language_loss": 0.79950595, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82046372, + "num_input_tokens_seen": 239358545, + "step": 11082, + "time_per_iteration": 2.8489511013031006 + }, + { + "auxiliary_loss_clip": 0.01058252, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.03161693, + "balance_loss_mlp": 1.01981258, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.508388039346413, + "language_loss": 0.84025919, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86114508, + "num_input_tokens_seen": 239376665, + "step": 11083, + "time_per_iteration": 2.8022053241729736 + }, + { + "auxiliary_loss_clip": 0.01067191, + "auxiliary_loss_mlp": 0.01028704, + "balance_loss_clip": 1.03593683, + "balance_loss_mlp": 1.01693797, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.265681090597724, + "language_loss": 0.85180748, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87276638, + "num_input_tokens_seen": 239394345, + "step": 11084, + "time_per_iteration": 2.8625245094299316 + }, + { + "auxiliary_loss_clip": 0.01079487, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.03289771, + "balance_loss_mlp": 1.01794183, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.9924744451652034, + "language_loss": 0.73352873, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75462222, + "num_input_tokens_seen": 239410605, + "step": 11085, + "time_per_iteration": 2.7593960762023926 + }, + { + "auxiliary_loss_clip": 0.01074945, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.03212404, + "balance_loss_mlp": 1.01898706, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.877876755704063, + "language_loss": 0.80543947, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82649362, + "num_input_tokens_seen": 239427155, + "step": 11086, + "time_per_iteration": 2.783123016357422 + }, + { + "auxiliary_loss_clip": 0.01076342, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.0338558, + "balance_loss_mlp": 1.01746154, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 1.8577859686828626, + "language_loss": 0.74806309, + "learning_rate": 1.056959663258702e-06, + "loss": 0.76911718, + "num_input_tokens_seen": 239445510, + "step": 11087, + "time_per_iteration": 4.283452749252319 + }, + { + "auxiliary_loss_clip": 0.01086426, + "auxiliary_loss_mlp": 0.01027144, + "balance_loss_clip": 1.0324719, + "balance_loss_mlp": 1.01553941, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.6442672253105288, + "language_loss": 0.64808387, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.66921961, + "num_input_tokens_seen": 239464805, + "step": 11088, + "time_per_iteration": 2.7378785610198975 + }, + { + "auxiliary_loss_clip": 0.01084856, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.03279388, + "balance_loss_mlp": 1.0174458, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 1.8606851315416908, + "language_loss": 0.6419239, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66306216, + "num_input_tokens_seen": 239483890, + "step": 11089, + "time_per_iteration": 2.683946132659912 + }, + { + "auxiliary_loss_clip": 0.01096591, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.03329539, + "balance_loss_mlp": 1.01964092, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.4870716790026504, + "language_loss": 0.81256926, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83383924, + "num_input_tokens_seen": 239500080, + "step": 11090, + "time_per_iteration": 2.731825113296509 + }, + { + "auxiliary_loss_clip": 0.01076293, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.0319407, + "balance_loss_mlp": 1.01904869, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 2.037803827067379, + "language_loss": 0.77730894, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79837441, + "num_input_tokens_seen": 239517335, + "step": 11091, + "time_per_iteration": 2.7625064849853516 + }, + { + "auxiliary_loss_clip": 0.01097821, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.0333643, + "balance_loss_mlp": 1.02145076, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.939096406899247, + "language_loss": 0.79365128, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81495667, + "num_input_tokens_seen": 239536240, + "step": 11092, + "time_per_iteration": 2.690664291381836 + }, + { + "auxiliary_loss_clip": 0.0100557, + "auxiliary_loss_mlp": 0.0100217, + "balance_loss_clip": 1.01267314, + "balance_loss_mlp": 1.00103188, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7627706568968426, + "language_loss": 0.57671183, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59678924, + "num_input_tokens_seen": 239598000, + "step": 11093, + "time_per_iteration": 3.257948398590088 + }, + { + "auxiliary_loss_clip": 0.01098771, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.03480649, + "balance_loss_mlp": 1.01722574, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 1.6883323300332582, + "language_loss": 0.76416379, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78543717, + "num_input_tokens_seen": 239617650, + "step": 11094, + "time_per_iteration": 2.718329429626465 + }, + { + "auxiliary_loss_clip": 0.01098222, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.03361917, + "balance_loss_mlp": 1.01897478, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.7433498513272514, + "language_loss": 0.7350986, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75638676, + "num_input_tokens_seen": 239639825, + "step": 11095, + "time_per_iteration": 2.8064849376678467 + }, + { + "auxiliary_loss_clip": 0.01085877, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.03418696, + "balance_loss_mlp": 1.02218676, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.1504234311269, + "language_loss": 0.73076558, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75195718, + "num_input_tokens_seen": 239656300, + "step": 11096, + "time_per_iteration": 2.7239110469818115 + }, + { + "auxiliary_loss_clip": 0.01054951, + "auxiliary_loss_mlp": 0.01030052, + "balance_loss_clip": 1.03219151, + "balance_loss_mlp": 1.01823807, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 1.7563666783694003, + "language_loss": 0.63920021, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66005015, + "num_input_tokens_seen": 239676655, + "step": 11097, + "time_per_iteration": 2.7800164222717285 + }, + { + "auxiliary_loss_clip": 0.01089556, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.03514862, + "balance_loss_mlp": 1.02166653, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.9107069381937722, + "language_loss": 0.75513238, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77635717, + "num_input_tokens_seen": 239695430, + "step": 11098, + "time_per_iteration": 2.7223286628723145 + }, + { + "auxiliary_loss_clip": 0.01100469, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.03515244, + "balance_loss_mlp": 1.01892841, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.5085591117715655, + "language_loss": 0.7410202, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76232147, + "num_input_tokens_seen": 239717070, + "step": 11099, + "time_per_iteration": 2.687687635421753 + }, + { + "auxiliary_loss_clip": 0.01084504, + "auxiliary_loss_mlp": 0.01034087, + "balance_loss_clip": 1.0316925, + "balance_loss_mlp": 1.02272654, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.894373760610913, + "language_loss": 0.78205687, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.8032428, + "num_input_tokens_seen": 239737105, + "step": 11100, + "time_per_iteration": 2.7178330421447754 + }, + { + "auxiliary_loss_clip": 0.01096989, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.03349185, + "balance_loss_mlp": 1.02489054, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 1.9633689404111876, + "language_loss": 0.60171944, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62304908, + "num_input_tokens_seen": 239757835, + "step": 11101, + "time_per_iteration": 2.705827474594116 + }, + { + "auxiliary_loss_clip": 0.01083925, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.03525603, + "balance_loss_mlp": 1.02015173, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.7890907946027288, + "language_loss": 0.71509624, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73625779, + "num_input_tokens_seen": 239775425, + "step": 11102, + "time_per_iteration": 4.216434717178345 + }, + { + "auxiliary_loss_clip": 0.01087657, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.03146505, + "balance_loss_mlp": 1.01888585, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.3907284919503382, + "language_loss": 0.84462368, + "learning_rate": 1.051469068021034e-06, + "loss": 0.86579758, + "num_input_tokens_seen": 239794605, + "step": 11103, + "time_per_iteration": 2.596569776535034 + }, + { + "auxiliary_loss_clip": 0.01075888, + "auxiliary_loss_mlp": 0.01026693, + "balance_loss_clip": 1.03122842, + "balance_loss_mlp": 1.01591015, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 1.8404831463490272, + "language_loss": 0.77584696, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.79687274, + "num_input_tokens_seen": 239812135, + "step": 11104, + "time_per_iteration": 2.7150707244873047 + }, + { + "auxiliary_loss_clip": 0.01052257, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.032897, + "balance_loss_mlp": 1.01966333, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.5841986621729738, + "language_loss": 0.57983696, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60066587, + "num_input_tokens_seen": 239835845, + "step": 11105, + "time_per_iteration": 2.93713641166687 + }, + { + "auxiliary_loss_clip": 0.01092074, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.03424811, + "balance_loss_mlp": 1.01914907, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.7483365916647582, + "language_loss": 0.72955376, + "learning_rate": 1.0504406049066e-06, + "loss": 0.7507869, + "num_input_tokens_seen": 239853820, + "step": 11106, + "time_per_iteration": 2.8098394870758057 + }, + { + "auxiliary_loss_clip": 0.01097932, + "auxiliary_loss_mlp": 0.01026088, + "balance_loss_clip": 1.03350818, + "balance_loss_mlp": 1.01483464, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.5864780452892158, + "language_loss": 0.76382208, + "learning_rate": 1.0500978558659e-06, + "loss": 0.78506225, + "num_input_tokens_seen": 239873365, + "step": 11107, + "time_per_iteration": 2.7094693183898926 + }, + { + "auxiliary_loss_clip": 0.01073663, + "auxiliary_loss_mlp": 0.01027349, + "balance_loss_clip": 1.03179264, + "balance_loss_mlp": 1.01666808, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.462558010213252, + "language_loss": 0.9023242, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92333436, + "num_input_tokens_seen": 239891215, + "step": 11108, + "time_per_iteration": 2.774437189102173 + }, + { + "auxiliary_loss_clip": 0.01067861, + "auxiliary_loss_mlp": 0.0102291, + "balance_loss_clip": 1.03588402, + "balance_loss_mlp": 1.0129025, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.5307252019870725, + "language_loss": 0.82778114, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84868884, + "num_input_tokens_seen": 239913490, + "step": 11109, + "time_per_iteration": 2.9922752380371094 + }, + { + "auxiliary_loss_clip": 0.01072063, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.03171015, + "balance_loss_mlp": 1.0180788, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 1.9356163106995028, + "language_loss": 0.69290531, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.7139222, + "num_input_tokens_seen": 239931565, + "step": 11110, + "time_per_iteration": 2.7030675411224365 + }, + { + "auxiliary_loss_clip": 0.01071272, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.03317308, + "balance_loss_mlp": 1.02054572, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.5212808293985531, + "language_loss": 0.73406637, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75510973, + "num_input_tokens_seen": 239952395, + "step": 11111, + "time_per_iteration": 2.7483723163604736 + }, + { + "auxiliary_loss_clip": 0.01095035, + "auxiliary_loss_mlp": 0.01027683, + "balance_loss_clip": 1.03211725, + "balance_loss_mlp": 1.01676965, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 3.414947658246405, + "language_loss": 0.65698677, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67821395, + "num_input_tokens_seen": 239968910, + "step": 11112, + "time_per_iteration": 2.64013934135437 + }, + { + "auxiliary_loss_clip": 0.01070735, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.03054953, + "balance_loss_mlp": 1.01928139, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 2.0161306976161577, + "language_loss": 0.62995481, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65096945, + "num_input_tokens_seen": 239987680, + "step": 11113, + "time_per_iteration": 4.291936874389648 + }, + { + "auxiliary_loss_clip": 0.01054681, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.03333294, + "balance_loss_mlp": 1.01867771, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.8970502521506296, + "language_loss": 0.65539944, + "learning_rate": 1.047699621879422e-06, + "loss": 0.67624021, + "num_input_tokens_seen": 240005790, + "step": 11114, + "time_per_iteration": 2.8225955963134766 + }, + { + "auxiliary_loss_clip": 0.01087894, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.03256631, + "balance_loss_mlp": 1.02362275, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.5821074738232763, + "language_loss": 0.78403425, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80526066, + "num_input_tokens_seen": 240025895, + "step": 11115, + "time_per_iteration": 2.6787803173065186 + }, + { + "auxiliary_loss_clip": 0.01040105, + "auxiliary_loss_mlp": 0.00749367, + "balance_loss_clip": 1.02698946, + "balance_loss_mlp": 1.00034761, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.6671710963633157, + "language_loss": 0.79740047, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81529522, + "num_input_tokens_seen": 240044880, + "step": 11116, + "time_per_iteration": 2.8224077224731445 + }, + { + "auxiliary_loss_clip": 0.01070302, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.0342977, + "balance_loss_mlp": 1.02246964, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 2.1489801465796043, + "language_loss": 0.79192436, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.8129729, + "num_input_tokens_seen": 240065785, + "step": 11117, + "time_per_iteration": 4.308102607727051 + }, + { + "auxiliary_loss_clip": 0.01055133, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.0347296, + "balance_loss_mlp": 1.016379, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 1.8279760561825844, + "language_loss": 0.65914398, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67998314, + "num_input_tokens_seen": 240085130, + "step": 11118, + "time_per_iteration": 2.7308413982391357 + }, + { + "auxiliary_loss_clip": 0.01075032, + "auxiliary_loss_mlp": 0.01026808, + "balance_loss_clip": 1.03216195, + "balance_loss_mlp": 1.01634121, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 1.5437683503178246, + "language_loss": 0.68806499, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.70908344, + "num_input_tokens_seen": 240105495, + "step": 11119, + "time_per_iteration": 2.7194571495056152 + }, + { + "auxiliary_loss_clip": 0.01069757, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.03144443, + "balance_loss_mlp": 1.01798677, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 2.011358172888902, + "language_loss": 0.67379332, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69478273, + "num_input_tokens_seen": 240125455, + "step": 11120, + "time_per_iteration": 2.7585818767547607 + }, + { + "auxiliary_loss_clip": 0.01065383, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.03205383, + "balance_loss_mlp": 1.02042472, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 2.58965938646283, + "language_loss": 0.71977419, + "learning_rate": 1.045303157347638e-06, + "loss": 0.74075043, + "num_input_tokens_seen": 240143870, + "step": 11121, + "time_per_iteration": 2.7484753131866455 + }, + { + "auxiliary_loss_clip": 0.01073627, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.03094947, + "balance_loss_mlp": 1.02226698, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 3.2516970905854534, + "language_loss": 0.69378769, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.71486437, + "num_input_tokens_seen": 240161020, + "step": 11122, + "time_per_iteration": 2.6434166431427 + }, + { + "auxiliary_loss_clip": 0.01035487, + "auxiliary_loss_mlp": 0.00749345, + "balance_loss_clip": 1.02933538, + "balance_loss_mlp": 1.00031352, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.6849868400953123, + "language_loss": 0.71606994, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73391825, + "num_input_tokens_seen": 240179820, + "step": 11123, + "time_per_iteration": 2.852972984313965 + }, + { + "auxiliary_loss_clip": 0.01080059, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.03459597, + "balance_loss_mlp": 1.02465391, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.7079604325855897, + "language_loss": 0.79048657, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81165195, + "num_input_tokens_seen": 240200130, + "step": 11124, + "time_per_iteration": 2.7354016304016113 + }, + { + "auxiliary_loss_clip": 0.01074353, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.03541672, + "balance_loss_mlp": 1.02571547, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 2.4794483113608003, + "language_loss": 0.74206364, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76318204, + "num_input_tokens_seen": 240217945, + "step": 11125, + "time_per_iteration": 2.733055591583252 + }, + { + "auxiliary_loss_clip": 0.01058473, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.03273916, + "balance_loss_mlp": 1.02454138, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.2560988839827454, + "language_loss": 0.66525686, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68620145, + "num_input_tokens_seen": 240237220, + "step": 11126, + "time_per_iteration": 2.77960467338562 + }, + { + "auxiliary_loss_clip": 0.01078577, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.03052592, + "balance_loss_mlp": 1.01556897, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 1.8650037311951746, + "language_loss": 0.7117148, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73277158, + "num_input_tokens_seen": 240256000, + "step": 11127, + "time_per_iteration": 4.184763431549072 + }, + { + "auxiliary_loss_clip": 0.01078626, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.03193569, + "balance_loss_mlp": 1.01961124, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 2.0751070787799426, + "language_loss": 0.80236125, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82346934, + "num_input_tokens_seen": 240275845, + "step": 11128, + "time_per_iteration": 2.7815804481506348 + }, + { + "auxiliary_loss_clip": 0.01099332, + "auxiliary_loss_mlp": 0.01025541, + "balance_loss_clip": 1.03376555, + "balance_loss_mlp": 1.01427603, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.7819410135156142, + "language_loss": 0.80798757, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.82923627, + "num_input_tokens_seen": 240294095, + "step": 11129, + "time_per_iteration": 2.6512296199798584 + }, + { + "auxiliary_loss_clip": 0.0107603, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.03103364, + "balance_loss_mlp": 1.0221709, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 2.4786690847653112, + "language_loss": 0.70339155, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72448117, + "num_input_tokens_seen": 240313460, + "step": 11130, + "time_per_iteration": 2.762791872024536 + }, + { + "auxiliary_loss_clip": 0.01071304, + "auxiliary_loss_mlp": 0.01032959, + "balance_loss_clip": 1.03208077, + "balance_loss_mlp": 1.02259946, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.6544539979781197, + "language_loss": 0.70080692, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.7218495, + "num_input_tokens_seen": 240333540, + "step": 11131, + "time_per_iteration": 2.7153406143188477 + }, + { + "auxiliary_loss_clip": 0.01090602, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.01737976, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.4820443997033883, + "language_loss": 0.65621066, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.67741841, + "num_input_tokens_seen": 240350085, + "step": 11132, + "time_per_iteration": 2.5141968727111816 + }, + { + "auxiliary_loss_clip": 0.01084056, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.0307591, + "balance_loss_mlp": 1.01887798, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.6806696834493624, + "language_loss": 0.74384344, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76499689, + "num_input_tokens_seen": 240370015, + "step": 11133, + "time_per_iteration": 2.5473663806915283 + }, + { + "auxiliary_loss_clip": 0.01094206, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.03727436, + "balance_loss_mlp": 1.01766133, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 2.2496538353707805, + "language_loss": 0.6651665, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68640614, + "num_input_tokens_seen": 240390770, + "step": 11134, + "time_per_iteration": 2.6183393001556396 + }, + { + "auxiliary_loss_clip": 0.01087641, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.03461576, + "balance_loss_mlp": 1.02076483, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 2.650526716001629, + "language_loss": 0.77615452, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79736602, + "num_input_tokens_seen": 240409590, + "step": 11135, + "time_per_iteration": 2.59456205368042 + }, + { + "auxiliary_loss_clip": 0.01086637, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.03298569, + "balance_loss_mlp": 1.016891, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.6824292627071373, + "language_loss": 0.73950827, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76065946, + "num_input_tokens_seen": 240428180, + "step": 11136, + "time_per_iteration": 2.653461217880249 + }, + { + "auxiliary_loss_clip": 0.01093765, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.03615689, + "balance_loss_mlp": 1.01899242, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.86755401412511, + "language_loss": 0.6220969, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.6433475, + "num_input_tokens_seen": 240447815, + "step": 11137, + "time_per_iteration": 2.6325950622558594 + }, + { + "auxiliary_loss_clip": 0.01099227, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.03470922, + "balance_loss_mlp": 1.01558995, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 1.7712920003324264, + "language_loss": 0.66122603, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.68248785, + "num_input_tokens_seen": 240468635, + "step": 11138, + "time_per_iteration": 2.5693211555480957 + }, + { + "auxiliary_loss_clip": 0.01061051, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.03033507, + "balance_loss_mlp": 1.0271616, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.7437288928844017, + "language_loss": 0.72863591, + "learning_rate": 1.039148976175053e-06, + "loss": 0.74963295, + "num_input_tokens_seen": 240488550, + "step": 11139, + "time_per_iteration": 2.6950716972351074 + }, + { + "auxiliary_loss_clip": 0.01057407, + "auxiliary_loss_mlp": 0.01028573, + "balance_loss_clip": 1.0309037, + "balance_loss_mlp": 1.01829147, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 2.1250322471318643, + "language_loss": 0.70511597, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72597575, + "num_input_tokens_seen": 240508330, + "step": 11140, + "time_per_iteration": 2.671198844909668 + }, + { + "auxiliary_loss_clip": 0.0108764, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.03125072, + "balance_loss_mlp": 1.01439261, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 2.08167018306097, + "language_loss": 0.75591582, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77705121, + "num_input_tokens_seen": 240528470, + "step": 11141, + "time_per_iteration": 2.6772477626800537 + }, + { + "auxiliary_loss_clip": 0.01091049, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.03523362, + "balance_loss_mlp": 1.01976609, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.9176293077967959, + "language_loss": 0.82019228, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84141862, + "num_input_tokens_seen": 240547815, + "step": 11142, + "time_per_iteration": 4.037845611572266 + }, + { + "auxiliary_loss_clip": 0.01049645, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.03129292, + "balance_loss_mlp": 1.01796269, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.4735037257668786, + "language_loss": 0.69859546, + "learning_rate": 1.037782980862959e-06, + "loss": 0.7193861, + "num_input_tokens_seen": 240567765, + "step": 11143, + "time_per_iteration": 2.6554152965545654 + }, + { + "auxiliary_loss_clip": 0.01056856, + "auxiliary_loss_mlp": 0.00749355, + "balance_loss_clip": 1.03233957, + "balance_loss_mlp": 1.00033998, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.4087794427343052, + "language_loss": 0.7014389, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71950102, + "num_input_tokens_seen": 240590750, + "step": 11144, + "time_per_iteration": 2.739175319671631 + }, + { + "auxiliary_loss_clip": 0.01071345, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.0308044, + "balance_loss_mlp": 1.01835036, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.6902665971806747, + "language_loss": 0.74375558, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76476932, + "num_input_tokens_seen": 240608875, + "step": 11145, + "time_per_iteration": 2.6543164253234863 + }, + { + "auxiliary_loss_clip": 0.01079728, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.03346884, + "balance_loss_mlp": 1.015306, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.4711386862529603, + "language_loss": 0.70627081, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.72734058, + "num_input_tokens_seen": 240628565, + "step": 11146, + "time_per_iteration": 2.606778383255005 + }, + { + "auxiliary_loss_clip": 0.0109465, + "auxiliary_loss_mlp": 0.0074919, + "balance_loss_clip": 1.03307414, + "balance_loss_mlp": 1.00025964, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.8748189036980414, + "language_loss": 0.7857821, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.8042205, + "num_input_tokens_seen": 240646325, + "step": 11147, + "time_per_iteration": 2.4913487434387207 + }, + { + "auxiliary_loss_clip": 0.01089725, + "auxiliary_loss_mlp": 0.0074922, + "balance_loss_clip": 1.03513646, + "balance_loss_mlp": 1.00029933, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.8366940879230713, + "language_loss": 0.70389616, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72228557, + "num_input_tokens_seen": 240666145, + "step": 11148, + "time_per_iteration": 2.5185673236846924 + }, + { + "auxiliary_loss_clip": 0.01077093, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.0312295, + "balance_loss_mlp": 1.02046299, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 2.3437599707103383, + "language_loss": 0.70138311, + "learning_rate": 1.035735082774636e-06, + "loss": 0.7224679, + "num_input_tokens_seen": 240685570, + "step": 11149, + "time_per_iteration": 2.617357015609741 + }, + { + "auxiliary_loss_clip": 0.0107919, + "auxiliary_loss_mlp": 0.01026548, + "balance_loss_clip": 1.03224242, + "balance_loss_mlp": 1.01597989, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 1.6899481898336033, + "language_loss": 0.73820072, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75925809, + "num_input_tokens_seen": 240706945, + "step": 11150, + "time_per_iteration": 2.6520233154296875 + }, + { + "auxiliary_loss_clip": 0.01089801, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.03646326, + "balance_loss_mlp": 1.02284312, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.8073859427070744, + "language_loss": 0.78079021, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80202991, + "num_input_tokens_seen": 240727990, + "step": 11151, + "time_per_iteration": 2.660128116607666 + }, + { + "auxiliary_loss_clip": 0.00977768, + "auxiliary_loss_mlp": 0.01004477, + "balance_loss_clip": 1.00567305, + "balance_loss_mlp": 1.00343359, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.7899861826664939, + "language_loss": 0.55462861, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57445103, + "num_input_tokens_seen": 240790380, + "step": 11152, + "time_per_iteration": 3.2949283123016357 + }, + { + "auxiliary_loss_clip": 0.0107745, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.03476334, + "balance_loss_mlp": 1.01933503, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.4931011163329837, + "language_loss": 0.80602384, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.82710689, + "num_input_tokens_seen": 240811545, + "step": 11153, + "time_per_iteration": 2.675487756729126 + }, + { + "auxiliary_loss_clip": 0.01061818, + "auxiliary_loss_mlp": 0.00749368, + "balance_loss_clip": 1.03328156, + "balance_loss_mlp": 1.00029016, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.5062566637986234, + "language_loss": 0.76113057, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.77924246, + "num_input_tokens_seen": 240831380, + "step": 11154, + "time_per_iteration": 4.2477498054504395 + }, + { + "auxiliary_loss_clip": 0.01076827, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.03263283, + "balance_loss_mlp": 1.02276504, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.5625840729138047, + "language_loss": 0.76372749, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78484333, + "num_input_tokens_seen": 240851855, + "step": 11155, + "time_per_iteration": 2.6178858280181885 + }, + { + "auxiliary_loss_clip": 0.01103033, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.03843474, + "balance_loss_mlp": 1.02004862, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 1.82234610689359, + "language_loss": 0.82136023, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84270728, + "num_input_tokens_seen": 240869980, + "step": 11156, + "time_per_iteration": 2.6134519577026367 + }, + { + "auxiliary_loss_clip": 0.01099458, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.03490615, + "balance_loss_mlp": 1.01847577, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.0900541195577715, + "language_loss": 0.74906993, + "learning_rate": 1.033006600114165e-06, + "loss": 0.77035677, + "num_input_tokens_seen": 240888680, + "step": 11157, + "time_per_iteration": 4.106576919555664 + }, + { + "auxiliary_loss_clip": 0.0109595, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.03842771, + "balance_loss_mlp": 1.02312446, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.9966577407782313, + "language_loss": 0.74245358, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.7637645, + "num_input_tokens_seen": 240909050, + "step": 11158, + "time_per_iteration": 2.7024168968200684 + }, + { + "auxiliary_loss_clip": 0.01103379, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.03629613, + "balance_loss_mlp": 1.02186692, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 1.9506750276510985, + "language_loss": 0.81586576, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83723503, + "num_input_tokens_seen": 240930035, + "step": 11159, + "time_per_iteration": 2.6357033252716064 + }, + { + "auxiliary_loss_clip": 0.01078916, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.03310084, + "balance_loss_mlp": 1.01782227, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.6002487427507726, + "language_loss": 0.76554167, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.78662133, + "num_input_tokens_seen": 240948895, + "step": 11160, + "time_per_iteration": 2.6477599143981934 + }, + { + "auxiliary_loss_clip": 0.01075147, + "auxiliary_loss_mlp": 0.01026288, + "balance_loss_clip": 1.03440261, + "balance_loss_mlp": 1.01572585, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.6786607098537554, + "language_loss": 0.73533762, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75635195, + "num_input_tokens_seen": 240967770, + "step": 11161, + "time_per_iteration": 2.6959517002105713 + }, + { + "auxiliary_loss_clip": 0.01077615, + "auxiliary_loss_mlp": 0.01034091, + "balance_loss_clip": 1.03160024, + "balance_loss_mlp": 1.02146101, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.5910697656909896, + "language_loss": 0.6818471, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70296419, + "num_input_tokens_seen": 240988985, + "step": 11162, + "time_per_iteration": 2.7714731693267822 + }, + { + "auxiliary_loss_clip": 0.01073074, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.03109872, + "balance_loss_mlp": 1.02593875, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 3.659117691701602, + "language_loss": 0.70279413, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72389913, + "num_input_tokens_seen": 241005455, + "step": 11163, + "time_per_iteration": 2.5951969623565674 + }, + { + "auxiliary_loss_clip": 0.01099194, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.03651416, + "balance_loss_mlp": 1.0198195, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.5740675681851808, + "language_loss": 0.75504619, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.77634382, + "num_input_tokens_seen": 241026175, + "step": 11164, + "time_per_iteration": 2.5691940784454346 + }, + { + "auxiliary_loss_clip": 0.01099335, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.03407884, + "balance_loss_mlp": 1.02007151, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 2.6988918753789077, + "language_loss": 0.65470171, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67601305, + "num_input_tokens_seen": 241044040, + "step": 11165, + "time_per_iteration": 2.5845963954925537 + }, + { + "auxiliary_loss_clip": 0.01098502, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.03509009, + "balance_loss_mlp": 1.01942515, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.1305409605239163, + "language_loss": 0.71714258, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73843288, + "num_input_tokens_seen": 241063615, + "step": 11166, + "time_per_iteration": 2.5418663024902344 + }, + { + "auxiliary_loss_clip": 0.01098714, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.03580427, + "balance_loss_mlp": 1.0177561, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 2.555873869387104, + "language_loss": 0.76782298, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.78909415, + "num_input_tokens_seen": 241082520, + "step": 11167, + "time_per_iteration": 4.097855567932129 + }, + { + "auxiliary_loss_clip": 0.01086488, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03141832, + "balance_loss_mlp": 1.02157712, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 2.3825202746293916, + "language_loss": 0.68631303, + "learning_rate": 1.029258769662629e-06, + "loss": 0.70750201, + "num_input_tokens_seen": 241103505, + "step": 11168, + "time_per_iteration": 2.7092323303222656 + }, + { + "auxiliary_loss_clip": 0.01059299, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.03090441, + "balance_loss_mlp": 1.0235703, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 3.1617323324913635, + "language_loss": 0.73530817, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75626117, + "num_input_tokens_seen": 241122885, + "step": 11169, + "time_per_iteration": 2.6365408897399902 + }, + { + "auxiliary_loss_clip": 0.01090194, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.03336763, + "balance_loss_mlp": 1.02067578, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.0374571033009174, + "language_loss": 0.76155889, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78279507, + "num_input_tokens_seen": 241140865, + "step": 11170, + "time_per_iteration": 2.509456157684326 + }, + { + "auxiliary_loss_clip": 0.01079782, + "auxiliary_loss_mlp": 0.01026966, + "balance_loss_clip": 1.03325582, + "balance_loss_mlp": 1.0154264, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 1.7983375961512371, + "language_loss": 0.74334514, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76441264, + "num_input_tokens_seen": 241158225, + "step": 11171, + "time_per_iteration": 2.5632212162017822 + }, + { + "auxiliary_loss_clip": 0.01057342, + "auxiliary_loss_mlp": 0.01046133, + "balance_loss_clip": 1.03078103, + "balance_loss_mlp": 1.03273392, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 1.6106092673507844, + "language_loss": 0.86356348, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.8845982, + "num_input_tokens_seen": 241175215, + "step": 11172, + "time_per_iteration": 2.5777156352996826 + }, + { + "auxiliary_loss_clip": 0.01083132, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.03097129, + "balance_loss_mlp": 1.02111137, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.4727748080146312, + "language_loss": 0.63381422, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65497553, + "num_input_tokens_seen": 241195250, + "step": 11173, + "time_per_iteration": 2.6756722927093506 + }, + { + "auxiliary_loss_clip": 0.0109521, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.03359914, + "balance_loss_mlp": 1.0227983, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.685466355883896, + "language_loss": 0.71576214, + "learning_rate": 1.02721637475002e-06, + "loss": 0.7370761, + "num_input_tokens_seen": 241210720, + "step": 11174, + "time_per_iteration": 2.511904716491699 + }, + { + "auxiliary_loss_clip": 0.01060309, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.03218818, + "balance_loss_mlp": 1.01587617, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 2.338124750852088, + "language_loss": 0.68859994, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70947212, + "num_input_tokens_seen": 241227395, + "step": 11175, + "time_per_iteration": 2.5555591583251953 + }, + { + "auxiliary_loss_clip": 0.01071609, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.03476787, + "balance_loss_mlp": 1.02013493, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.0240187895861754, + "language_loss": 0.73951781, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.7605437, + "num_input_tokens_seen": 241246355, + "step": 11176, + "time_per_iteration": 2.574143171310425 + }, + { + "auxiliary_loss_clip": 0.01078162, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.03285801, + "balance_loss_mlp": 1.01693177, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 2.611417791715381, + "language_loss": 0.72730887, + "learning_rate": 1.026195675108182e-06, + "loss": 0.74837899, + "num_input_tokens_seen": 241264180, + "step": 11177, + "time_per_iteration": 2.62428879737854 + }, + { + "auxiliary_loss_clip": 0.01100415, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.03464067, + "balance_loss_mlp": 1.01930499, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 1.8765929664154046, + "language_loss": 0.7611016, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78242445, + "num_input_tokens_seen": 241282245, + "step": 11178, + "time_per_iteration": 2.541943073272705 + }, + { + "auxiliary_loss_clip": 0.01093007, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.03685331, + "balance_loss_mlp": 1.02165747, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.5904119869787323, + "language_loss": 0.69627374, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.71753007, + "num_input_tokens_seen": 241300745, + "step": 11179, + "time_per_iteration": 2.5651426315307617 + }, + { + "auxiliary_loss_clip": 0.01052025, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.03189361, + "balance_loss_mlp": 1.01696885, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.8317673671181272, + "language_loss": 0.74105394, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.7618531, + "num_input_tokens_seen": 241319320, + "step": 11180, + "time_per_iteration": 2.680044174194336 + }, + { + "auxiliary_loss_clip": 0.01077718, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.03352976, + "balance_loss_mlp": 1.01933503, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.5673765580657795, + "language_loss": 0.75062108, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77170229, + "num_input_tokens_seen": 241342225, + "step": 11181, + "time_per_iteration": 2.611250638961792 + }, + { + "auxiliary_loss_clip": 0.01081263, + "auxiliary_loss_mlp": 0.01026367, + "balance_loss_clip": 1.03379154, + "balance_loss_mlp": 1.01516747, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 2.1253541148047232, + "language_loss": 0.74753189, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76860821, + "num_input_tokens_seen": 241358240, + "step": 11182, + "time_per_iteration": 4.040737152099609 + }, + { + "auxiliary_loss_clip": 0.01086641, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.03446198, + "balance_loss_mlp": 1.01991105, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 2.930459554764237, + "language_loss": 0.70255291, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.72372425, + "num_input_tokens_seen": 241378420, + "step": 11183, + "time_per_iteration": 2.5550410747528076 + }, + { + "auxiliary_loss_clip": 0.01051284, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.03256488, + "balance_loss_mlp": 1.01848865, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.6719421534120942, + "language_loss": 0.77734864, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79816401, + "num_input_tokens_seen": 241397185, + "step": 11184, + "time_per_iteration": 2.7278952598571777 + }, + { + "auxiliary_loss_clip": 0.01079174, + "auxiliary_loss_mlp": 0.00750113, + "balance_loss_clip": 1.03548098, + "balance_loss_mlp": 1.00034332, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 2.3525024637388867, + "language_loss": 0.66238791, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.68068075, + "num_input_tokens_seen": 241415785, + "step": 11185, + "time_per_iteration": 2.7391700744628906 + }, + { + "auxiliary_loss_clip": 0.0106279, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.0316515, + "balance_loss_mlp": 1.02057922, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.7076191625455162, + "language_loss": 0.80573809, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82669163, + "num_input_tokens_seen": 241437390, + "step": 11186, + "time_per_iteration": 2.752187967300415 + }, + { + "auxiliary_loss_clip": 0.01089302, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.03630841, + "balance_loss_mlp": 1.02001178, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.555609541804076, + "language_loss": 0.80185914, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82305914, + "num_input_tokens_seen": 241458085, + "step": 11187, + "time_per_iteration": 2.57950496673584 + }, + { + "auxiliary_loss_clip": 0.01058952, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.03802311, + "balance_loss_mlp": 1.01976824, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 1.8028154487079615, + "language_loss": 0.70321143, + "learning_rate": 1.022455955762965e-06, + "loss": 0.72412729, + "num_input_tokens_seen": 241476880, + "step": 11188, + "time_per_iteration": 2.779552698135376 + }, + { + "auxiliary_loss_clip": 0.01038623, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.03798902, + "balance_loss_mlp": 1.0210743, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.7230812831173234, + "language_loss": 0.75760055, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.7783106, + "num_input_tokens_seen": 241496535, + "step": 11189, + "time_per_iteration": 2.7506637573242188 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.0338335, + "balance_loss_mlp": 1.01820755, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 3.1934847537003317, + "language_loss": 0.75492072, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.77625489, + "num_input_tokens_seen": 241513465, + "step": 11190, + "time_per_iteration": 2.5062572956085205 + }, + { + "auxiliary_loss_clip": 0.01030565, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.0282917, + "balance_loss_mlp": 1.01901817, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.7343808203218034, + "language_loss": 0.7714448, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.7920624, + "num_input_tokens_seen": 241534125, + "step": 11191, + "time_per_iteration": 2.718703269958496 + }, + { + "auxiliary_loss_clip": 0.01097591, + "auxiliary_loss_mlp": 0.0102805, + "balance_loss_clip": 1.03417563, + "balance_loss_mlp": 1.01684475, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 1.7941346060665562, + "language_loss": 0.86256397, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88382035, + "num_input_tokens_seen": 241556340, + "step": 11192, + "time_per_iteration": 2.6525497436523438 + }, + { + "auxiliary_loss_clip": 0.01088879, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.03430212, + "balance_loss_mlp": 1.02312088, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 2.1869394791236125, + "language_loss": 0.7581526, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.77939105, + "num_input_tokens_seen": 241575185, + "step": 11193, + "time_per_iteration": 2.610280752182007 + }, + { + "auxiliary_loss_clip": 0.01069606, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.03773069, + "balance_loss_mlp": 1.02208483, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 2.0226669334275336, + "language_loss": 0.78737152, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80840361, + "num_input_tokens_seen": 241592970, + "step": 11194, + "time_per_iteration": 4.181936025619507 + }, + { + "auxiliary_loss_clip": 0.01089786, + "auxiliary_loss_mlp": 0.01027557, + "balance_loss_clip": 1.03355122, + "balance_loss_mlp": 1.01670361, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.070876906400182, + "language_loss": 0.89971, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.92088342, + "num_input_tokens_seen": 241610245, + "step": 11195, + "time_per_iteration": 2.6037464141845703 + }, + { + "auxiliary_loss_clip": 0.01087698, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.03230357, + "balance_loss_mlp": 1.02274942, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 1.7581198861447984, + "language_loss": 0.72379154, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74500579, + "num_input_tokens_seen": 241630350, + "step": 11196, + "time_per_iteration": 2.722384452819824 + }, + { + "auxiliary_loss_clip": 0.00970146, + "auxiliary_loss_mlp": 0.01006185, + "balance_loss_clip": 1.01350152, + "balance_loss_mlp": 1.00508833, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7829219839499597, + "language_loss": 0.5653547, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58511806, + "num_input_tokens_seen": 241692380, + "step": 11197, + "time_per_iteration": 4.662010669708252 + }, + { + "auxiliary_loss_clip": 0.01078327, + "auxiliary_loss_mlp": 0.01028776, + "balance_loss_clip": 1.03477097, + "balance_loss_mlp": 1.01796389, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.1534453343728086, + "language_loss": 0.75206423, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.7731353, + "num_input_tokens_seen": 241710430, + "step": 11198, + "time_per_iteration": 2.559540033340454 + }, + { + "auxiliary_loss_clip": 0.01087002, + "auxiliary_loss_mlp": 0.01028005, + "balance_loss_clip": 1.0311923, + "balance_loss_mlp": 1.01578569, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.115599726198507, + "language_loss": 0.81729281, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83844292, + "num_input_tokens_seen": 241724775, + "step": 11199, + "time_per_iteration": 2.567063093185425 + }, + { + "auxiliary_loss_clip": 0.01045975, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.03121495, + "balance_loss_mlp": 1.02163398, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.8482574613116274, + "language_loss": 0.71505982, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73585957, + "num_input_tokens_seen": 241744440, + "step": 11200, + "time_per_iteration": 2.842550039291382 + }, + { + "auxiliary_loss_clip": 0.0110272, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.03721619, + "balance_loss_mlp": 1.0225203, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.53294303238506, + "language_loss": 0.64375901, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66512585, + "num_input_tokens_seen": 241771705, + "step": 11201, + "time_per_iteration": 2.899704694747925 + }, + { + "auxiliary_loss_clip": 0.01082226, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.03513002, + "balance_loss_mlp": 1.01984417, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 2.0416819733180365, + "language_loss": 0.62970936, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.6508466, + "num_input_tokens_seen": 241790830, + "step": 11202, + "time_per_iteration": 2.5853829383850098 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.03433454, + "balance_loss_mlp": 1.01653063, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 2.8157176249483613, + "language_loss": 0.74659991, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76787674, + "num_input_tokens_seen": 241808165, + "step": 11203, + "time_per_iteration": 2.550976514816284 + }, + { + "auxiliary_loss_clip": 0.01085611, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.0359385, + "balance_loss_mlp": 1.01984608, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 2.083352881773669, + "language_loss": 0.67715061, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.69833648, + "num_input_tokens_seen": 241826925, + "step": 11204, + "time_per_iteration": 2.5762720108032227 + }, + { + "auxiliary_loss_clip": 0.01095264, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.01940513, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.662011839792025, + "language_loss": 0.74001276, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76128185, + "num_input_tokens_seen": 241845525, + "step": 11205, + "time_per_iteration": 2.605781316757202 + }, + { + "auxiliary_loss_clip": 0.01096677, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.03300893, + "balance_loss_mlp": 1.0217346, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.7309120217781748, + "language_loss": 0.71613061, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73742819, + "num_input_tokens_seen": 241866815, + "step": 11206, + "time_per_iteration": 2.7244691848754883 + }, + { + "auxiliary_loss_clip": 0.01065991, + "auxiliary_loss_mlp": 0.0074959, + "balance_loss_clip": 1.03637433, + "balance_loss_mlp": 1.00032938, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 3.6194498457893647, + "language_loss": 0.67155808, + "learning_rate": 1.016007014855092e-06, + "loss": 0.6897139, + "num_input_tokens_seen": 241887050, + "step": 11207, + "time_per_iteration": 2.774587392807007 + }, + { + "auxiliary_loss_clip": 0.01044958, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.03005171, + "balance_loss_mlp": 1.01894307, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 6.967789616134407, + "language_loss": 0.74123192, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.76198417, + "num_input_tokens_seen": 241904280, + "step": 11208, + "time_per_iteration": 4.295374393463135 + }, + { + "auxiliary_loss_clip": 0.01081664, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.03156304, + "balance_loss_mlp": 1.02301407, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 1.8743701181645913, + "language_loss": 0.75996917, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.78115916, + "num_input_tokens_seen": 241919190, + "step": 11209, + "time_per_iteration": 2.6243221759796143 + }, + { + "auxiliary_loss_clip": 0.01061238, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.03296006, + "balance_loss_mlp": 1.01841843, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.7975822504439978, + "language_loss": 0.66265935, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68356013, + "num_input_tokens_seen": 241940525, + "step": 11210, + "time_per_iteration": 2.6812195777893066 + }, + { + "auxiliary_loss_clip": 0.01096347, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.03321576, + "balance_loss_mlp": 1.01720405, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.149253587087633, + "language_loss": 0.79954791, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82078397, + "num_input_tokens_seen": 241959290, + "step": 11211, + "time_per_iteration": 2.5212574005126953 + }, + { + "auxiliary_loss_clip": 0.01046987, + "auxiliary_loss_mlp": 0.01027679, + "balance_loss_clip": 1.03061807, + "balance_loss_mlp": 1.01663995, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.4767549548260355, + "language_loss": 0.76600331, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78674996, + "num_input_tokens_seen": 241980715, + "step": 11212, + "time_per_iteration": 2.676133632659912 + }, + { + "auxiliary_loss_clip": 0.01058108, + "auxiliary_loss_mlp": 0.00749803, + "balance_loss_clip": 1.03141618, + "balance_loss_mlp": 1.00035119, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.642298343961648, + "language_loss": 0.77371758, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.79179668, + "num_input_tokens_seen": 241999985, + "step": 11213, + "time_per_iteration": 2.644235134124756 + }, + { + "auxiliary_loss_clip": 0.01051727, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.0368588, + "balance_loss_mlp": 1.02179003, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 1.6516398290729608, + "language_loss": 0.67715549, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69800907, + "num_input_tokens_seen": 242018990, + "step": 11214, + "time_per_iteration": 2.636409044265747 + }, + { + "auxiliary_loss_clip": 0.01100252, + "auxiliary_loss_mlp": 0.00749499, + "balance_loss_clip": 1.03448808, + "balance_loss_mlp": 1.0003494, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 2.268679137804429, + "language_loss": 0.72552854, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74402606, + "num_input_tokens_seen": 242039340, + "step": 11215, + "time_per_iteration": 2.7115817070007324 + }, + { + "auxiliary_loss_clip": 0.01087148, + "auxiliary_loss_mlp": 0.0074942, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.00031185, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 1.9549900851806672, + "language_loss": 0.66734618, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.68571186, + "num_input_tokens_seen": 242062215, + "step": 11216, + "time_per_iteration": 2.704521417617798 + }, + { + "auxiliary_loss_clip": 0.01027025, + "auxiliary_loss_mlp": 0.01000309, + "balance_loss_clip": 1.00624418, + "balance_loss_mlp": 0.99933189, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6739968650555372, + "language_loss": 0.56305838, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.5833317, + "num_input_tokens_seen": 242131130, + "step": 11217, + "time_per_iteration": 3.1841213703155518 + }, + { + "auxiliary_loss_clip": 0.0108741, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.03301406, + "balance_loss_mlp": 1.01807261, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.9435529709206718, + "language_loss": 0.74244791, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76361984, + "num_input_tokens_seen": 242149720, + "step": 11218, + "time_per_iteration": 2.543757200241089 + }, + { + "auxiliary_loss_clip": 0.01068259, + "auxiliary_loss_mlp": 0.01042782, + "balance_loss_clip": 1.03396916, + "balance_loss_mlp": 1.0293653, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.828492769802439, + "language_loss": 0.65985453, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68096495, + "num_input_tokens_seen": 242168875, + "step": 11219, + "time_per_iteration": 2.7149038314819336 + }, + { + "auxiliary_loss_clip": 0.01052446, + "auxiliary_loss_mlp": 0.01037089, + "balance_loss_clip": 1.0278461, + "balance_loss_mlp": 1.02360058, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 3.8884196380541636, + "language_loss": 0.74928141, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77017677, + "num_input_tokens_seen": 242188465, + "step": 11220, + "time_per_iteration": 2.692473888397217 + }, + { + "auxiliary_loss_clip": 0.01084724, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.0345751, + "balance_loss_mlp": 1.0220753, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.6379959086560223, + "language_loss": 0.70425463, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72544336, + "num_input_tokens_seen": 242208675, + "step": 11221, + "time_per_iteration": 2.6933045387268066 + }, + { + "auxiliary_loss_clip": 0.0106209, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.03273487, + "balance_loss_mlp": 1.01727796, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 1.877585362810081, + "language_loss": 0.58013105, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60103142, + "num_input_tokens_seen": 242227440, + "step": 11222, + "time_per_iteration": 4.196520805358887 + }, + { + "auxiliary_loss_clip": 0.01086103, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.03389347, + "balance_loss_mlp": 1.02081716, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 2.044326019815895, + "language_loss": 0.7676897, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.78887427, + "num_input_tokens_seen": 242245240, + "step": 11223, + "time_per_iteration": 2.5506722927093506 + }, + { + "auxiliary_loss_clip": 0.01093044, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.03520036, + "balance_loss_mlp": 1.02105045, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 2.507416464023761, + "language_loss": 0.75321376, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77446926, + "num_input_tokens_seen": 242263435, + "step": 11224, + "time_per_iteration": 2.5769364833831787 + }, + { + "auxiliary_loss_clip": 0.01044827, + "auxiliary_loss_mlp": 0.01026048, + "balance_loss_clip": 1.03075588, + "balance_loss_mlp": 1.01610613, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.7585603507185013, + "language_loss": 0.63197303, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65268183, + "num_input_tokens_seen": 242282765, + "step": 11225, + "time_per_iteration": 2.6805906295776367 + }, + { + "auxiliary_loss_clip": 0.01093517, + "auxiliary_loss_mlp": 0.00749168, + "balance_loss_clip": 1.03231597, + "balance_loss_mlp": 1.00031316, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.9513742193564128, + "language_loss": 0.6426959, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66112268, + "num_input_tokens_seen": 242298980, + "step": 11226, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.01088883, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.03410172, + "balance_loss_mlp": 1.01821816, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.4473408016146023, + "language_loss": 0.7170099, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73819566, + "num_input_tokens_seen": 242315420, + "step": 11227, + "time_per_iteration": 2.5944015979766846 + }, + { + "auxiliary_loss_clip": 0.0106782, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.03053498, + "balance_loss_mlp": 1.019104, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 3.8026818930600514, + "language_loss": 0.71730959, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73830622, + "num_input_tokens_seen": 242332805, + "step": 11228, + "time_per_iteration": 2.6388347148895264 + }, + { + "auxiliary_loss_clip": 0.01006318, + "auxiliary_loss_mlp": 0.01001712, + "balance_loss_clip": 1.00764346, + "balance_loss_mlp": 1.00064492, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7575914136259837, + "language_loss": 0.53249365, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55257392, + "num_input_tokens_seen": 242396160, + "step": 11229, + "time_per_iteration": 3.257504463195801 + }, + { + "auxiliary_loss_clip": 0.0108814, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.03322959, + "balance_loss_mlp": 1.01849902, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.9024899372768016, + "language_loss": 0.80383325, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82500809, + "num_input_tokens_seen": 242414660, + "step": 11230, + "time_per_iteration": 2.6364192962646484 + }, + { + "auxiliary_loss_clip": 0.01066593, + "auxiliary_loss_mlp": 0.01025296, + "balance_loss_clip": 1.03267717, + "balance_loss_mlp": 1.01468003, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.4779285380563292, + "language_loss": 0.65500855, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.6759274, + "num_input_tokens_seen": 242434225, + "step": 11231, + "time_per_iteration": 2.617692708969116 + }, + { + "auxiliary_loss_clip": 0.01062924, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.03577495, + "balance_loss_mlp": 1.02238548, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 2.2603218716919935, + "language_loss": 0.66573274, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68671715, + "num_input_tokens_seen": 242454355, + "step": 11232, + "time_per_iteration": 2.742009162902832 + }, + { + "auxiliary_loss_clip": 0.01049587, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.02775764, + "balance_loss_mlp": 1.01455212, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 2.6573868169002655, + "language_loss": 0.72654444, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74729991, + "num_input_tokens_seen": 242474935, + "step": 11233, + "time_per_iteration": 4.150411605834961 + }, + { + "auxiliary_loss_clip": 0.01089643, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.03347802, + "balance_loss_mlp": 1.02032566, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.5657136333762027, + "language_loss": 0.76992226, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79113901, + "num_input_tokens_seen": 242495530, + "step": 11234, + "time_per_iteration": 2.765803337097168 + }, + { + "auxiliary_loss_clip": 0.01097791, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.03389788, + "balance_loss_mlp": 1.01997972, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5215305516542164, + "language_loss": 0.7521252, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77341676, + "num_input_tokens_seen": 242514550, + "step": 11235, + "time_per_iteration": 2.6879961490631104 + }, + { + "auxiliary_loss_clip": 0.01017079, + "auxiliary_loss_mlp": 0.00998359, + "balance_loss_clip": 1.00579381, + "balance_loss_mlp": 0.9972502, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7802844378270747, + "language_loss": 0.51384258, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.533997, + "num_input_tokens_seen": 242569200, + "step": 11236, + "time_per_iteration": 3.0915029048919678 + }, + { + "auxiliary_loss_clip": 0.01064223, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.03179598, + "balance_loss_mlp": 1.01540446, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 1.9503247456185842, + "language_loss": 0.75701952, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77794892, + "num_input_tokens_seen": 242586950, + "step": 11237, + "time_per_iteration": 4.242210626602173 + }, + { + "auxiliary_loss_clip": 0.01071822, + "auxiliary_loss_mlp": 0.01035287, + "balance_loss_clip": 1.03490341, + "balance_loss_mlp": 1.02403378, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 1.8393198790358933, + "language_loss": 0.77276248, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79383361, + "num_input_tokens_seen": 242607380, + "step": 11238, + "time_per_iteration": 2.7198612689971924 + }, + { + "auxiliary_loss_clip": 0.01084823, + "auxiliary_loss_mlp": 0.01031438, + "balance_loss_clip": 1.03239131, + "balance_loss_mlp": 1.01930225, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 2.3686147756458977, + "language_loss": 0.66397655, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68513918, + "num_input_tokens_seen": 242628025, + "step": 11239, + "time_per_iteration": 2.6690425872802734 + }, + { + "auxiliary_loss_clip": 0.01075067, + "auxiliary_loss_mlp": 0.01023612, + "balance_loss_clip": 1.03455007, + "balance_loss_mlp": 1.01239443, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 2.409403588228385, + "language_loss": 0.83183849, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85282528, + "num_input_tokens_seen": 242643825, + "step": 11240, + "time_per_iteration": 2.548247814178467 + }, + { + "auxiliary_loss_clip": 0.010743, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.0354073, + "balance_loss_mlp": 1.01780558, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 2.545172294080561, + "language_loss": 0.74538136, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76643693, + "num_input_tokens_seen": 242661820, + "step": 11241, + "time_per_iteration": 2.754786968231201 + }, + { + "auxiliary_loss_clip": 0.01052866, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.03609204, + "balance_loss_mlp": 1.02297974, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 4.105595418636715, + "language_loss": 0.80245495, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82332325, + "num_input_tokens_seen": 242679890, + "step": 11242, + "time_per_iteration": 2.6792259216308594 + }, + { + "auxiliary_loss_clip": 0.01080025, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.03126144, + "balance_loss_mlp": 1.02438521, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.7528512868673043, + "language_loss": 0.72522902, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74640059, + "num_input_tokens_seen": 242699495, + "step": 11243, + "time_per_iteration": 2.6771395206451416 + }, + { + "auxiliary_loss_clip": 0.01087896, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.03384948, + "balance_loss_mlp": 1.02222705, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.7111195357447224, + "language_loss": 0.72553217, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74674046, + "num_input_tokens_seen": 242719500, + "step": 11244, + "time_per_iteration": 2.6089987754821777 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.01041897, + "balance_loss_clip": 1.03505301, + "balance_loss_mlp": 1.03015494, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 4.38060790552996, + "language_loss": 0.85457206, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87600327, + "num_input_tokens_seen": 242738325, + "step": 11245, + "time_per_iteration": 2.568613052368164 + }, + { + "auxiliary_loss_clip": 0.01103572, + "auxiliary_loss_mlp": 0.00749636, + "balance_loss_clip": 1.0345329, + "balance_loss_mlp": 1.00039041, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 1.8157314041526453, + "language_loss": 0.73799253, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75652456, + "num_input_tokens_seen": 242756620, + "step": 11246, + "time_per_iteration": 2.6740036010742188 + }, + { + "auxiliary_loss_clip": 0.01084973, + "auxiliary_loss_mlp": 0.01028055, + "balance_loss_clip": 1.03254318, + "balance_loss_mlp": 1.01660502, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.9332516714631733, + "language_loss": 0.87626445, + "learning_rate": 1.002474432661539e-06, + "loss": 0.89739472, + "num_input_tokens_seen": 242774505, + "step": 11247, + "time_per_iteration": 2.676161050796509 + }, + { + "auxiliary_loss_clip": 0.01006716, + "auxiliary_loss_mlp": 0.00999859, + "balance_loss_clip": 1.00840831, + "balance_loss_mlp": 0.99864942, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8203983826576259, + "language_loss": 0.53989673, + "learning_rate": 1.002136890130115e-06, + "loss": 0.55996245, + "num_input_tokens_seen": 242828645, + "step": 11248, + "time_per_iteration": 4.7567198276519775 + }, + { + "auxiliary_loss_clip": 0.01040435, + "auxiliary_loss_mlp": 0.01024575, + "balance_loss_clip": 1.03661942, + "balance_loss_mlp": 1.01394153, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.6849711781359797, + "language_loss": 0.73626792, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75691801, + "num_input_tokens_seen": 242850100, + "step": 11249, + "time_per_iteration": 2.8393514156341553 + }, + { + "auxiliary_loss_clip": 0.01088373, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.03236902, + "balance_loss_mlp": 1.02423942, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.395477066987487, + "language_loss": 0.74145865, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.76270652, + "num_input_tokens_seen": 242867775, + "step": 11250, + "time_per_iteration": 2.741194248199463 + }, + { + "auxiliary_loss_clip": 0.01100583, + "auxiliary_loss_mlp": 0.01027239, + "balance_loss_clip": 1.03468001, + "balance_loss_mlp": 1.01623583, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 16.94248447356404, + "language_loss": 0.75022388, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77150214, + "num_input_tokens_seen": 242886865, + "step": 11251, + "time_per_iteration": 2.7198846340179443 + }, + { + "auxiliary_loss_clip": 0.01062901, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.03451467, + "balance_loss_mlp": 1.01702738, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 2.1429302568875284, + "language_loss": 0.69878352, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.71969604, + "num_input_tokens_seen": 242906705, + "step": 11252, + "time_per_iteration": 2.7430107593536377 + }, + { + "auxiliary_loss_clip": 0.01050884, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.03323412, + "balance_loss_mlp": 1.02049077, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.7414804508358603, + "language_loss": 0.66549468, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.68631554, + "num_input_tokens_seen": 242925215, + "step": 11253, + "time_per_iteration": 2.7897756099700928 + }, + { + "auxiliary_loss_clip": 0.01052494, + "auxiliary_loss_mlp": 0.00749926, + "balance_loss_clip": 1.02962744, + "balance_loss_mlp": 1.00036383, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.7791821210096568, + "language_loss": 0.77199388, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79001808, + "num_input_tokens_seen": 242944750, + "step": 11254, + "time_per_iteration": 2.7363991737365723 + }, + { + "auxiliary_loss_clip": 0.01087999, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.0328691, + "balance_loss_mlp": 1.01859641, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.108143910150969, + "language_loss": 0.71991158, + "learning_rate": 9.997751526206835e-07, + "loss": 0.7410925, + "num_input_tokens_seen": 242963860, + "step": 11255, + "time_per_iteration": 2.661187171936035 + }, + { + "auxiliary_loss_clip": 0.01047429, + "auxiliary_loss_mlp": 0.00749395, + "balance_loss_clip": 1.02937019, + "balance_loss_mlp": 1.00029063, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.303006247192934, + "language_loss": 0.74691343, + "learning_rate": 9.994379131600828e-07, + "loss": 0.76488161, + "num_input_tokens_seen": 242983050, + "step": 11256, + "time_per_iteration": 2.798642158508301 + }, + { + "auxiliary_loss_clip": 0.01089795, + "auxiliary_loss_mlp": 0.01033847, + "balance_loss_clip": 1.03493071, + "balance_loss_mlp": 1.02229548, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.1531157917913095, + "language_loss": 0.66105855, + "learning_rate": 9.991007116408965e-07, + "loss": 0.68229496, + "num_input_tokens_seen": 243001125, + "step": 11257, + "time_per_iteration": 2.6616625785827637 + }, + { + "auxiliary_loss_clip": 0.01053505, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.03598571, + "balance_loss_mlp": 1.0199095, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.4384681990467165, + "language_loss": 0.75706339, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77790618, + "num_input_tokens_seen": 243021865, + "step": 11258, + "time_per_iteration": 2.7835519313812256 + }, + { + "auxiliary_loss_clip": 0.01074404, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.03292274, + "balance_loss_mlp": 1.02206731, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.7151003028511689, + "language_loss": 0.6674149, + "learning_rate": 9.984264224779127e-07, + "loss": 0.68848467, + "num_input_tokens_seen": 243042970, + "step": 11259, + "time_per_iteration": 2.8091135025024414 + }, + { + "auxiliary_loss_clip": 0.01075486, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.03152847, + "balance_loss_mlp": 1.0233084, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.204387994254336, + "language_loss": 0.85249287, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87359488, + "num_input_tokens_seen": 243058470, + "step": 11260, + "time_per_iteration": 2.5877673625946045 + }, + { + "auxiliary_loss_clip": 0.01070473, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.03100729, + "balance_loss_mlp": 1.02483869, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.5649235693602557, + "language_loss": 0.7691887, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79026818, + "num_input_tokens_seen": 243076630, + "step": 11261, + "time_per_iteration": 2.6390578746795654 + }, + { + "auxiliary_loss_clip": 0.01077477, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.03176308, + "balance_loss_mlp": 1.02404785, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.8258021985145396, + "language_loss": 0.87836528, + "learning_rate": 9.97415273613666e-07, + "loss": 0.89949894, + "num_input_tokens_seen": 243092260, + "step": 11262, + "time_per_iteration": 2.719742774963379 + }, + { + "auxiliary_loss_clip": 0.01078976, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.03536081, + "balance_loss_mlp": 1.02226412, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 1.916347362143391, + "language_loss": 0.74377745, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76490378, + "num_input_tokens_seen": 243109405, + "step": 11263, + "time_per_iteration": 4.119860649108887 + }, + { + "auxiliary_loss_clip": 0.01092918, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.0342809, + "balance_loss_mlp": 1.02147615, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 2.6453642682135965, + "language_loss": 0.67677796, + "learning_rate": 9.967413644401016e-07, + "loss": 0.69804299, + "num_input_tokens_seen": 243128135, + "step": 11264, + "time_per_iteration": 2.555194139480591 + }, + { + "auxiliary_loss_clip": 0.01079521, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.0369041, + "balance_loss_mlp": 1.01951241, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 2.020100320770607, + "language_loss": 0.73122954, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75233722, + "num_input_tokens_seen": 243146785, + "step": 11265, + "time_per_iteration": 2.6598262786865234 + }, + { + "auxiliary_loss_clip": 0.01054475, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.02947068, + "balance_loss_mlp": 1.02398872, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.9817132591747526, + "language_loss": 0.61528581, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63619226, + "num_input_tokens_seen": 243165275, + "step": 11266, + "time_per_iteration": 2.6303203105926514 + }, + { + "auxiliary_loss_clip": 0.01072117, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.03565979, + "balance_loss_mlp": 1.02008021, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 2.044280159584842, + "language_loss": 0.70613432, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72717154, + "num_input_tokens_seen": 243182845, + "step": 11267, + "time_per_iteration": 2.6149508953094482 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.03467917, + "balance_loss_mlp": 1.01833022, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 2.8148609608657016, + "language_loss": 0.70894408, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73025167, + "num_input_tokens_seen": 243201475, + "step": 11268, + "time_per_iteration": 2.5474319458007812 + }, + { + "auxiliary_loss_clip": 0.0107385, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.03353298, + "balance_loss_mlp": 1.01875997, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.8957314265512946, + "language_loss": 0.76922226, + "learning_rate": 9.950572574939194e-07, + "loss": 0.79026961, + "num_input_tokens_seen": 243221850, + "step": 11269, + "time_per_iteration": 2.682333469390869 + }, + { + "auxiliary_loss_clip": 0.0106636, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.03299177, + "balance_loss_mlp": 1.02371526, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 1.9783116065499655, + "language_loss": 0.74158132, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76260173, + "num_input_tokens_seen": 243239855, + "step": 11270, + "time_per_iteration": 2.6801915168762207 + }, + { + "auxiliary_loss_clip": 0.01045251, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.03505075, + "balance_loss_mlp": 1.02420259, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.690964815723884, + "language_loss": 0.72993731, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75075209, + "num_input_tokens_seen": 243260085, + "step": 11271, + "time_per_iteration": 2.7576324939727783 + }, + { + "auxiliary_loss_clip": 0.01099552, + "auxiliary_loss_mlp": 0.01033789, + "balance_loss_clip": 1.03465962, + "balance_loss_mlp": 1.02241671, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.7097029875769745, + "language_loss": 0.67944753, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70078093, + "num_input_tokens_seen": 243280065, + "step": 11272, + "time_per_iteration": 2.604320764541626 + }, + { + "auxiliary_loss_clip": 0.01091654, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.03594613, + "balance_loss_mlp": 1.02025843, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 1.8981399114417632, + "language_loss": 0.73875082, + "learning_rate": 9.937106577958481e-07, + "loss": 0.76000202, + "num_input_tokens_seen": 243297775, + "step": 11273, + "time_per_iteration": 2.660151481628418 + }, + { + "auxiliary_loss_clip": 0.01081092, + "auxiliary_loss_mlp": 0.01036607, + "balance_loss_clip": 1.03482711, + "balance_loss_mlp": 1.02527606, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 1.8497650885289179, + "language_loss": 0.70230269, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72347963, + "num_input_tokens_seen": 243315760, + "step": 11274, + "time_per_iteration": 4.2513108253479 + }, + { + "auxiliary_loss_clip": 0.01100585, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.03392434, + "balance_loss_mlp": 1.01919568, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.6222216160123344, + "language_loss": 0.66008413, + "learning_rate": 9.930375868473093e-07, + "loss": 0.68139976, + "num_input_tokens_seen": 243335715, + "step": 11275, + "time_per_iteration": 2.6746621131896973 + }, + { + "auxiliary_loss_clip": 0.01092666, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.03650832, + "balance_loss_mlp": 1.02192545, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 2.3692469505758296, + "language_loss": 0.72667187, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74792671, + "num_input_tokens_seen": 243356935, + "step": 11276, + "time_per_iteration": 2.6608824729919434 + }, + { + "auxiliary_loss_clip": 0.01070692, + "auxiliary_loss_mlp": 0.0074959, + "balance_loss_clip": 1.031564, + "balance_loss_mlp": 1.00027943, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.8161823046613217, + "language_loss": 0.76830924, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78651208, + "num_input_tokens_seen": 243375625, + "step": 11277, + "time_per_iteration": 4.254513263702393 + }, + { + "auxiliary_loss_clip": 0.01077235, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.03234422, + "balance_loss_mlp": 1.01732659, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 2.7488315064362454, + "language_loss": 0.83659482, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85765821, + "num_input_tokens_seen": 243390195, + "step": 11278, + "time_per_iteration": 2.6559298038482666 + }, + { + "auxiliary_loss_clip": 0.01072827, + "auxiliary_loss_mlp": 0.00749196, + "balance_loss_clip": 1.03612328, + "balance_loss_mlp": 1.00033307, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.5700837201138045, + "language_loss": 0.70106691, + "learning_rate": 9.916919032616844e-07, + "loss": 0.71928716, + "num_input_tokens_seen": 243411690, + "step": 11279, + "time_per_iteration": 2.831817865371704 + }, + { + "auxiliary_loss_clip": 0.01091653, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.03483772, + "balance_loss_mlp": 1.01935291, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 1.8995600088783544, + "language_loss": 0.73921978, + "learning_rate": 9.913555779212485e-07, + "loss": 0.7604543, + "num_input_tokens_seen": 243430280, + "step": 11280, + "time_per_iteration": 2.658313035964966 + }, + { + "auxiliary_loss_clip": 0.01088727, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.03294873, + "balance_loss_mlp": 1.01724303, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.7459042826275801, + "language_loss": 0.69941735, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72059929, + "num_input_tokens_seen": 243448690, + "step": 11281, + "time_per_iteration": 2.631115674972534 + }, + { + "auxiliary_loss_clip": 0.01097479, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.0348264, + "balance_loss_mlp": 1.01730537, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.4896797820300498, + "language_loss": 0.63622123, + "learning_rate": 9.906830419968217e-07, + "loss": 0.65747762, + "num_input_tokens_seen": 243470695, + "step": 11282, + "time_per_iteration": 2.633754014968872 + }, + { + "auxiliary_loss_clip": 0.01056149, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.02992344, + "balance_loss_mlp": 1.02629161, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.781112174633917, + "language_loss": 0.74834991, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76932615, + "num_input_tokens_seen": 243493345, + "step": 11283, + "time_per_iteration": 2.789982795715332 + }, + { + "auxiliary_loss_clip": 0.01089642, + "auxiliary_loss_mlp": 0.01026629, + "balance_loss_clip": 1.03407216, + "balance_loss_mlp": 1.01516676, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.845471348977413, + "language_loss": 0.57212776, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59329045, + "num_input_tokens_seen": 243515670, + "step": 11284, + "time_per_iteration": 2.785245418548584 + }, + { + "auxiliary_loss_clip": 0.01077309, + "auxiliary_loss_mlp": 0.01027142, + "balance_loss_clip": 1.03321302, + "balance_loss_mlp": 1.01607966, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 2.1364088344092815, + "language_loss": 0.74850678, + "learning_rate": 9.896745251925535e-07, + "loss": 0.76955128, + "num_input_tokens_seen": 243533625, + "step": 11285, + "time_per_iteration": 2.6737220287323 + }, + { + "auxiliary_loss_clip": 0.01100461, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.03695083, + "balance_loss_mlp": 1.02013612, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 5.600165498898763, + "language_loss": 0.66520524, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68652427, + "num_input_tokens_seen": 243553040, + "step": 11286, + "time_per_iteration": 2.6466968059539795 + }, + { + "auxiliary_loss_clip": 0.01077475, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.03223634, + "balance_loss_mlp": 1.01679707, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 7.914779942921473, + "language_loss": 0.53035641, + "learning_rate": 9.890023721933447e-07, + "loss": 0.5514183, + "num_input_tokens_seen": 243572590, + "step": 11287, + "time_per_iteration": 2.7514004707336426 + }, + { + "auxiliary_loss_clip": 0.01049626, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.03039026, + "balance_loss_mlp": 1.02057397, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.6339232955604803, + "language_loss": 0.77226436, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79308784, + "num_input_tokens_seen": 243594140, + "step": 11288, + "time_per_iteration": 4.263271808624268 + }, + { + "auxiliary_loss_clip": 0.01093685, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.03731537, + "balance_loss_mlp": 1.02322197, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 1.8314943214024604, + "language_loss": 0.73219454, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75347459, + "num_input_tokens_seen": 243615170, + "step": 11289, + "time_per_iteration": 2.659637212753296 + }, + { + "auxiliary_loss_clip": 0.01101111, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.03569996, + "balance_loss_mlp": 1.02028322, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.5278114571421546, + "language_loss": 0.79985654, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82118642, + "num_input_tokens_seen": 243635675, + "step": 11290, + "time_per_iteration": 2.573063373565674 + }, + { + "auxiliary_loss_clip": 0.01088317, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.03568578, + "balance_loss_mlp": 1.02135932, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 1.4824473220644743, + "language_loss": 0.75070333, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77191323, + "num_input_tokens_seen": 243654950, + "step": 11291, + "time_per_iteration": 2.6829771995544434 + }, + { + "auxiliary_loss_clip": 0.01082752, + "auxiliary_loss_mlp": 0.00749488, + "balance_loss_clip": 1.0355531, + "balance_loss_mlp": 1.00032651, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.929373896624289, + "language_loss": 0.75483048, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77315283, + "num_input_tokens_seen": 243674970, + "step": 11292, + "time_per_iteration": 2.6684184074401855 + }, + { + "auxiliary_loss_clip": 0.010599, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.03315592, + "balance_loss_mlp": 1.02062893, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 1.9627443145865917, + "language_loss": 0.84364855, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86457539, + "num_input_tokens_seen": 243693440, + "step": 11293, + "time_per_iteration": 2.6695215702056885 + }, + { + "auxiliary_loss_clip": 0.011066, + "auxiliary_loss_mlp": 0.01038147, + "balance_loss_clip": 1.03771496, + "balance_loss_mlp": 1.02533817, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.098369972558912, + "language_loss": 0.79399633, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81544375, + "num_input_tokens_seen": 243710055, + "step": 11294, + "time_per_iteration": 2.5216782093048096 + }, + { + "auxiliary_loss_clip": 0.01076187, + "auxiliary_loss_mlp": 0.01024266, + "balance_loss_clip": 1.03277445, + "balance_loss_mlp": 1.01323903, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.5833198361536838, + "language_loss": 0.79060698, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81161153, + "num_input_tokens_seen": 243728635, + "step": 11295, + "time_per_iteration": 2.6576404571533203 + }, + { + "auxiliary_loss_clip": 0.01070972, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.03342152, + "balance_loss_mlp": 1.02042699, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 2.3989465392629103, + "language_loss": 0.71817935, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73919308, + "num_input_tokens_seen": 243748330, + "step": 11296, + "time_per_iteration": 2.6107285022735596 + }, + { + "auxiliary_loss_clip": 0.01088992, + "auxiliary_loss_mlp": 0.01026137, + "balance_loss_clip": 1.03450966, + "balance_loss_mlp": 1.01505017, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.5151558054891547, + "language_loss": 0.70462143, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72577268, + "num_input_tokens_seen": 243769380, + "step": 11297, + "time_per_iteration": 2.621621608734131 + }, + { + "auxiliary_loss_clip": 0.01072324, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.0337851, + "balance_loss_mlp": 1.02171409, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.2008643557098577, + "language_loss": 0.66078603, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68185329, + "num_input_tokens_seen": 243785510, + "step": 11298, + "time_per_iteration": 2.75966739654541 + }, + { + "auxiliary_loss_clip": 0.0109133, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.03479767, + "balance_loss_mlp": 1.01751423, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.771746857271351, + "language_loss": 0.71827865, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73947001, + "num_input_tokens_seen": 243805545, + "step": 11299, + "time_per_iteration": 2.618307590484619 + }, + { + "auxiliary_loss_clip": 0.01103388, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.03703141, + "balance_loss_mlp": 1.01928473, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.4188242250957372, + "language_loss": 0.77217317, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79351914, + "num_input_tokens_seen": 243825185, + "step": 11300, + "time_per_iteration": 2.593956232070923 + }, + { + "auxiliary_loss_clip": 0.01087879, + "auxiliary_loss_mlp": 0.01029448, + "balance_loss_clip": 1.03331733, + "balance_loss_mlp": 1.01786041, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.9850529621610793, + "language_loss": 0.63282156, + "learning_rate": 9.843016006639577e-07, + "loss": 0.6539948, + "num_input_tokens_seen": 243841600, + "step": 11301, + "time_per_iteration": 2.535949468612671 + }, + { + "auxiliary_loss_clip": 0.01088924, + "auxiliary_loss_mlp": 0.0102772, + "balance_loss_clip": 1.03433824, + "balance_loss_mlp": 1.016747, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.8094828104999372, + "language_loss": 0.83067858, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85184503, + "num_input_tokens_seen": 243862250, + "step": 11302, + "time_per_iteration": 2.609126329421997 + }, + { + "auxiliary_loss_clip": 0.01090768, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.03356051, + "balance_loss_mlp": 1.01857305, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 1.8902471605961308, + "language_loss": 0.69560486, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71681708, + "num_input_tokens_seen": 243880560, + "step": 11303, + "time_per_iteration": 2.6173903942108154 + }, + { + "auxiliary_loss_clip": 0.01070599, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.03379142, + "balance_loss_mlp": 1.02094388, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 2.5391208198528337, + "language_loss": 0.70303595, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72407246, + "num_input_tokens_seen": 243900635, + "step": 11304, + "time_per_iteration": 4.2086591720581055 + }, + { + "auxiliary_loss_clip": 0.01089114, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.0354141, + "balance_loss_mlp": 1.02039337, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.141838535209461, + "language_loss": 0.72404617, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74526477, + "num_input_tokens_seen": 243920160, + "step": 11305, + "time_per_iteration": 2.610905408859253 + }, + { + "auxiliary_loss_clip": 0.01077826, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.03317118, + "balance_loss_mlp": 1.01685822, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 5.0032422559424345, + "language_loss": 0.65469074, + "learning_rate": 9.826245813561882e-07, + "loss": 0.67575455, + "num_input_tokens_seen": 243939015, + "step": 11306, + "time_per_iteration": 2.7048749923706055 + }, + { + "auxiliary_loss_clip": 0.01076424, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.0337038, + "balance_loss_mlp": 1.01629913, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.640424133351911, + "language_loss": 0.80287528, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82392013, + "num_input_tokens_seen": 243958470, + "step": 11307, + "time_per_iteration": 2.6125741004943848 + }, + { + "auxiliary_loss_clip": 0.01071905, + "auxiliary_loss_mlp": 0.01035598, + "balance_loss_clip": 1.03286028, + "balance_loss_mlp": 1.02285504, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.979896059667067, + "language_loss": 0.88874936, + "learning_rate": 9.819540435969066e-07, + "loss": 0.90982437, + "num_input_tokens_seen": 243975450, + "step": 11308, + "time_per_iteration": 2.534458637237549 + }, + { + "auxiliary_loss_clip": 0.0106027, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_clip": 1.0296855, + "balance_loss_mlp": 1.02841496, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 1.8642978844786091, + "language_loss": 0.71072137, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73174703, + "num_input_tokens_seen": 243994355, + "step": 11309, + "time_per_iteration": 2.6952970027923584 + }, + { + "auxiliary_loss_clip": 0.01072991, + "auxiliary_loss_mlp": 0.01035241, + "balance_loss_clip": 1.03710651, + "balance_loss_mlp": 1.02398205, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 1.8503980448264141, + "language_loss": 0.84449005, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86557245, + "num_input_tokens_seen": 244011620, + "step": 11310, + "time_per_iteration": 2.6561009883880615 + }, + { + "auxiliary_loss_clip": 0.01070071, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.0344646, + "balance_loss_mlp": 1.01743007, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.6009683963922137, + "language_loss": 0.8271808, + "learning_rate": 9.80948526522792e-07, + "loss": 0.84816515, + "num_input_tokens_seen": 244029925, + "step": 11311, + "time_per_iteration": 2.5871779918670654 + }, + { + "auxiliary_loss_clip": 0.01046916, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.02915168, + "balance_loss_mlp": 1.01767349, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 1.6398705505401285, + "language_loss": 0.76359928, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78438079, + "num_input_tokens_seen": 244051225, + "step": 11312, + "time_per_iteration": 2.7166755199432373 + }, + { + "auxiliary_loss_clip": 0.01025807, + "auxiliary_loss_mlp": 0.01006023, + "balance_loss_clip": 1.00498605, + "balance_loss_mlp": 1.00498629, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.656732869478002, + "language_loss": 0.57262099, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59293932, + "num_input_tokens_seen": 244115930, + "step": 11313, + "time_per_iteration": 3.2223000526428223 + }, + { + "auxiliary_loss_clip": 0.01088241, + "auxiliary_loss_mlp": 0.01028079, + "balance_loss_clip": 1.03259587, + "balance_loss_mlp": 1.01667047, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 2.310533764215365, + "language_loss": 0.68767071, + "learning_rate": 9.799433572314754e-07, + "loss": 0.70883387, + "num_input_tokens_seen": 244137320, + "step": 11314, + "time_per_iteration": 4.162773609161377 + }, + { + "auxiliary_loss_clip": 0.01084703, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.03142679, + "balance_loss_mlp": 1.01800537, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.8200667122382845, + "language_loss": 0.81520295, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83633584, + "num_input_tokens_seen": 244152755, + "step": 11315, + "time_per_iteration": 2.603926420211792 + }, + { + "auxiliary_loss_clip": 0.01060258, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.03456306, + "balance_loss_mlp": 1.01462555, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 2.173868555094718, + "language_loss": 0.69882822, + "learning_rate": 9.792734377526718e-07, + "loss": 0.71969414, + "num_input_tokens_seen": 244171480, + "step": 11316, + "time_per_iteration": 2.6907739639282227 + }, + { + "auxiliary_loss_clip": 0.01089522, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.03557241, + "balance_loss_mlp": 1.01869535, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.097108352095546, + "language_loss": 0.66772699, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68891662, + "num_input_tokens_seen": 244187920, + "step": 11317, + "time_per_iteration": 2.6184322834014893 + }, + { + "auxiliary_loss_clip": 0.01093194, + "auxiliary_loss_mlp": 0.01039032, + "balance_loss_clip": 1.03878546, + "balance_loss_mlp": 1.0282495, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 5.715836190518366, + "language_loss": 0.74909312, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77041531, + "num_input_tokens_seen": 244209565, + "step": 11318, + "time_per_iteration": 2.5963633060455322 + }, + { + "auxiliary_loss_clip": 0.01066121, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.03035533, + "balance_loss_mlp": 1.01748657, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 1.6901460592782307, + "language_loss": 0.67900151, + "learning_rate": 9.782688488616143e-07, + "loss": 0.69994861, + "num_input_tokens_seen": 244228015, + "step": 11319, + "time_per_iteration": 4.881742000579834 + }, + { + "auxiliary_loss_clip": 0.01062061, + "auxiliary_loss_mlp": 0.00749499, + "balance_loss_clip": 1.03700244, + "balance_loss_mlp": 1.00031722, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.7941021615069477, + "language_loss": 0.7661947, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78431022, + "num_input_tokens_seen": 244245615, + "step": 11320, + "time_per_iteration": 2.8116204738616943 + }, + { + "auxiliary_loss_clip": 0.01074096, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.03529, + "balance_loss_mlp": 1.01714051, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 2.0798919726745964, + "language_loss": 0.74869013, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76971829, + "num_input_tokens_seen": 244263625, + "step": 11321, + "time_per_iteration": 2.684713125228882 + }, + { + "auxiliary_loss_clip": 0.01081196, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.03606474, + "balance_loss_mlp": 1.02220714, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.8198428698039752, + "language_loss": 0.72628474, + "learning_rate": 9.772646086678758e-07, + "loss": 0.74743426, + "num_input_tokens_seen": 244282745, + "step": 11322, + "time_per_iteration": 2.5936310291290283 + }, + { + "auxiliary_loss_clip": 0.01048541, + "auxiliary_loss_mlp": 0.00749527, + "balance_loss_clip": 1.03253245, + "balance_loss_mlp": 1.00030982, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 2.0193439587745727, + "language_loss": 0.78614348, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80412424, + "num_input_tokens_seen": 244303770, + "step": 11323, + "time_per_iteration": 2.670485734939575 + }, + { + "auxiliary_loss_clip": 0.00998363, + "auxiliary_loss_mlp": 0.01000255, + "balance_loss_clip": 1.00996017, + "balance_loss_mlp": 0.99928331, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.754601169114768, + "language_loss": 0.57135904, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59134525, + "num_input_tokens_seen": 244355910, + "step": 11324, + "time_per_iteration": 3.013801097869873 + }, + { + "auxiliary_loss_clip": 0.01081885, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.03508639, + "balance_loss_mlp": 1.02103913, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 6.058818709073751, + "language_loss": 0.68175036, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70290887, + "num_input_tokens_seen": 244376610, + "step": 11325, + "time_per_iteration": 2.626490831375122 + }, + { + "auxiliary_loss_clip": 0.01089771, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.03292882, + "balance_loss_mlp": 1.02059186, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.527011434534586, + "language_loss": 0.69933522, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72056234, + "num_input_tokens_seen": 244393000, + "step": 11326, + "time_per_iteration": 2.6230106353759766 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.03408527, + "balance_loss_mlp": 1.01795936, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.92039853055378, + "language_loss": 0.73229539, + "learning_rate": 9.75591650825392e-07, + "loss": 0.75360519, + "num_input_tokens_seen": 244409515, + "step": 11327, + "time_per_iteration": 2.562896251678467 + }, + { + "auxiliary_loss_clip": 0.01086239, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.03330791, + "balance_loss_mlp": 1.01834249, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 1.7731338333640205, + "language_loss": 0.77371663, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79487753, + "num_input_tokens_seen": 244427165, + "step": 11328, + "time_per_iteration": 4.20743465423584 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.03490877, + "balance_loss_mlp": 1.01739478, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 1.7107156424637238, + "language_loss": 0.64306855, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66437399, + "num_input_tokens_seen": 244445705, + "step": 11329, + "time_per_iteration": 2.5793380737304688 + }, + { + "auxiliary_loss_clip": 0.01048286, + "auxiliary_loss_mlp": 0.00749459, + "balance_loss_clip": 1.03617597, + "balance_loss_mlp": 1.00034809, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 2.0697753902536173, + "language_loss": 0.7912299, + "learning_rate": 9.745883421664096e-07, + "loss": 0.80920732, + "num_input_tokens_seen": 244460415, + "step": 11330, + "time_per_iteration": 2.7555198669433594 + }, + { + "auxiliary_loss_clip": 0.01091031, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.03532302, + "balance_loss_mlp": 1.01650894, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 2.2909345183421044, + "language_loss": 0.63987458, + "learning_rate": 9.742539836972665e-07, + "loss": 0.66107082, + "num_input_tokens_seen": 244480555, + "step": 11331, + "time_per_iteration": 2.6357734203338623 + }, + { + "auxiliary_loss_clip": 0.0105261, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.03425026, + "balance_loss_mlp": 1.02053523, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.5295077756944389, + "language_loss": 0.72444719, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74531221, + "num_input_tokens_seen": 244498540, + "step": 11332, + "time_per_iteration": 2.695840835571289 + }, + { + "auxiliary_loss_clip": 0.01091026, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.03593862, + "balance_loss_mlp": 1.01902652, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.0892142736155246, + "language_loss": 0.75147486, + "learning_rate": 9.735853834608326e-07, + "loss": 0.7726984, + "num_input_tokens_seen": 244517015, + "step": 11333, + "time_per_iteration": 2.5402746200561523 + }, + { + "auxiliary_loss_clip": 0.01092444, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.0351609, + "balance_loss_mlp": 1.0197922, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.4851463537915557, + "language_loss": 0.72110605, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74235237, + "num_input_tokens_seen": 244537450, + "step": 11334, + "time_per_iteration": 2.656991958618164 + }, + { + "auxiliary_loss_clip": 0.01084498, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.0351671, + "balance_loss_mlp": 1.01907909, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.6770934905050137, + "language_loss": 0.85570794, + "learning_rate": 9.729169389113791e-07, + "loss": 0.8768549, + "num_input_tokens_seen": 244555640, + "step": 11335, + "time_per_iteration": 2.5677266120910645 + }, + { + "auxiliary_loss_clip": 0.01080522, + "auxiliary_loss_mlp": 0.01025387, + "balance_loss_clip": 1.03151321, + "balance_loss_mlp": 1.01461053, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.6926502710253941, + "language_loss": 0.82002461, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84108365, + "num_input_tokens_seen": 244574005, + "step": 11336, + "time_per_iteration": 2.5521607398986816 + }, + { + "auxiliary_loss_clip": 0.01054749, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.03027225, + "balance_loss_mlp": 1.01887608, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.9643041365833145, + "language_loss": 0.81698847, + "learning_rate": 9.72248650150294e-07, + "loss": 0.8378371, + "num_input_tokens_seen": 244591395, + "step": 11337, + "time_per_iteration": 2.6336252689361572 + }, + { + "auxiliary_loss_clip": 0.01050864, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.03246915, + "balance_loss_mlp": 1.01753652, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 2.017442090167673, + "language_loss": 0.72486931, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74566466, + "num_input_tokens_seen": 244610400, + "step": 11338, + "time_per_iteration": 2.7191035747528076 + }, + { + "auxiliary_loss_clip": 0.0105298, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.03045726, + "balance_loss_mlp": 1.02410197, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 2.6761285328844324, + "language_loss": 0.77558792, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79647899, + "num_input_tokens_seen": 244630400, + "step": 11339, + "time_per_iteration": 2.723975896835327 + }, + { + "auxiliary_loss_clip": 0.01062968, + "auxiliary_loss_mlp": 0.01033793, + "balance_loss_clip": 1.0303812, + "balance_loss_mlp": 1.02206326, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 2.0539619778549283, + "language_loss": 0.70469296, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72566062, + "num_input_tokens_seen": 244649155, + "step": 11340, + "time_per_iteration": 2.701831579208374 + }, + { + "auxiliary_loss_clip": 0.01083021, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.03783774, + "balance_loss_mlp": 1.0220139, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.783359425744478, + "language_loss": 0.84084404, + "learning_rate": 9.709125403986722e-07, + "loss": 0.8620109, + "num_input_tokens_seen": 244665470, + "step": 11341, + "time_per_iteration": 2.6339163780212402 + }, + { + "auxiliary_loss_clip": 0.01066857, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.03327394, + "balance_loss_mlp": 1.02175117, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.8546683332681655, + "language_loss": 0.67927659, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70028818, + "num_input_tokens_seen": 244684390, + "step": 11342, + "time_per_iteration": 2.6654911041259766 + }, + { + "auxiliary_loss_clip": 0.01048161, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.03516519, + "balance_loss_mlp": 1.01500368, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.5811134820862742, + "language_loss": 0.74710041, + "learning_rate": 9.702447196107963e-07, + "loss": 0.76784837, + "num_input_tokens_seen": 244703370, + "step": 11343, + "time_per_iteration": 2.710488796234131 + }, + { + "auxiliary_loss_clip": 0.01058789, + "auxiliary_loss_mlp": 0.01040805, + "balance_loss_clip": 1.03439331, + "balance_loss_mlp": 1.02779937, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 2.3362042227608506, + "language_loss": 0.79676312, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81775904, + "num_input_tokens_seen": 244723325, + "step": 11344, + "time_per_iteration": 4.297702789306641 + }, + { + "auxiliary_loss_clip": 0.01066167, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.03385651, + "balance_loss_mlp": 1.01763344, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.352186040358079, + "language_loss": 0.66669345, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68764919, + "num_input_tokens_seen": 244745650, + "step": 11345, + "time_per_iteration": 2.789483070373535 + }, + { + "auxiliary_loss_clip": 0.01083368, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.03557849, + "balance_loss_mlp": 1.01911998, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.914190723188827, + "language_loss": 0.64921141, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67035937, + "num_input_tokens_seen": 244760270, + "step": 11346, + "time_per_iteration": 2.6470983028411865 + }, + { + "auxiliary_loss_clip": 0.01026833, + "auxiliary_loss_mlp": 0.00749875, + "balance_loss_clip": 1.02579689, + "balance_loss_mlp": 1.00033641, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.5532869129662077, + "language_loss": 0.78642583, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80419284, + "num_input_tokens_seen": 244779565, + "step": 11347, + "time_per_iteration": 2.7158925533294678 + }, + { + "auxiliary_loss_clip": 0.01017759, + "auxiliary_loss_mlp": 0.01007868, + "balance_loss_clip": 1.00631046, + "balance_loss_mlp": 1.00680733, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7316301444029892, + "language_loss": 0.52566254, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54591882, + "num_input_tokens_seen": 244838480, + "step": 11348, + "time_per_iteration": 3.1651339530944824 + }, + { + "auxiliary_loss_clip": 0.01097959, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.03418958, + "balance_loss_mlp": 1.02181053, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.755053791195205, + "language_loss": 0.79893368, + "learning_rate": 9.682421948143873e-07, + "loss": 0.8202461, + "num_input_tokens_seen": 244855265, + "step": 11349, + "time_per_iteration": 2.5498242378234863 + }, + { + "auxiliary_loss_clip": 0.01095699, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.03726041, + "balance_loss_mlp": 1.01708269, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 1.6870978319348822, + "language_loss": 0.73505509, + "learning_rate": 9.67908577543096e-07, + "loss": 0.75631922, + "num_input_tokens_seen": 244875555, + "step": 11350, + "time_per_iteration": 2.7551040649414062 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.03694093, + "balance_loss_mlp": 1.01874042, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.5414551426620147, + "language_loss": 0.79207075, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81339884, + "num_input_tokens_seen": 244895270, + "step": 11351, + "time_per_iteration": 2.5254874229431152 + }, + { + "auxiliary_loss_clip": 0.01088776, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.03425622, + "balance_loss_mlp": 1.02236676, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.575684288116048, + "language_loss": 0.73167062, + "learning_rate": 9.672414604241954e-07, + "loss": 0.7528944, + "num_input_tokens_seen": 244914535, + "step": 11352, + "time_per_iteration": 2.619374990463257 + }, + { + "auxiliary_loss_clip": 0.01053218, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.03193617, + "balance_loss_mlp": 1.02441597, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 2.3292070362827677, + "language_loss": 0.80205989, + "learning_rate": 9.669079606018814e-07, + "loss": 0.82296336, + "num_input_tokens_seen": 244936095, + "step": 11353, + "time_per_iteration": 2.7475473880767822 + }, + { + "auxiliary_loss_clip": 0.01088583, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.03325284, + "balance_loss_mlp": 1.01781082, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.611136277222705, + "language_loss": 0.78293175, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80411327, + "num_input_tokens_seen": 244955290, + "step": 11354, + "time_per_iteration": 4.087191820144653 + }, + { + "auxiliary_loss_clip": 0.0104396, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.03389001, + "balance_loss_mlp": 1.0168668, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.168649331164401, + "language_loss": 0.62119448, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64191794, + "num_input_tokens_seen": 244972935, + "step": 11355, + "time_per_iteration": 2.701565742492676 + }, + { + "auxiliary_loss_clip": 0.01041007, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.02850497, + "balance_loss_mlp": 1.01857376, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 2.0072713562199644, + "language_loss": 0.82057226, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84129047, + "num_input_tokens_seen": 244989440, + "step": 11356, + "time_per_iteration": 2.647899866104126 + }, + { + "auxiliary_loss_clip": 0.01085461, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.03811431, + "balance_loss_mlp": 1.01786113, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 1.8489618204656535, + "language_loss": 0.7845875, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80574334, + "num_input_tokens_seen": 245007830, + "step": 11357, + "time_per_iteration": 2.6226699352264404 + }, + { + "auxiliary_loss_clip": 0.01004753, + "auxiliary_loss_mlp": 0.01011535, + "balance_loss_clip": 1.00448096, + "balance_loss_mlp": 1.01017022, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8166540217817262, + "language_loss": 0.596632, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61679494, + "num_input_tokens_seen": 245070720, + "step": 11358, + "time_per_iteration": 4.722620725631714 + }, + { + "auxiliary_loss_clip": 0.01056386, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_clip": 1.02925706, + "balance_loss_mlp": 1.02909422, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 2.0783544186806187, + "language_loss": 0.7814225, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80242276, + "num_input_tokens_seen": 245089070, + "step": 11359, + "time_per_iteration": 2.695909023284912 + }, + { + "auxiliary_loss_clip": 0.01089663, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.03416026, + "balance_loss_mlp": 1.02041817, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 2.1675423684888475, + "language_loss": 0.8170951, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83830881, + "num_input_tokens_seen": 245106500, + "step": 11360, + "time_per_iteration": 2.6683523654937744 + }, + { + "auxiliary_loss_clip": 0.01087293, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.03611374, + "balance_loss_mlp": 1.02474499, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 2.0691917872229815, + "language_loss": 0.75589871, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77715302, + "num_input_tokens_seen": 245125260, + "step": 11361, + "time_per_iteration": 2.5936219692230225 + }, + { + "auxiliary_loss_clip": 0.01011347, + "auxiliary_loss_mlp": 0.01006568, + "balance_loss_clip": 1.01544666, + "balance_loss_mlp": 1.00533414, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.866204892166431, + "language_loss": 0.5970943, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61727345, + "num_input_tokens_seen": 245188730, + "step": 11362, + "time_per_iteration": 3.306150197982788 + }, + { + "auxiliary_loss_clip": 0.01074439, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.03230119, + "balance_loss_mlp": 1.01953161, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 3.491005998130259, + "language_loss": 0.75419462, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77526295, + "num_input_tokens_seen": 245205065, + "step": 11363, + "time_per_iteration": 2.6108739376068115 + }, + { + "auxiliary_loss_clip": 0.01077518, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.03293145, + "balance_loss_mlp": 1.02629566, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.1612359787900908, + "language_loss": 0.88859773, + "learning_rate": 9.632420508845063e-07, + "loss": 0.90975744, + "num_input_tokens_seen": 245224265, + "step": 11364, + "time_per_iteration": 2.65693736076355 + }, + { + "auxiliary_loss_clip": 0.01075877, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.03262615, + "balance_loss_mlp": 1.02061152, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 1.8572706887945845, + "language_loss": 0.88210642, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90317976, + "num_input_tokens_seen": 245243360, + "step": 11365, + "time_per_iteration": 2.659421682357788 + }, + { + "auxiliary_loss_clip": 0.01063935, + "auxiliary_loss_mlp": 0.0104246, + "balance_loss_clip": 1.03467333, + "balance_loss_mlp": 1.02876914, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.9197651420489708, + "language_loss": 0.80861914, + "learning_rate": 9.625760324338272e-07, + "loss": 0.82968307, + "num_input_tokens_seen": 245256350, + "step": 11366, + "time_per_iteration": 2.685624599456787 + }, + { + "auxiliary_loss_clip": 0.01078325, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.03221774, + "balance_loss_mlp": 1.01660681, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.520566827055259, + "language_loss": 0.76613426, + "learning_rate": 9.622430822110062e-07, + "loss": 0.7871989, + "num_input_tokens_seen": 245277575, + "step": 11367, + "time_per_iteration": 2.7115209102630615 + }, + { + "auxiliary_loss_clip": 0.01079143, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.03786719, + "balance_loss_mlp": 1.02438951, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.4615853425803544, + "language_loss": 0.68830645, + "learning_rate": 9.619101713400312e-07, + "loss": 0.70946252, + "num_input_tokens_seen": 245296615, + "step": 11368, + "time_per_iteration": 4.246813774108887 + }, + { + "auxiliary_loss_clip": 0.01056354, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.02978539, + "balance_loss_mlp": 1.0246284, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.7866046563822535, + "language_loss": 0.73093826, + "learning_rate": 9.615772998335261e-07, + "loss": 0.75186288, + "num_input_tokens_seen": 245316275, + "step": 11369, + "time_per_iteration": 2.7074551582336426 + }, + { + "auxiliary_loss_clip": 0.01087117, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.03365374, + "balance_loss_mlp": 1.0196681, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 2.8923230080441646, + "language_loss": 0.78881478, + "learning_rate": 9.612444677041138e-07, + "loss": 0.81000125, + "num_input_tokens_seen": 245334595, + "step": 11370, + "time_per_iteration": 2.604142904281616 + }, + { + "auxiliary_loss_clip": 0.01016327, + "auxiliary_loss_mlp": 0.01007521, + "balance_loss_clip": 1.00505424, + "balance_loss_mlp": 1.00640631, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7418961337987238, + "language_loss": 0.59819967, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61843812, + "num_input_tokens_seen": 245389750, + "step": 11371, + "time_per_iteration": 3.0432708263397217 + }, + { + "auxiliary_loss_clip": 0.01072144, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.0329988, + "balance_loss_mlp": 1.01763988, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.4395541071776456, + "language_loss": 0.63543713, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65644115, + "num_input_tokens_seen": 245407530, + "step": 11372, + "time_per_iteration": 2.659757375717163 + }, + { + "auxiliary_loss_clip": 0.01086828, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.03420949, + "balance_loss_mlp": 1.01468563, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.4603011540200455, + "language_loss": 0.7167787, + "learning_rate": 9.602462077046375e-07, + "loss": 0.73791128, + "num_input_tokens_seen": 245427000, + "step": 11373, + "time_per_iteration": 2.6394870281219482 + }, + { + "auxiliary_loss_clip": 0.00996536, + "auxiliary_loss_mlp": 0.01006779, + "balance_loss_clip": 1.00474811, + "balance_loss_mlp": 1.00559855, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.2211413728354141, + "language_loss": 0.56664729, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58668041, + "num_input_tokens_seen": 245491620, + "step": 11374, + "time_per_iteration": 3.4250683784484863 + }, + { + "auxiliary_loss_clip": 0.01092643, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.03615928, + "balance_loss_mlp": 1.01526999, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.8699282253179363, + "language_loss": 0.74144506, + "learning_rate": 9.595808981551312e-07, + "loss": 0.76264453, + "num_input_tokens_seen": 245511285, + "step": 11375, + "time_per_iteration": 2.6485912799835205 + }, + { + "auxiliary_loss_clip": 0.01080845, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.03590739, + "balance_loss_mlp": 1.02267528, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.6341022561080205, + "language_loss": 0.70712215, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72827959, + "num_input_tokens_seen": 245532910, + "step": 11376, + "time_per_iteration": 2.8318493366241455 + }, + { + "auxiliary_loss_clip": 0.01101637, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.03412974, + "balance_loss_mlp": 1.01880932, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 2.156108704571042, + "language_loss": 0.74362999, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76495409, + "num_input_tokens_seen": 245550540, + "step": 11377, + "time_per_iteration": 2.5526463985443115 + }, + { + "auxiliary_loss_clip": 0.01007901, + "auxiliary_loss_mlp": 0.01003041, + "balance_loss_clip": 1.00661194, + "balance_loss_mlp": 1.0019505, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7440537382107384, + "language_loss": 0.56876349, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58887291, + "num_input_tokens_seen": 245619570, + "step": 11378, + "time_per_iteration": 3.2470362186431885 + }, + { + "auxiliary_loss_clip": 0.01100688, + "auxiliary_loss_mlp": 0.01035335, + "balance_loss_clip": 1.03472781, + "balance_loss_mlp": 1.02336001, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.723868121240188, + "language_loss": 0.78124833, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80260849, + "num_input_tokens_seen": 245637980, + "step": 11379, + "time_per_iteration": 2.6637744903564453 + }, + { + "auxiliary_loss_clip": 0.0109559, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.03459191, + "balance_loss_mlp": 1.01614082, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 2.139810168355607, + "language_loss": 0.68992329, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71114218, + "num_input_tokens_seen": 245655690, + "step": 11380, + "time_per_iteration": 2.51934814453125 + }, + { + "auxiliary_loss_clip": 0.01077441, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.03442359, + "balance_loss_mlp": 1.01760662, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 2.1790922228497966, + "language_loss": 0.78310579, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80417788, + "num_input_tokens_seen": 245671525, + "step": 11381, + "time_per_iteration": 2.563094139099121 + }, + { + "auxiliary_loss_clip": 0.01015222, + "auxiliary_loss_mlp": 0.00998536, + "balance_loss_clip": 1.00478947, + "balance_loss_mlp": 0.99745142, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8752307038404259, + "language_loss": 0.67170137, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69183892, + "num_input_tokens_seen": 245724115, + "step": 11382, + "time_per_iteration": 2.991509199142456 + }, + { + "auxiliary_loss_clip": 0.01015334, + "auxiliary_loss_mlp": 0.0100042, + "balance_loss_clip": 1.00472963, + "balance_loss_mlp": 0.99935359, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8227644034560496, + "language_loss": 0.5806073, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60076487, + "num_input_tokens_seen": 245789245, + "step": 11383, + "time_per_iteration": 3.164428949356079 + }, + { + "auxiliary_loss_clip": 0.01046399, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.03003049, + "balance_loss_mlp": 1.02016377, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 1.7131371691095125, + "language_loss": 0.79684585, + "learning_rate": 9.565889595521517e-07, + "loss": 0.81763709, + "num_input_tokens_seen": 245812420, + "step": 11384, + "time_per_iteration": 4.339466094970703 + }, + { + "auxiliary_loss_clip": 0.01090669, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.03482366, + "balance_loss_mlp": 1.02219224, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 2.0095034148792603, + "language_loss": 0.77066231, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79190707, + "num_input_tokens_seen": 245829135, + "step": 11385, + "time_per_iteration": 2.610187530517578 + }, + { + "auxiliary_loss_clip": 0.01065084, + "auxiliary_loss_mlp": 0.01036281, + "balance_loss_clip": 1.03286862, + "balance_loss_mlp": 1.02301323, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.0987658502257855, + "language_loss": 0.84502292, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86603653, + "num_input_tokens_seen": 245847140, + "step": 11386, + "time_per_iteration": 2.6924548149108887 + }, + { + "auxiliary_loss_clip": 0.01089809, + "auxiliary_loss_mlp": 0.01036338, + "balance_loss_clip": 1.03559065, + "balance_loss_mlp": 1.02529895, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.348948675619523, + "language_loss": 0.8329438, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85420531, + "num_input_tokens_seen": 245862855, + "step": 11387, + "time_per_iteration": 2.6019914150238037 + }, + { + "auxiliary_loss_clip": 0.010785, + "auxiliary_loss_mlp": 0.01027004, + "balance_loss_clip": 1.03107858, + "balance_loss_mlp": 1.01536322, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.8017203630972745, + "language_loss": 0.72213948, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74319458, + "num_input_tokens_seen": 245885415, + "step": 11388, + "time_per_iteration": 2.722468614578247 + }, + { + "auxiliary_loss_clip": 0.01083458, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.03500295, + "balance_loss_mlp": 1.01842558, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 1.983313253629327, + "language_loss": 0.62504995, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64617586, + "num_input_tokens_seen": 245906285, + "step": 11389, + "time_per_iteration": 2.8985869884490967 + }, + { + "auxiliary_loss_clip": 0.01004465, + "auxiliary_loss_mlp": 0.01005006, + "balance_loss_clip": 1.00439644, + "balance_loss_mlp": 1.00395739, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7838712239569479, + "language_loss": 0.55990744, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58000213, + "num_input_tokens_seen": 245967620, + "step": 11390, + "time_per_iteration": 3.3119635581970215 + }, + { + "auxiliary_loss_clip": 0.01059246, + "auxiliary_loss_mlp": 0.00749512, + "balance_loss_clip": 1.03238559, + "balance_loss_mlp": 1.00027752, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 1.9582994353034189, + "language_loss": 0.88036859, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89845616, + "num_input_tokens_seen": 245985075, + "step": 11391, + "time_per_iteration": 2.7470719814300537 + }, + { + "auxiliary_loss_clip": 0.01054207, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.03434765, + "balance_loss_mlp": 1.01928616, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.8608113259036607, + "language_loss": 0.79439783, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81524718, + "num_input_tokens_seen": 246003560, + "step": 11392, + "time_per_iteration": 2.765803337097168 + }, + { + "auxiliary_loss_clip": 0.01073472, + "auxiliary_loss_mlp": 0.01027464, + "balance_loss_clip": 1.03102803, + "balance_loss_mlp": 1.01582968, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 4.420250344365269, + "language_loss": 0.70213753, + "learning_rate": 9.536002258147104e-07, + "loss": 0.72314692, + "num_input_tokens_seen": 246019600, + "step": 11393, + "time_per_iteration": 2.6382884979248047 + }, + { + "auxiliary_loss_clip": 0.01060865, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.03369975, + "balance_loss_mlp": 1.01746178, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.7664181510039718, + "language_loss": 0.64291501, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66382653, + "num_input_tokens_seen": 246038920, + "step": 11394, + "time_per_iteration": 4.2238006591796875 + }, + { + "auxiliary_loss_clip": 0.01070656, + "auxiliary_loss_mlp": 0.00749769, + "balance_loss_clip": 1.03244519, + "balance_loss_mlp": 1.00031376, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 1.9714336108048756, + "language_loss": 0.80820912, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82641333, + "num_input_tokens_seen": 246060490, + "step": 11395, + "time_per_iteration": 2.743739366531372 + }, + { + "auxiliary_loss_clip": 0.01061114, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.034899, + "balance_loss_mlp": 1.01878691, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.707745711408893, + "language_loss": 0.7288996, + "learning_rate": 9.526046950148527e-07, + "loss": 0.74982154, + "num_input_tokens_seen": 246081465, + "step": 11396, + "time_per_iteration": 2.806980848312378 + }, + { + "auxiliary_loss_clip": 0.01068041, + "auxiliary_loss_mlp": 0.01025527, + "balance_loss_clip": 1.03400362, + "balance_loss_mlp": 1.01284885, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 2.4660619224487985, + "language_loss": 0.79259211, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81352776, + "num_input_tokens_seen": 246096110, + "step": 11397, + "time_per_iteration": 2.6837692260742188 + }, + { + "auxiliary_loss_clip": 0.01016923, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.02822268, + "balance_loss_mlp": 1.02095103, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 2.54610681684988, + "language_loss": 0.71653616, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73704076, + "num_input_tokens_seen": 246114785, + "step": 11398, + "time_per_iteration": 2.808030366897583 + }, + { + "auxiliary_loss_clip": 0.01049165, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.03201985, + "balance_loss_mlp": 1.02291965, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.8151163510127155, + "language_loss": 0.70984483, + "learning_rate": 9.516095216709996e-07, + "loss": 0.73067605, + "num_input_tokens_seen": 246136375, + "step": 11399, + "time_per_iteration": 4.222275257110596 + }, + { + "auxiliary_loss_clip": 0.01083727, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.03416181, + "balance_loss_mlp": 1.02021599, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.6072417330872573, + "language_loss": 0.70290768, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72406524, + "num_input_tokens_seen": 246155090, + "step": 11400, + "time_per_iteration": 2.5695207118988037 + }, + { + "auxiliary_loss_clip": 0.01063077, + "auxiliary_loss_mlp": 0.01037235, + "balance_loss_clip": 1.03518236, + "balance_loss_mlp": 1.02267921, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.8319041677843313, + "language_loss": 0.78022504, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80122811, + "num_input_tokens_seen": 246172645, + "step": 11401, + "time_per_iteration": 2.702475070953369 + }, + { + "auxiliary_loss_clip": 0.01099693, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.03527868, + "balance_loss_mlp": 1.01923323, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 2.1770787343744695, + "language_loss": 0.75655198, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77785122, + "num_input_tokens_seen": 246189055, + "step": 11402, + "time_per_iteration": 2.573282241821289 + }, + { + "auxiliary_loss_clip": 0.01081525, + "auxiliary_loss_mlp": 0.01039997, + "balance_loss_clip": 1.03173363, + "balance_loss_mlp": 1.02697921, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 2.233491218009896, + "language_loss": 0.72742116, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74863636, + "num_input_tokens_seen": 246207990, + "step": 11403, + "time_per_iteration": 2.681290864944458 + }, + { + "auxiliary_loss_clip": 0.01098032, + "auxiliary_loss_mlp": 0.0102687, + "balance_loss_clip": 1.03475368, + "balance_loss_mlp": 1.0157423, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.181513379717884, + "language_loss": 0.81045139, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83170044, + "num_input_tokens_seen": 246221595, + "step": 11404, + "time_per_iteration": 2.593136787414551 + }, + { + "auxiliary_loss_clip": 0.01069061, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.03285813, + "balance_loss_mlp": 1.02601075, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3520212390041497, + "language_loss": 0.7801162, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80118382, + "num_input_tokens_seen": 246242970, + "step": 11405, + "time_per_iteration": 2.703247547149658 + }, + { + "auxiliary_loss_clip": 0.01015475, + "auxiliary_loss_mlp": 0.01010205, + "balance_loss_clip": 1.00433064, + "balance_loss_mlp": 1.00915623, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.8002037184061396, + "language_loss": 0.61025631, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63051307, + "num_input_tokens_seen": 246300405, + "step": 11406, + "time_per_iteration": 3.169276475906372 + }, + { + "auxiliary_loss_clip": 0.01069546, + "auxiliary_loss_mlp": 0.01037445, + "balance_loss_clip": 1.03357816, + "balance_loss_mlp": 1.02479732, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.7956871986436813, + "language_loss": 0.77046108, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79153103, + "num_input_tokens_seen": 246318780, + "step": 11407, + "time_per_iteration": 2.7416255474090576 + }, + { + "auxiliary_loss_clip": 0.01079891, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.03598094, + "balance_loss_mlp": 1.02203727, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.2482747883179193, + "language_loss": 0.70756584, + "learning_rate": 9.486261497711991e-07, + "loss": 0.72870994, + "num_input_tokens_seen": 246339405, + "step": 11408, + "time_per_iteration": 4.282681465148926 + }, + { + "auxiliary_loss_clip": 0.01090723, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.03415012, + "balance_loss_mlp": 1.01554704, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.7344844348077753, + "language_loss": 0.70220476, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72338498, + "num_input_tokens_seen": 246357055, + "step": 11409, + "time_per_iteration": 2.58443546295166 + }, + { + "auxiliary_loss_clip": 0.01054351, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.03565109, + "balance_loss_mlp": 1.0178802, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 2.392054678599037, + "language_loss": 0.78441465, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80523938, + "num_input_tokens_seen": 246374050, + "step": 11410, + "time_per_iteration": 2.711414098739624 + }, + { + "auxiliary_loss_clip": 0.01091986, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.03436756, + "balance_loss_mlp": 1.02230954, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 2.6314168166383496, + "language_loss": 0.71542466, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73669285, + "num_input_tokens_seen": 246392910, + "step": 11411, + "time_per_iteration": 2.593200445175171 + }, + { + "auxiliary_loss_clip": 0.01046647, + "auxiliary_loss_mlp": 0.0103692, + "balance_loss_clip": 1.0308665, + "balance_loss_mlp": 1.02273369, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 2.778524297396448, + "language_loss": 0.70042652, + "learning_rate": 9.473012427332654e-07, + "loss": 0.72126216, + "num_input_tokens_seen": 246411540, + "step": 11412, + "time_per_iteration": 2.7017483711242676 + }, + { + "auxiliary_loss_clip": 0.01101662, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.03494835, + "balance_loss_mlp": 1.01671159, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 3.1181712525878056, + "language_loss": 0.71503645, + "learning_rate": 9.469701157384919e-07, + "loss": 0.7363413, + "num_input_tokens_seen": 246423295, + "step": 11413, + "time_per_iteration": 2.5525124073028564 + }, + { + "auxiliary_loss_clip": 0.01090048, + "auxiliary_loss_mlp": 0.01034076, + "balance_loss_clip": 1.03453994, + "balance_loss_mlp": 1.02264941, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.8664547807750342, + "language_loss": 0.73569405, + "learning_rate": 9.466390286747164e-07, + "loss": 0.7569353, + "num_input_tokens_seen": 246441045, + "step": 11414, + "time_per_iteration": 2.614997148513794 + }, + { + "auxiliary_loss_clip": 0.01075716, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.03392291, + "balance_loss_mlp": 1.01722884, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 3.7659358735561206, + "language_loss": 0.86787248, + "learning_rate": 9.46307981554495e-07, + "loss": 0.888924, + "num_input_tokens_seen": 246456905, + "step": 11415, + "time_per_iteration": 2.593515634536743 + }, + { + "auxiliary_loss_clip": 0.01091294, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.03478169, + "balance_loss_mlp": 1.02112913, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.6895528196663594, + "language_loss": 0.67103827, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69228196, + "num_input_tokens_seen": 246477545, + "step": 11416, + "time_per_iteration": 2.7034833431243896 + }, + { + "auxiliary_loss_clip": 0.01074799, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.03398466, + "balance_loss_mlp": 1.02241552, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.443298902364666, + "language_loss": 0.76066464, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78176701, + "num_input_tokens_seen": 246496705, + "step": 11417, + "time_per_iteration": 2.665539264678955 + }, + { + "auxiliary_loss_clip": 0.01071395, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.03180206, + "balance_loss_mlp": 1.01784873, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 2.0587829750167983, + "language_loss": 0.77326238, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79428542, + "num_input_tokens_seen": 246514860, + "step": 11418, + "time_per_iteration": 2.739900827407837 + }, + { + "auxiliary_loss_clip": 0.01054114, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.03374696, + "balance_loss_mlp": 1.01988876, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.6249960043775404, + "language_loss": 0.76485789, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78571332, + "num_input_tokens_seen": 246536145, + "step": 11419, + "time_per_iteration": 2.890357494354248 + }, + { + "auxiliary_loss_clip": 0.0109847, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.03366613, + "balance_loss_mlp": 1.02108228, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 3.8496972535426224, + "language_loss": 0.71439129, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73569518, + "num_input_tokens_seen": 246553265, + "step": 11420, + "time_per_iteration": 2.6504268646240234 + }, + { + "auxiliary_loss_clip": 0.01057936, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.03022265, + "balance_loss_mlp": 1.01581919, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.403034528972662, + "language_loss": 0.74722719, + "learning_rate": 9.443225383506712e-07, + "loss": 0.7680881, + "num_input_tokens_seen": 246575130, + "step": 11421, + "time_per_iteration": 2.790259599685669 + }, + { + "auxiliary_loss_clip": 0.01086789, + "auxiliary_loss_mlp": 0.01027196, + "balance_loss_clip": 1.03508067, + "balance_loss_mlp": 1.01576376, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.6761080644633564, + "language_loss": 0.77072525, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79186511, + "num_input_tokens_seen": 246593095, + "step": 11422, + "time_per_iteration": 2.6249170303344727 + }, + { + "auxiliary_loss_clip": 0.0109297, + "auxiliary_loss_mlp": 0.01037316, + "balance_loss_clip": 1.03588891, + "balance_loss_mlp": 1.02510929, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.7051308756802988, + "language_loss": 0.77138126, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79268408, + "num_input_tokens_seen": 246612165, + "step": 11423, + "time_per_iteration": 2.6421408653259277 + }, + { + "auxiliary_loss_clip": 0.01069769, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.03518629, + "balance_loss_mlp": 1.0183686, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.3940921541547828, + "language_loss": 0.72664785, + "learning_rate": 9.433303570032129e-07, + "loss": 0.74764454, + "num_input_tokens_seen": 246632065, + "step": 11424, + "time_per_iteration": 4.214555978775024 + }, + { + "auxiliary_loss_clip": 0.01079765, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.03443503, + "balance_loss_mlp": 1.01629853, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 2.1140894672534523, + "language_loss": 0.64977503, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67084891, + "num_input_tokens_seen": 246651245, + "step": 11425, + "time_per_iteration": 2.7489118576049805 + }, + { + "auxiliary_loss_clip": 0.01062749, + "auxiliary_loss_mlp": 0.01025619, + "balance_loss_clip": 1.03404486, + "balance_loss_mlp": 1.01430058, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.4746434920482907, + "language_loss": 0.71814662, + "learning_rate": 9.426691030957657e-07, + "loss": 0.7390303, + "num_input_tokens_seen": 246672225, + "step": 11426, + "time_per_iteration": 2.703701972961426 + }, + { + "auxiliary_loss_clip": 0.01042764, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.03002095, + "balance_loss_mlp": 1.02169764, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.3384843332364555, + "language_loss": 0.850191, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87096161, + "num_input_tokens_seen": 246688385, + "step": 11427, + "time_per_iteration": 2.7075536251068115 + }, + { + "auxiliary_loss_clip": 0.01089363, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.0356847, + "balance_loss_mlp": 1.0211575, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.4538813596977713, + "language_loss": 0.75926584, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78048897, + "num_input_tokens_seen": 246710730, + "step": 11428, + "time_per_iteration": 2.66890811920166 + }, + { + "auxiliary_loss_clip": 0.01066263, + "auxiliary_loss_mlp": 0.01040147, + "balance_loss_clip": 1.03252351, + "balance_loss_mlp": 1.02722502, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 4.425223108766802, + "language_loss": 0.73166084, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75272495, + "num_input_tokens_seen": 246730350, + "step": 11429, + "time_per_iteration": 2.7895214557647705 + }, + { + "auxiliary_loss_clip": 0.01075855, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.033692, + "balance_loss_mlp": 1.01526713, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 1.9163521728819288, + "language_loss": 0.832546, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85357118, + "num_input_tokens_seen": 246751700, + "step": 11430, + "time_per_iteration": 2.7684528827667236 + }, + { + "auxiliary_loss_clip": 0.01091232, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.03464949, + "balance_loss_mlp": 1.02167654, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.0118105391221337, + "language_loss": 0.7022832, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72352731, + "num_input_tokens_seen": 246769860, + "step": 11431, + "time_per_iteration": 2.5853257179260254 + }, + { + "auxiliary_loss_clip": 0.01078775, + "auxiliary_loss_mlp": 0.00749666, + "balance_loss_clip": 1.03223503, + "balance_loss_mlp": 1.00037646, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.7245545346372573, + "language_loss": 0.80076981, + "learning_rate": 9.406863040327355e-07, + "loss": 0.81905425, + "num_input_tokens_seen": 246789905, + "step": 11432, + "time_per_iteration": 2.7115352153778076 + }, + { + "auxiliary_loss_clip": 0.01076766, + "auxiliary_loss_mlp": 0.01023623, + "balance_loss_clip": 1.03375661, + "balance_loss_mlp": 1.01210701, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.5249697018560715, + "language_loss": 0.67693245, + "learning_rate": 9.403559780416295e-07, + "loss": 0.69793636, + "num_input_tokens_seen": 246808815, + "step": 11433, + "time_per_iteration": 4.386366367340088 + }, + { + "auxiliary_loss_clip": 0.01092174, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.0373311, + "balance_loss_mlp": 1.02447033, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.9353233115863642, + "language_loss": 0.72665405, + "learning_rate": 9.400256922323309e-07, + "loss": 0.74793267, + "num_input_tokens_seen": 246829775, + "step": 11434, + "time_per_iteration": 2.769231081008911 + }, + { + "auxiliary_loss_clip": 0.01065472, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.03645515, + "balance_loss_mlp": 1.01560497, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.6830224854908553, + "language_loss": 0.80602777, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82695591, + "num_input_tokens_seen": 246848045, + "step": 11435, + "time_per_iteration": 2.68908953666687 + }, + { + "auxiliary_loss_clip": 0.01101016, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.03387904, + "balance_loss_mlp": 1.0199666, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 1.935874266848337, + "language_loss": 0.81045729, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83178896, + "num_input_tokens_seen": 246866095, + "step": 11436, + "time_per_iteration": 2.5950253009796143 + }, + { + "auxiliary_loss_clip": 0.01050787, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.02868974, + "balance_loss_mlp": 1.02597833, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 1.8089954448899, + "language_loss": 0.81956476, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84044445, + "num_input_tokens_seen": 246883975, + "step": 11437, + "time_per_iteration": 2.6939620971679688 + }, + { + "auxiliary_loss_clip": 0.01081338, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.03462005, + "balance_loss_mlp": 1.02091765, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 2.195342870263757, + "language_loss": 0.77892017, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80006862, + "num_input_tokens_seen": 246901560, + "step": 11438, + "time_per_iteration": 4.122196435928345 + }, + { + "auxiliary_loss_clip": 0.0109494, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.03345525, + "balance_loss_mlp": 1.02088952, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.8510904586098935, + "language_loss": 0.72302985, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74429643, + "num_input_tokens_seen": 246922655, + "step": 11439, + "time_per_iteration": 2.668314218521118 + }, + { + "auxiliary_loss_clip": 0.01089252, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.03476572, + "balance_loss_mlp": 1.01500773, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 1.8417794062118231, + "language_loss": 0.75510401, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77626425, + "num_input_tokens_seen": 246940100, + "step": 11440, + "time_per_iteration": 2.7737770080566406 + }, + { + "auxiliary_loss_clip": 0.01050144, + "auxiliary_loss_mlp": 0.01033714, + "balance_loss_clip": 1.0300982, + "balance_loss_mlp": 1.02237093, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.6594128647307476, + "language_loss": 0.72236311, + "learning_rate": 9.377148177097167e-07, + "loss": 0.74320173, + "num_input_tokens_seen": 246958545, + "step": 11441, + "time_per_iteration": 2.593017578125 + }, + { + "auxiliary_loss_clip": 0.01059389, + "auxiliary_loss_mlp": 0.01041022, + "balance_loss_clip": 1.03073621, + "balance_loss_mlp": 1.02699685, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.7858885147297159, + "language_loss": 0.66424948, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68525362, + "num_input_tokens_seen": 246974805, + "step": 11442, + "time_per_iteration": 2.6218862533569336 + }, + { + "auxiliary_loss_clip": 0.01085678, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.03479469, + "balance_loss_mlp": 1.01895285, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 2.0714730404417656, + "language_loss": 0.69670117, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71785283, + "num_input_tokens_seen": 246992505, + "step": 11443, + "time_per_iteration": 2.567471742630005 + }, + { + "auxiliary_loss_clip": 0.01079408, + "auxiliary_loss_mlp": 0.01029344, + "balance_loss_clip": 1.03509855, + "balance_loss_mlp": 1.01746523, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.3976266519060716, + "language_loss": 0.76281059, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78389812, + "num_input_tokens_seen": 247013370, + "step": 11444, + "time_per_iteration": 2.6844890117645264 + }, + { + "auxiliary_loss_clip": 0.01097464, + "auxiliary_loss_mlp": 0.01026613, + "balance_loss_clip": 1.03378105, + "balance_loss_mlp": 1.01556802, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 3.0112952887155244, + "language_loss": 0.76374751, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78498828, + "num_input_tokens_seen": 247029855, + "step": 11445, + "time_per_iteration": 2.560025215148926 + }, + { + "auxiliary_loss_clip": 0.01014328, + "auxiliary_loss_mlp": 0.01000025, + "balance_loss_clip": 1.00387502, + "balance_loss_mlp": 0.99897641, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8236799866250484, + "language_loss": 0.58334816, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60349166, + "num_input_tokens_seen": 247085030, + "step": 11446, + "time_per_iteration": 3.1749048233032227 + }, + { + "auxiliary_loss_clip": 0.01086569, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.0309186, + "balance_loss_mlp": 1.01545668, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.466201399972883, + "language_loss": 0.75679904, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77793849, + "num_input_tokens_seen": 247104840, + "step": 11447, + "time_per_iteration": 4.19299054145813 + }, + { + "auxiliary_loss_clip": 0.01079202, + "auxiliary_loss_mlp": 0.0103269, + "balance_loss_clip": 1.03168344, + "balance_loss_mlp": 1.02087069, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.467605455371534, + "language_loss": 0.73573339, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75685227, + "num_input_tokens_seen": 247121905, + "step": 11448, + "time_per_iteration": 2.6640260219573975 + }, + { + "auxiliary_loss_clip": 0.01084291, + "auxiliary_loss_mlp": 0.01040183, + "balance_loss_clip": 1.03232551, + "balance_loss_mlp": 1.02611029, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.602460080029714, + "language_loss": 0.74427265, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76551741, + "num_input_tokens_seen": 247142375, + "step": 11449, + "time_per_iteration": 2.6251885890960693 + }, + { + "auxiliary_loss_clip": 0.01097794, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.03413844, + "balance_loss_mlp": 1.02088022, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.6113745619099358, + "language_loss": 0.69950354, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72079831, + "num_input_tokens_seen": 247161095, + "step": 11450, + "time_per_iteration": 2.5679478645324707 + }, + { + "auxiliary_loss_clip": 0.0106217, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.03378963, + "balance_loss_mlp": 1.0224154, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 1.7800180121728137, + "language_loss": 0.759745, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78071952, + "num_input_tokens_seen": 247178565, + "step": 11451, + "time_per_iteration": 2.7253172397613525 + }, + { + "auxiliary_loss_clip": 0.01088154, + "auxiliary_loss_mlp": 0.01026981, + "balance_loss_clip": 1.0346992, + "balance_loss_mlp": 1.01567984, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.3923014762260084, + "language_loss": 0.69258082, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71373212, + "num_input_tokens_seen": 247202345, + "step": 11452, + "time_per_iteration": 2.6010444164276123 + }, + { + "auxiliary_loss_clip": 0.01098026, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.03380942, + "balance_loss_mlp": 1.01907492, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.5977803253488254, + "language_loss": 0.71777797, + "learning_rate": 9.337579130475042e-07, + "loss": 0.73907626, + "num_input_tokens_seen": 247219240, + "step": 11453, + "time_per_iteration": 2.539093017578125 + }, + { + "auxiliary_loss_clip": 0.01016372, + "auxiliary_loss_mlp": 0.00746728, + "balance_loss_clip": 1.00534558, + "balance_loss_mlp": 0.99987936, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7815436621097672, + "language_loss": 0.50719488, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52482587, + "num_input_tokens_seen": 247272010, + "step": 11454, + "time_per_iteration": 3.0123729705810547 + }, + { + "auxiliary_loss_clip": 0.01085171, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.03456664, + "balance_loss_mlp": 1.02131605, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.9302106960771275, + "language_loss": 0.75166821, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77284044, + "num_input_tokens_seen": 247290630, + "step": 11455, + "time_per_iteration": 2.536526918411255 + }, + { + "auxiliary_loss_clip": 0.01078518, + "auxiliary_loss_mlp": 0.010362, + "balance_loss_clip": 1.03171146, + "balance_loss_mlp": 1.02345645, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.563996700002559, + "language_loss": 0.73254192, + "learning_rate": 9.327695957583803e-07, + "loss": 0.75368917, + "num_input_tokens_seen": 247304800, + "step": 11456, + "time_per_iteration": 2.6146738529205322 + }, + { + "auxiliary_loss_clip": 0.01075721, + "auxiliary_loss_mlp": 0.0103616, + "balance_loss_clip": 1.03410077, + "balance_loss_mlp": 1.0251385, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.5777768215441632, + "language_loss": 0.80728966, + "learning_rate": 9.32440237584319e-07, + "loss": 0.82840848, + "num_input_tokens_seen": 247323450, + "step": 11457, + "time_per_iteration": 2.6239516735076904 + }, + { + "auxiliary_loss_clip": 0.01092476, + "auxiliary_loss_mlp": 0.00749427, + "balance_loss_clip": 1.03627086, + "balance_loss_mlp": 1.00021648, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5988639173915822, + "language_loss": 0.75800192, + "learning_rate": 9.321109198922301e-07, + "loss": 0.77642095, + "num_input_tokens_seen": 247343845, + "step": 11458, + "time_per_iteration": 2.6130640506744385 + }, + { + "auxiliary_loss_clip": 0.01101398, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.03562808, + "balance_loss_mlp": 1.01849341, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.79221711668101, + "language_loss": 0.68199515, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70330399, + "num_input_tokens_seen": 247356650, + "step": 11459, + "time_per_iteration": 2.500802755355835 + }, + { + "auxiliary_loss_clip": 0.01047851, + "auxiliary_loss_mlp": 0.01028136, + "balance_loss_clip": 1.03084302, + "balance_loss_mlp": 1.01666236, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.764412225119596, + "language_loss": 0.68631208, + "learning_rate": 9.314524060039221e-07, + "loss": 0.7070719, + "num_input_tokens_seen": 247377340, + "step": 11460, + "time_per_iteration": 2.755951166152954 + }, + { + "auxiliary_loss_clip": 0.01074885, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.03633535, + "balance_loss_mlp": 1.01830781, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 2.5959458055622244, + "language_loss": 0.7668739, + "learning_rate": 9.311232098326731e-07, + "loss": 0.78793609, + "num_input_tokens_seen": 247395805, + "step": 11461, + "time_per_iteration": 2.6676719188690186 + }, + { + "auxiliary_loss_clip": 0.01077493, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.03355026, + "balance_loss_mlp": 1.01929235, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.6478214393117652, + "language_loss": 0.69589472, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71697736, + "num_input_tokens_seen": 247413165, + "step": 11462, + "time_per_iteration": 2.793271780014038 + }, + { + "auxiliary_loss_clip": 0.01090618, + "auxiliary_loss_mlp": 0.01024231, + "balance_loss_clip": 1.03580332, + "balance_loss_mlp": 1.01237535, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.5236975392358647, + "language_loss": 0.87204862, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89319712, + "num_input_tokens_seen": 247433140, + "step": 11463, + "time_per_iteration": 2.650716543197632 + }, + { + "auxiliary_loss_clip": 0.01051777, + "auxiliary_loss_mlp": 0.01029693, + "balance_loss_clip": 1.03531361, + "balance_loss_mlp": 1.01979315, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.6886110584470604, + "language_loss": 0.68325442, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70406914, + "num_input_tokens_seen": 247451265, + "step": 11464, + "time_per_iteration": 2.748202323913574 + }, + { + "auxiliary_loss_clip": 0.01089773, + "auxiliary_loss_mlp": 0.01036882, + "balance_loss_clip": 1.03507066, + "balance_loss_mlp": 1.02546763, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.6655002049960468, + "language_loss": 0.65046811, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67173469, + "num_input_tokens_seen": 247471645, + "step": 11465, + "time_per_iteration": 4.107977867126465 + }, + { + "auxiliary_loss_clip": 0.01092707, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.03562331, + "balance_loss_mlp": 1.02295959, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.4236004149461958, + "language_loss": 0.72562283, + "learning_rate": 9.294778372047649e-07, + "loss": 0.7468974, + "num_input_tokens_seen": 247491170, + "step": 11466, + "time_per_iteration": 2.646665096282959 + }, + { + "auxiliary_loss_clip": 0.01100925, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.03541386, + "balance_loss_mlp": 1.01976132, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 2.05369528891536, + "language_loss": 0.72002983, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74134779, + "num_input_tokens_seen": 247509005, + "step": 11467, + "time_per_iteration": 2.5424559116363525 + }, + { + "auxiliary_loss_clip": 0.01083834, + "auxiliary_loss_mlp": 0.01037629, + "balance_loss_clip": 1.03544629, + "balance_loss_mlp": 1.02467728, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 1.8585307326254588, + "language_loss": 0.80542886, + "learning_rate": 9.288199722264156e-07, + "loss": 0.82664347, + "num_input_tokens_seen": 247527050, + "step": 11468, + "time_per_iteration": 2.55688738822937 + }, + { + "auxiliary_loss_clip": 0.01102414, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.03537846, + "balance_loss_mlp": 1.02108145, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.5205347174607302, + "language_loss": 0.66138875, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68274105, + "num_input_tokens_seen": 247547765, + "step": 11469, + "time_per_iteration": 2.656623601913452 + }, + { + "auxiliary_loss_clip": 0.01016113, + "auxiliary_loss_mlp": 0.00999516, + "balance_loss_clip": 1.00589943, + "balance_loss_mlp": 0.99863416, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.8079986664100375, + "language_loss": 0.55305195, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57320821, + "num_input_tokens_seen": 247603515, + "step": 11470, + "time_per_iteration": 3.0290603637695312 + }, + { + "auxiliary_loss_clip": 0.01086102, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.03478158, + "balance_loss_mlp": 1.0223943, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 1.7517625497166824, + "language_loss": 0.77571726, + "learning_rate": 9.278334794344715e-07, + "loss": 0.79689574, + "num_input_tokens_seen": 247622110, + "step": 11471, + "time_per_iteration": 2.5584404468536377 + }, + { + "auxiliary_loss_clip": 0.01071887, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_clip": 1.03094888, + "balance_loss_mlp": 1.01800871, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 1.6743771015481936, + "language_loss": 0.7886945, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80970782, + "num_input_tokens_seen": 247641905, + "step": 11472, + "time_per_iteration": 2.579289197921753 + }, + { + "auxiliary_loss_clip": 0.01068939, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.03205657, + "balance_loss_mlp": 1.01789522, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.6374200408392927, + "language_loss": 0.76342732, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78440428, + "num_input_tokens_seen": 247660945, + "step": 11473, + "time_per_iteration": 4.142683029174805 + }, + { + "auxiliary_loss_clip": 0.01052258, + "auxiliary_loss_mlp": 0.01041684, + "balance_loss_clip": 1.0295465, + "balance_loss_mlp": 1.02718842, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 2.0969338282270793, + "language_loss": 0.75447571, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77541518, + "num_input_tokens_seen": 247678395, + "step": 11474, + "time_per_iteration": 2.648956775665283 + }, + { + "auxiliary_loss_clip": 0.01052805, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.03724217, + "balance_loss_mlp": 1.0178386, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.5638881936055893, + "language_loss": 0.74379647, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76462126, + "num_input_tokens_seen": 247698380, + "step": 11475, + "time_per_iteration": 2.798509120941162 + }, + { + "auxiliary_loss_clip": 0.01064476, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.03005028, + "balance_loss_mlp": 1.02314329, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 4.690444956957752, + "language_loss": 0.88653517, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90752947, + "num_input_tokens_seen": 247716370, + "step": 11476, + "time_per_iteration": 2.7468018531799316 + }, + { + "auxiliary_loss_clip": 0.01099255, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.03454304, + "balance_loss_mlp": 1.02121019, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.4746633682375558, + "language_loss": 0.70719731, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72851133, + "num_input_tokens_seen": 247737335, + "step": 11477, + "time_per_iteration": 2.596014976501465 + }, + { + "auxiliary_loss_clip": 0.01094957, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.0359292, + "balance_loss_mlp": 1.02400613, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.159265830123959, + "language_loss": 0.67932063, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70063406, + "num_input_tokens_seen": 247756680, + "step": 11478, + "time_per_iteration": 4.092036724090576 + }, + { + "auxiliary_loss_clip": 0.0109044, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.03462923, + "balance_loss_mlp": 1.02163243, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 2.082430678268631, + "language_loss": 0.76263797, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78387105, + "num_input_tokens_seen": 247774265, + "step": 11479, + "time_per_iteration": 2.5888290405273438 + }, + { + "auxiliary_loss_clip": 0.01090156, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.03385663, + "balance_loss_mlp": 1.01750267, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 1.5652443359921735, + "language_loss": 0.78577769, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80697328, + "num_input_tokens_seen": 247792395, + "step": 11480, + "time_per_iteration": 2.5613515377044678 + }, + { + "auxiliary_loss_clip": 0.01053161, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.03105485, + "balance_loss_mlp": 1.01643372, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.5423209918459229, + "language_loss": 0.755054, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77587199, + "num_input_tokens_seen": 247811985, + "step": 11481, + "time_per_iteration": 2.8309922218322754 + }, + { + "auxiliary_loss_clip": 0.0106228, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.03291798, + "balance_loss_mlp": 1.01797962, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.98178448154367, + "language_loss": 0.6910376, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71195513, + "num_input_tokens_seen": 247831880, + "step": 11482, + "time_per_iteration": 2.6922030448913574 + }, + { + "auxiliary_loss_clip": 0.01100324, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.0351491, + "balance_loss_mlp": 1.02056503, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 2.670939702219525, + "language_loss": 0.8265782, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84789968, + "num_input_tokens_seen": 247851170, + "step": 11483, + "time_per_iteration": 2.564073324203491 + }, + { + "auxiliary_loss_clip": 0.01101657, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.0350337, + "balance_loss_mlp": 1.02037573, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 1.8606637501862624, + "language_loss": 0.65394676, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67527473, + "num_input_tokens_seen": 247868950, + "step": 11484, + "time_per_iteration": 2.614421844482422 + }, + { + "auxiliary_loss_clip": 0.0106323, + "auxiliary_loss_mlp": 0.01040052, + "balance_loss_clip": 1.0312326, + "balance_loss_mlp": 1.02779126, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.6861282853581376, + "language_loss": 0.73756373, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75859654, + "num_input_tokens_seen": 247889805, + "step": 11485, + "time_per_iteration": 2.757129669189453 + }, + { + "auxiliary_loss_clip": 0.0108965, + "auxiliary_loss_mlp": 0.00749617, + "balance_loss_clip": 1.03613043, + "balance_loss_mlp": 1.00035012, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.7550640381684124, + "language_loss": 0.85008126, + "learning_rate": 9.22906510853017e-07, + "loss": 0.86847395, + "num_input_tokens_seen": 247908585, + "step": 11486, + "time_per_iteration": 2.6205344200134277 + }, + { + "auxiliary_loss_clip": 0.01039349, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.03152454, + "balance_loss_mlp": 1.02166557, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.5847510926158834, + "language_loss": 0.72632432, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74705648, + "num_input_tokens_seen": 247928480, + "step": 11487, + "time_per_iteration": 2.7127959728240967 + }, + { + "auxiliary_loss_clip": 0.01006874, + "auxiliary_loss_mlp": 0.01007914, + "balance_loss_clip": 1.00555563, + "balance_loss_mlp": 1.00700784, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.9434396133140411, + "language_loss": 0.66632801, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68647587, + "num_input_tokens_seen": 247988855, + "step": 11488, + "time_per_iteration": 4.841999769210815 + }, + { + "auxiliary_loss_clip": 0.01078062, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.03520083, + "balance_loss_mlp": 1.02267182, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 2.705221615569821, + "language_loss": 0.75070256, + "learning_rate": 9.219222185664519e-07, + "loss": 0.77183747, + "num_input_tokens_seen": 248007685, + "step": 11489, + "time_per_iteration": 2.704155921936035 + }, + { + "auxiliary_loss_clip": 0.01086981, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.03294897, + "balance_loss_mlp": 1.02244806, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.8692808984092004, + "language_loss": 0.62228036, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64350224, + "num_input_tokens_seen": 248025145, + "step": 11490, + "time_per_iteration": 2.702080249786377 + }, + { + "auxiliary_loss_clip": 0.01079321, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.03568411, + "balance_loss_mlp": 1.01829624, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.7093336719214192, + "language_loss": 0.72732502, + "learning_rate": 9.212662280920937e-07, + "loss": 0.74841559, + "num_input_tokens_seen": 248043750, + "step": 11491, + "time_per_iteration": 2.650416851043701 + }, + { + "auxiliary_loss_clip": 0.01077455, + "auxiliary_loss_mlp": 0.00749577, + "balance_loss_clip": 1.032866, + "balance_loss_mlp": 1.00027955, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.425415798200942, + "language_loss": 0.70275629, + "learning_rate": 9.20938294207235e-07, + "loss": 0.7210266, + "num_input_tokens_seen": 248065765, + "step": 11492, + "time_per_iteration": 2.636108636856079 + }, + { + "auxiliary_loss_clip": 0.01067324, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.03936434, + "balance_loss_mlp": 1.0202899, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.935714561486563, + "language_loss": 0.74758542, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76857841, + "num_input_tokens_seen": 248083810, + "step": 11493, + "time_per_iteration": 2.7542285919189453 + }, + { + "auxiliary_loss_clip": 0.01101377, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.03641582, + "balance_loss_mlp": 1.01935697, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.7227646395898295, + "language_loss": 0.74442196, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76574433, + "num_input_tokens_seen": 248103185, + "step": 11494, + "time_per_iteration": 2.5407230854034424 + }, + { + "auxiliary_loss_clip": 0.01074943, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.03325534, + "balance_loss_mlp": 1.01992619, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.835511075437169, + "language_loss": 0.68419832, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70526552, + "num_input_tokens_seen": 248125665, + "step": 11495, + "time_per_iteration": 2.700951099395752 + }, + { + "auxiliary_loss_clip": 0.01077523, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.03200173, + "balance_loss_mlp": 1.01940632, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 2.157998826764237, + "language_loss": 0.73923409, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76032043, + "num_input_tokens_seen": 248142545, + "step": 11496, + "time_per_iteration": 2.6180362701416016 + }, + { + "auxiliary_loss_clip": 0.01060196, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.02947116, + "balance_loss_mlp": 1.01837695, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.6217988273960713, + "language_loss": 0.79977334, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82067192, + "num_input_tokens_seen": 248160225, + "step": 11497, + "time_per_iteration": 2.6124026775360107 + }, + { + "auxiliary_loss_clip": 0.01061419, + "auxiliary_loss_mlp": 0.0103609, + "balance_loss_clip": 1.0339725, + "balance_loss_mlp": 1.02397239, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.5933939643970323, + "language_loss": 0.80556464, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82653975, + "num_input_tokens_seen": 248180430, + "step": 11498, + "time_per_iteration": 2.666891574859619 + }, + { + "auxiliary_loss_clip": 0.01087154, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.03331792, + "balance_loss_mlp": 1.01909375, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.7464900308756572, + "language_loss": 0.85944128, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88062441, + "num_input_tokens_seen": 248202365, + "step": 11499, + "time_per_iteration": 2.6463751792907715 + }, + { + "auxiliary_loss_clip": 0.01066873, + "auxiliary_loss_mlp": 0.00749457, + "balance_loss_clip": 1.03382564, + "balance_loss_mlp": 1.00028765, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.4956120560333512, + "language_loss": 0.75463557, + "learning_rate": 9.183162972252145e-07, + "loss": 0.7727989, + "num_input_tokens_seen": 248221750, + "step": 11500, + "time_per_iteration": 2.6453986167907715 + }, + { + "auxiliary_loss_clip": 0.01041367, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.03319633, + "balance_loss_mlp": 1.0281148, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.8054783565650578, + "language_loss": 0.76796257, + "learning_rate": 9.179887320509921e-07, + "loss": 0.78879732, + "num_input_tokens_seen": 248239535, + "step": 11501, + "time_per_iteration": 2.7373101711273193 + }, + { + "auxiliary_loss_clip": 0.01084521, + "auxiliary_loss_mlp": 0.01036849, + "balance_loss_clip": 1.03428948, + "balance_loss_mlp": 1.02445102, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.8327420708677227, + "language_loss": 0.73110765, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75232136, + "num_input_tokens_seen": 248259055, + "step": 11502, + "time_per_iteration": 2.6145107746124268 + }, + { + "auxiliary_loss_clip": 0.01030409, + "auxiliary_loss_mlp": 0.01038623, + "balance_loss_clip": 1.0315764, + "balance_loss_mlp": 1.02413869, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 2.4380755753714958, + "language_loss": 0.73336947, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75405979, + "num_input_tokens_seen": 248276765, + "step": 11503, + "time_per_iteration": 2.870699405670166 + }, + { + "auxiliary_loss_clip": 0.01082689, + "auxiliary_loss_mlp": 0.01036893, + "balance_loss_clip": 1.03291988, + "balance_loss_mlp": 1.02421558, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.807877557441313, + "language_loss": 0.77128351, + "learning_rate": 9.170062827578575e-07, + "loss": 0.7924794, + "num_input_tokens_seen": 248295310, + "step": 11504, + "time_per_iteration": 3.0844204425811768 + }, + { + "auxiliary_loss_clip": 0.01050516, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.02879167, + "balance_loss_mlp": 1.02156007, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.7390095030439852, + "language_loss": 0.73776752, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75862151, + "num_input_tokens_seen": 248315230, + "step": 11505, + "time_per_iteration": 4.249408960342407 + }, + { + "auxiliary_loss_clip": 0.01035677, + "auxiliary_loss_mlp": 0.00749486, + "balance_loss_clip": 1.03024399, + "balance_loss_mlp": 1.00024283, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.7680565925735328, + "language_loss": 0.87679195, + "learning_rate": 9.163515218778886e-07, + "loss": 0.8946436, + "num_input_tokens_seen": 248332980, + "step": 11506, + "time_per_iteration": 2.810365915298462 + }, + { + "auxiliary_loss_clip": 0.01075189, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.034657, + "balance_loss_mlp": 1.01732838, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 2.3325209674791085, + "language_loss": 0.70586616, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72690266, + "num_input_tokens_seen": 248352865, + "step": 11507, + "time_per_iteration": 2.6946499347686768 + }, + { + "auxiliary_loss_clip": 0.01074261, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.03355026, + "balance_loss_mlp": 1.02063918, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 1.926294861666224, + "language_loss": 0.76966, + "learning_rate": 9.156969253661538e-07, + "loss": 0.7907269, + "num_input_tokens_seen": 248371125, + "step": 11508, + "time_per_iteration": 2.6442461013793945 + }, + { + "auxiliary_loss_clip": 0.01081638, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03337526, + "balance_loss_mlp": 1.01849008, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 1.5395156354665203, + "language_loss": 0.74878049, + "learning_rate": 9.153696887794027e-07, + "loss": 0.76989353, + "num_input_tokens_seen": 248390455, + "step": 11509, + "time_per_iteration": 2.6511847972869873 + }, + { + "auxiliary_loss_clip": 0.01050367, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.03440511, + "balance_loss_mlp": 1.0211767, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.5436753937525438, + "language_loss": 0.6425004, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66333449, + "num_input_tokens_seen": 248411305, + "step": 11510, + "time_per_iteration": 2.6978728771209717 + }, + { + "auxiliary_loss_clip": 0.01068711, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.03462839, + "balance_loss_mlp": 1.01820505, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 2.0806876152322964, + "language_loss": 0.75208867, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77308774, + "num_input_tokens_seen": 248430190, + "step": 11511, + "time_per_iteration": 2.670330047607422 + }, + { + "auxiliary_loss_clip": 0.01069152, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.03471947, + "balance_loss_mlp": 1.01805985, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.7754160394310505, + "language_loss": 0.62594521, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64692837, + "num_input_tokens_seen": 248450830, + "step": 11512, + "time_per_iteration": 2.6842057704925537 + }, + { + "auxiliary_loss_clip": 0.01068587, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.03287315, + "balance_loss_mlp": 1.01997733, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.9425500665513513, + "language_loss": 0.82887125, + "learning_rate": 9.140611538493666e-07, + "loss": 0.84987265, + "num_input_tokens_seen": 248468585, + "step": 11513, + "time_per_iteration": 4.294336318969727 + }, + { + "auxiliary_loss_clip": 0.01035693, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.03177273, + "balance_loss_mlp": 1.01547658, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.5080393884139978, + "language_loss": 0.78204226, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80265832, + "num_input_tokens_seen": 248490535, + "step": 11514, + "time_per_iteration": 2.731484889984131 + }, + { + "auxiliary_loss_clip": 0.01051803, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.03152001, + "balance_loss_mlp": 1.01966679, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 1.8357615654695936, + "language_loss": 0.74884236, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76967406, + "num_input_tokens_seen": 248508575, + "step": 11515, + "time_per_iteration": 2.761209726333618 + }, + { + "auxiliary_loss_clip": 0.01053533, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.03420889, + "balance_loss_mlp": 1.01950264, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 2.0559969592308707, + "language_loss": 0.53817719, + "learning_rate": 9.130801849869694e-07, + "loss": 0.5590198, + "num_input_tokens_seen": 248527025, + "step": 11516, + "time_per_iteration": 2.754747152328491 + }, + { + "auxiliary_loss_clip": 0.01084204, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.03328013, + "balance_loss_mlp": 1.01890874, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.6434045745443775, + "language_loss": 0.73128146, + "learning_rate": 9.127532777818557e-07, + "loss": 0.7524327, + "num_input_tokens_seen": 248544275, + "step": 11517, + "time_per_iteration": 4.173719882965088 + }, + { + "auxiliary_loss_clip": 0.01102084, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.03562915, + "balance_loss_mlp": 1.02371705, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.6259910193095046, + "language_loss": 0.76495945, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78633678, + "num_input_tokens_seen": 248561870, + "step": 11518, + "time_per_iteration": 2.630699396133423 + }, + { + "auxiliary_loss_clip": 0.01094633, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.03640854, + "balance_loss_mlp": 1.01991987, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.5192995811972814, + "language_loss": 0.64345187, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66472882, + "num_input_tokens_seen": 248588190, + "step": 11519, + "time_per_iteration": 2.775761365890503 + }, + { + "auxiliary_loss_clip": 0.01065643, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.03284097, + "balance_loss_mlp": 1.02245951, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 15.65359289216926, + "language_loss": 0.62294912, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64394873, + "num_input_tokens_seen": 248606460, + "step": 11520, + "time_per_iteration": 2.6027915477752686 + }, + { + "auxiliary_loss_clip": 0.01061125, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.03227913, + "balance_loss_mlp": 1.02743673, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 1.8199327506251328, + "language_loss": 0.77625489, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79727846, + "num_input_tokens_seen": 248623715, + "step": 11521, + "time_per_iteration": 2.679323673248291 + }, + { + "auxiliary_loss_clip": 0.01085894, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.03268719, + "balance_loss_mlp": 1.02487791, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.8309587146954345, + "language_loss": 0.81738073, + "learning_rate": 9.111193604317304e-07, + "loss": 0.8386333, + "num_input_tokens_seen": 248640575, + "step": 11522, + "time_per_iteration": 2.5627126693725586 + }, + { + "auxiliary_loss_clip": 0.01086984, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.03778267, + "balance_loss_mlp": 1.02273536, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.471671534927225, + "language_loss": 0.76795888, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78916997, + "num_input_tokens_seen": 248663535, + "step": 11523, + "time_per_iteration": 2.6538772583007812 + }, + { + "auxiliary_loss_clip": 0.01067603, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.03284335, + "balance_loss_mlp": 1.02237523, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 1.7116280522410414, + "language_loss": 0.68661815, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70762861, + "num_input_tokens_seen": 248681125, + "step": 11524, + "time_per_iteration": 2.6526312828063965 + }, + { + "auxiliary_loss_clip": 0.01070217, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.03524125, + "balance_loss_mlp": 1.01866031, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.779141911967387, + "language_loss": 0.64279187, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66380584, + "num_input_tokens_seen": 248700555, + "step": 11525, + "time_per_iteration": 2.689544916152954 + }, + { + "auxiliary_loss_clip": 0.01060972, + "auxiliary_loss_mlp": 0.01039428, + "balance_loss_clip": 1.03511548, + "balance_loss_mlp": 1.02704763, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 2.300003560960151, + "language_loss": 0.70723784, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72824186, + "num_input_tokens_seen": 248716095, + "step": 11526, + "time_per_iteration": 2.6266515254974365 + }, + { + "auxiliary_loss_clip": 0.01075558, + "auxiliary_loss_mlp": 0.01028847, + "balance_loss_clip": 1.0319376, + "balance_loss_mlp": 1.01764154, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.5388129374238093, + "language_loss": 0.76062638, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78167045, + "num_input_tokens_seen": 248735330, + "step": 11527, + "time_per_iteration": 2.6773228645324707 + }, + { + "auxiliary_loss_clip": 0.0106846, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.02998364, + "balance_loss_mlp": 1.02059126, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 2.3927120246974947, + "language_loss": 0.7952739, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81628633, + "num_input_tokens_seen": 248754530, + "step": 11528, + "time_per_iteration": 4.206785440444946 + }, + { + "auxiliary_loss_clip": 0.0108695, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.03474236, + "balance_loss_mlp": 1.02118993, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.4414051535172356, + "language_loss": 0.76010227, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78129864, + "num_input_tokens_seen": 248775825, + "step": 11529, + "time_per_iteration": 2.676687479019165 + }, + { + "auxiliary_loss_clip": 0.01097976, + "auxiliary_loss_mlp": 0.0074937, + "balance_loss_clip": 1.03455591, + "balance_loss_mlp": 1.00028276, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.6059956587068618, + "language_loss": 0.72488892, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74336243, + "num_input_tokens_seen": 248796180, + "step": 11530, + "time_per_iteration": 2.638521194458008 + }, + { + "auxiliary_loss_clip": 0.01084795, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.0361011, + "balance_loss_mlp": 1.01924908, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 2.3032916640111165, + "language_loss": 0.78967369, + "learning_rate": 9.081809115407513e-07, + "loss": 0.81085598, + "num_input_tokens_seen": 248814735, + "step": 11531, + "time_per_iteration": 2.622065305709839 + }, + { + "auxiliary_loss_clip": 0.01084695, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.03431034, + "balance_loss_mlp": 1.02117038, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.3527857988534746, + "language_loss": 0.69304669, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71421158, + "num_input_tokens_seen": 248839140, + "step": 11532, + "time_per_iteration": 2.819615364074707 + }, + { + "auxiliary_loss_clip": 0.01073996, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.03430843, + "balance_loss_mlp": 1.01759672, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.4028939688003286, + "language_loss": 0.67105442, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69209635, + "num_input_tokens_seen": 248858300, + "step": 11533, + "time_per_iteration": 2.6151349544525146 + }, + { + "auxiliary_loss_clip": 0.01076339, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.03612638, + "balance_loss_mlp": 1.02303791, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 2.6224194646220185, + "language_loss": 0.58923155, + "learning_rate": 9.072021733655007e-07, + "loss": 0.61035013, + "num_input_tokens_seen": 248876310, + "step": 11534, + "time_per_iteration": 2.6404011249542236 + }, + { + "auxiliary_loss_clip": 0.0106333, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.03155804, + "balance_loss_mlp": 1.01929665, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 1.9781059892090536, + "language_loss": 0.70991063, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73085988, + "num_input_tokens_seen": 248895650, + "step": 11535, + "time_per_iteration": 2.6486616134643555 + }, + { + "auxiliary_loss_clip": 0.0100463, + "auxiliary_loss_mlp": 0.01000682, + "balance_loss_clip": 1.00385976, + "balance_loss_mlp": 0.99956149, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7111915858844847, + "language_loss": 0.59057897, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61063212, + "num_input_tokens_seen": 248963920, + "step": 11536, + "time_per_iteration": 3.2749710083007812 + }, + { + "auxiliary_loss_clip": 0.01088329, + "auxiliary_loss_mlp": 0.00749373, + "balance_loss_clip": 1.03578854, + "balance_loss_mlp": 1.00020993, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 2.1029923449441745, + "language_loss": 0.72587103, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74424809, + "num_input_tokens_seen": 248983380, + "step": 11537, + "time_per_iteration": 2.713718891143799 + }, + { + "auxiliary_loss_clip": 0.01015363, + "auxiliary_loss_mlp": 0.00746705, + "balance_loss_clip": 1.00500703, + "balance_loss_mlp": 0.99994451, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7458782089895492, + "language_loss": 0.55557275, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57319343, + "num_input_tokens_seen": 249044680, + "step": 11538, + "time_per_iteration": 3.1057469844818115 + }, + { + "auxiliary_loss_clip": 0.01083395, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.0329318, + "balance_loss_mlp": 1.02126479, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.556455094065347, + "language_loss": 0.77821207, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79936171, + "num_input_tokens_seen": 249061060, + "step": 11539, + "time_per_iteration": 2.67082142829895 + }, + { + "auxiliary_loss_clip": 0.01076531, + "auxiliary_loss_mlp": 0.01028678, + "balance_loss_clip": 1.03357089, + "balance_loss_mlp": 1.01759124, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.7613197130370375, + "language_loss": 0.63970971, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66076183, + "num_input_tokens_seen": 249081430, + "step": 11540, + "time_per_iteration": 2.747997999191284 + }, + { + "auxiliary_loss_clip": 0.01051365, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.03281343, + "balance_loss_mlp": 1.02202058, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 2.016806196536353, + "language_loss": 0.86686277, + "learning_rate": 9.049199018987437e-07, + "loss": 0.88771045, + "num_input_tokens_seen": 249103020, + "step": 11541, + "time_per_iteration": 2.6799306869506836 + }, + { + "auxiliary_loss_clip": 0.01100525, + "auxiliary_loss_mlp": 0.00749481, + "balance_loss_clip": 1.03562427, + "balance_loss_mlp": 1.00014889, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.7140324692482907, + "language_loss": 0.84376341, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86226344, + "num_input_tokens_seen": 249120810, + "step": 11542, + "time_per_iteration": 2.559913396835327 + }, + { + "auxiliary_loss_clip": 0.01090321, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.03594553, + "balance_loss_mlp": 1.01592708, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.8068924107796995, + "language_loss": 0.74906319, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77024847, + "num_input_tokens_seen": 249138050, + "step": 11543, + "time_per_iteration": 2.6062748432159424 + }, + { + "auxiliary_loss_clip": 0.01077105, + "auxiliary_loss_mlp": 0.01032548, + "balance_loss_clip": 1.0336175, + "balance_loss_mlp": 1.02144349, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 11.470278625952163, + "language_loss": 0.7608943, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78199077, + "num_input_tokens_seen": 249155570, + "step": 11544, + "time_per_iteration": 4.128098487854004 + }, + { + "auxiliary_loss_clip": 0.01044886, + "auxiliary_loss_mlp": 0.01038722, + "balance_loss_clip": 1.03181899, + "balance_loss_mlp": 1.02583587, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.9323686485875915, + "language_loss": 0.71532756, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73616362, + "num_input_tokens_seen": 249172960, + "step": 11545, + "time_per_iteration": 2.6303277015686035 + }, + { + "auxiliary_loss_clip": 0.01085379, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.03424776, + "balance_loss_mlp": 1.01458073, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.6562883266600512, + "language_loss": 0.79339015, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81449223, + "num_input_tokens_seen": 249192450, + "step": 11546, + "time_per_iteration": 2.644789695739746 + }, + { + "auxiliary_loss_clip": 0.01063829, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.03239155, + "balance_loss_mlp": 1.01922286, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 1.3552966112708125, + "language_loss": 0.78721631, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80816531, + "num_input_tokens_seen": 249214320, + "step": 11547, + "time_per_iteration": 2.712463617324829 + }, + { + "auxiliary_loss_clip": 0.01077186, + "auxiliary_loss_mlp": 0.00749502, + "balance_loss_clip": 1.03579569, + "balance_loss_mlp": 1.00025773, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.0653398971629113, + "language_loss": 0.80845308, + "learning_rate": 9.026396651834834e-07, + "loss": 0.82671988, + "num_input_tokens_seen": 249230925, + "step": 11548, + "time_per_iteration": 2.673790216445923 + }, + { + "auxiliary_loss_clip": 0.01024353, + "auxiliary_loss_mlp": 0.00746696, + "balance_loss_clip": 1.00398123, + "balance_loss_mlp": 0.99986804, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.7326890411295872, + "language_loss": 0.53734124, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55505168, + "num_input_tokens_seen": 249293975, + "step": 11549, + "time_per_iteration": 3.1422455310821533 + }, + { + "auxiliary_loss_clip": 0.01078672, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.03065753, + "balance_loss_mlp": 1.02067327, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.4811303407656846, + "language_loss": 0.73894429, + "learning_rate": 9.01988543302e-07, + "loss": 0.76005977, + "num_input_tokens_seen": 249315285, + "step": 11550, + "time_per_iteration": 2.6588385105133057 + }, + { + "auxiliary_loss_clip": 0.01077233, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.03669477, + "balance_loss_mlp": 1.02256739, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.816496640765636, + "language_loss": 0.74588341, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76699656, + "num_input_tokens_seen": 249333505, + "step": 11551, + "time_per_iteration": 2.6429402828216553 + }, + { + "auxiliary_loss_clip": 0.01101097, + "auxiliary_loss_mlp": 0.01036454, + "balance_loss_clip": 1.03518295, + "balance_loss_mlp": 1.02450287, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 2.7290274622784336, + "language_loss": 0.84489906, + "learning_rate": 9.01337587967333e-07, + "loss": 0.8662746, + "num_input_tokens_seen": 249354180, + "step": 11552, + "time_per_iteration": 2.573505401611328 + }, + { + "auxiliary_loss_clip": 0.0109978, + "auxiliary_loss_mlp": 0.01032366, + "balance_loss_clip": 1.03520322, + "balance_loss_mlp": 1.02086258, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.8910424367990633, + "language_loss": 0.6736908, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69501227, + "num_input_tokens_seen": 249377035, + "step": 11553, + "time_per_iteration": 4.186346530914307 + }, + { + "auxiliary_loss_clip": 0.01084158, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.03683066, + "balance_loss_mlp": 1.01824784, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.6242320071903886, + "language_loss": 0.79565442, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81679833, + "num_input_tokens_seen": 249396155, + "step": 11554, + "time_per_iteration": 2.618372678756714 + }, + { + "auxiliary_loss_clip": 0.01090098, + "auxiliary_loss_mlp": 0.01026388, + "balance_loss_clip": 1.03364897, + "balance_loss_mlp": 1.01475286, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 1.810388057198008, + "language_loss": 0.72606629, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74723113, + "num_input_tokens_seen": 249414555, + "step": 11555, + "time_per_iteration": 2.6216933727264404 + }, + { + "auxiliary_loss_clip": 0.0106303, + "auxiliary_loss_mlp": 0.01028021, + "balance_loss_clip": 1.0320065, + "balance_loss_mlp": 1.01720893, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.899261430502667, + "language_loss": 0.77816629, + "learning_rate": 9.000361773333705e-07, + "loss": 0.79907686, + "num_input_tokens_seen": 249433570, + "step": 11556, + "time_per_iteration": 2.664748430252075 + }, + { + "auxiliary_loss_clip": 0.01038526, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.03171229, + "balance_loss_mlp": 1.02420187, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.5650899420612294, + "language_loss": 0.60044265, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62118411, + "num_input_tokens_seen": 249453735, + "step": 11557, + "time_per_iteration": 4.18264627456665 + }, + { + "auxiliary_loss_clip": 0.01075653, + "auxiliary_loss_mlp": 0.0103581, + "balance_loss_clip": 1.03906417, + "balance_loss_mlp": 1.02448547, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 2.395812635436782, + "language_loss": 0.85311025, + "learning_rate": 8.993857222314752e-07, + "loss": 0.8742249, + "num_input_tokens_seen": 249470805, + "step": 11558, + "time_per_iteration": 2.7786386013031006 + }, + { + "auxiliary_loss_clip": 0.01088487, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.03315246, + "balance_loss_mlp": 1.019418, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.6249871216125147, + "language_loss": 0.70474863, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72595298, + "num_input_tokens_seen": 249491150, + "step": 11559, + "time_per_iteration": 2.5967416763305664 + }, + { + "auxiliary_loss_clip": 0.01063597, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.03243303, + "balance_loss_mlp": 1.01920986, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.511690978440505, + "language_loss": 0.78162581, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80256069, + "num_input_tokens_seen": 249511560, + "step": 11560, + "time_per_iteration": 2.7462501525878906 + }, + { + "auxiliary_loss_clip": 0.01075758, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.03551292, + "balance_loss_mlp": 1.02005064, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.5116761122547477, + "language_loss": 0.76723075, + "learning_rate": 8.9841035262498e-07, + "loss": 0.78829652, + "num_input_tokens_seen": 249531910, + "step": 11561, + "time_per_iteration": 2.6671085357666016 + }, + { + "auxiliary_loss_clip": 0.01097448, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.03356862, + "balance_loss_mlp": 1.01957083, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 2.1690623355223746, + "language_loss": 0.78457212, + "learning_rate": 8.980853129511577e-07, + "loss": 0.8058635, + "num_input_tokens_seen": 249550300, + "step": 11562, + "time_per_iteration": 2.623645305633545 + }, + { + "auxiliary_loss_clip": 0.01090381, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.03384173, + "balance_loss_mlp": 1.02104115, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 4.791762055265461, + "language_loss": 0.69261575, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71384776, + "num_input_tokens_seen": 249567740, + "step": 11563, + "time_per_iteration": 2.6240830421447754 + }, + { + "auxiliary_loss_clip": 0.01077621, + "auxiliary_loss_mlp": 0.01024545, + "balance_loss_clip": 1.03311205, + "balance_loss_mlp": 1.01393509, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.1645089827415585, + "language_loss": 0.73454797, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75556958, + "num_input_tokens_seen": 249582700, + "step": 11564, + "time_per_iteration": 2.6091654300689697 + }, + { + "auxiliary_loss_clip": 0.01070378, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.03818882, + "balance_loss_mlp": 1.02282262, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 2.4421823217720577, + "language_loss": 0.72278172, + "learning_rate": 8.971104446872785e-07, + "loss": 0.74385059, + "num_input_tokens_seen": 249602920, + "step": 11565, + "time_per_iteration": 2.7352845668792725 + }, + { + "auxiliary_loss_clip": 0.01008869, + "auxiliary_loss_mlp": 0.00998041, + "balance_loss_clip": 1.00690699, + "balance_loss_mlp": 0.99711704, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9412119692330463, + "language_loss": 0.58441985, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60448891, + "num_input_tokens_seen": 249660400, + "step": 11566, + "time_per_iteration": 3.0286543369293213 + }, + { + "auxiliary_loss_clip": 0.01067893, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.0348767, + "balance_loss_mlp": 1.01977229, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 2.0604739336333155, + "language_loss": 0.74015093, + "learning_rate": 8.964607415992338e-07, + "loss": 0.7611481, + "num_input_tokens_seen": 249679335, + "step": 11567, + "time_per_iteration": 4.202380657196045 + }, + { + "auxiliary_loss_clip": 0.01066238, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.03154135, + "balance_loss_mlp": 1.02224314, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.2517155189359583, + "language_loss": 0.76808792, + "learning_rate": 8.961359528185313e-07, + "loss": 0.78909469, + "num_input_tokens_seen": 249701805, + "step": 11568, + "time_per_iteration": 2.735161304473877 + }, + { + "auxiliary_loss_clip": 0.01082729, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.03611124, + "balance_loss_mlp": 1.02167296, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 1.7602225618766985, + "language_loss": 0.72279572, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74394453, + "num_input_tokens_seen": 249720550, + "step": 11569, + "time_per_iteration": 2.586193799972534 + }, + { + "auxiliary_loss_clip": 0.01078805, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.03801072, + "balance_loss_mlp": 1.01488471, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.519732445768985, + "language_loss": 0.77652359, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79757786, + "num_input_tokens_seen": 249740325, + "step": 11570, + "time_per_iteration": 2.732259750366211 + }, + { + "auxiliary_loss_clip": 0.01088554, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.03308821, + "balance_loss_mlp": 1.02007747, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 1.815320368873409, + "language_loss": 0.74719661, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76839697, + "num_input_tokens_seen": 249760570, + "step": 11571, + "time_per_iteration": 2.6209659576416016 + }, + { + "auxiliary_loss_clip": 0.01079879, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.03202796, + "balance_loss_mlp": 1.01958323, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.8679967014216607, + "language_loss": 0.74736369, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76847404, + "num_input_tokens_seen": 249778290, + "step": 11572, + "time_per_iteration": 2.633763551712036 + }, + { + "auxiliary_loss_clip": 0.01075342, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.03094244, + "balance_loss_mlp": 1.01771224, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.6361614735430532, + "language_loss": 0.69820535, + "learning_rate": 8.94512637040814e-07, + "loss": 0.71924883, + "num_input_tokens_seen": 249800925, + "step": 11573, + "time_per_iteration": 2.8339052200317383 + }, + { + "auxiliary_loss_clip": 0.01079251, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.0355587, + "balance_loss_mlp": 1.01882577, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.6473314871532763, + "language_loss": 0.74351549, + "learning_rate": 8.941880995966095e-07, + "loss": 0.76461411, + "num_input_tokens_seen": 249820500, + "step": 11574, + "time_per_iteration": 2.688750743865967 + }, + { + "auxiliary_loss_clip": 0.01061295, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.02997303, + "balance_loss_mlp": 1.01874936, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.8312466181743445, + "language_loss": 0.74325013, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76416492, + "num_input_tokens_seen": 249839845, + "step": 11575, + "time_per_iteration": 2.6738383769989014 + }, + { + "auxiliary_loss_clip": 0.01090435, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.03447247, + "balance_loss_mlp": 1.0161171, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 5.658308644612869, + "language_loss": 0.78964555, + "learning_rate": 8.935391505179966e-07, + "loss": 0.81082457, + "num_input_tokens_seen": 249857400, + "step": 11576, + "time_per_iteration": 2.610494613647461 + }, + { + "auxiliary_loss_clip": 0.01055403, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.03170204, + "balance_loss_mlp": 1.02064157, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.9354480125183193, + "language_loss": 0.56213558, + "learning_rate": 8.932147389081985e-07, + "loss": 0.58300221, + "num_input_tokens_seen": 249871645, + "step": 11577, + "time_per_iteration": 2.6219913959503174 + }, + { + "auxiliary_loss_clip": 0.01013646, + "auxiliary_loss_mlp": 0.01024272, + "balance_loss_clip": 1.0288378, + "balance_loss_mlp": 1.013955, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.3271913903927892, + "language_loss": 0.76685411, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78723323, + "num_input_tokens_seen": 249894215, + "step": 11578, + "time_per_iteration": 2.900733709335327 + }, + { + "auxiliary_loss_clip": 0.01062383, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.03175807, + "balance_loss_mlp": 1.01905203, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 2.3313031683718872, + "language_loss": 0.79866397, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81958818, + "num_input_tokens_seen": 249912850, + "step": 11579, + "time_per_iteration": 2.6434872150421143 + }, + { + "auxiliary_loss_clip": 0.01056474, + "auxiliary_loss_mlp": 0.01026585, + "balance_loss_clip": 1.02822745, + "balance_loss_mlp": 1.01537335, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 2.1513124893767492, + "language_loss": 0.72614026, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74697077, + "num_input_tokens_seen": 249932650, + "step": 11580, + "time_per_iteration": 2.6538803577423096 + }, + { + "auxiliary_loss_clip": 0.01083533, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.03673768, + "balance_loss_mlp": 1.01748776, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 2.418189381862664, + "language_loss": 0.6600498, + "learning_rate": 8.919175122860787e-07, + "loss": 0.68117809, + "num_input_tokens_seen": 249951205, + "step": 11581, + "time_per_iteration": 2.670806407928467 + }, + { + "auxiliary_loss_clip": 0.01100323, + "auxiliary_loss_mlp": 0.01027617, + "balance_loss_clip": 1.03501081, + "balance_loss_mlp": 1.01682878, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.7977550971899747, + "language_loss": 0.76534563, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78662509, + "num_input_tokens_seen": 249967045, + "step": 11582, + "time_per_iteration": 2.532113790512085 + }, + { + "auxiliary_loss_clip": 0.01071836, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.03084469, + "balance_loss_mlp": 1.01743817, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.8616975251377488, + "language_loss": 0.69858694, + "learning_rate": 8.91269151037425e-07, + "loss": 0.71958399, + "num_input_tokens_seen": 249984565, + "step": 11583, + "time_per_iteration": 2.6024725437164307 + }, + { + "auxiliary_loss_clip": 0.0107462, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.0403527, + "balance_loss_mlp": 1.02124858, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 1.8031616269673334, + "language_loss": 0.82278621, + "learning_rate": 8.909450334717301e-07, + "loss": 0.8438623, + "num_input_tokens_seen": 250004235, + "step": 11584, + "time_per_iteration": 4.227784156799316 + }, + { + "auxiliary_loss_clip": 0.01047345, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.03856814, + "balance_loss_mlp": 1.02154779, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.377122682263954, + "language_loss": 0.79559922, + "learning_rate": 8.906209579615107e-07, + "loss": 0.81641793, + "num_input_tokens_seen": 250017645, + "step": 11585, + "time_per_iteration": 2.7744667530059814 + }, + { + "auxiliary_loss_clip": 0.01095417, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.03324556, + "balance_loss_mlp": 1.01826966, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.642412250530097, + "language_loss": 0.77615738, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79739916, + "num_input_tokens_seen": 250037640, + "step": 11586, + "time_per_iteration": 2.5775420665740967 + }, + { + "auxiliary_loss_clip": 0.01083324, + "auxiliary_loss_mlp": 0.01027829, + "balance_loss_clip": 1.03253245, + "balance_loss_mlp": 1.01761293, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.8414583063944607, + "language_loss": 0.78809965, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80921113, + "num_input_tokens_seen": 250056490, + "step": 11587, + "time_per_iteration": 2.5724542140960693 + }, + { + "auxiliary_loss_clip": 0.01074673, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.0340085, + "balance_loss_mlp": 1.02099335, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 2.0376623647763865, + "language_loss": 0.72850156, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74956959, + "num_input_tokens_seen": 250074285, + "step": 11588, + "time_per_iteration": 2.6093900203704834 + }, + { + "auxiliary_loss_clip": 0.01072361, + "auxiliary_loss_mlp": 0.0102581, + "balance_loss_clip": 1.03196025, + "balance_loss_mlp": 1.01554585, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.7860919004047955, + "language_loss": 0.75108016, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77206194, + "num_input_tokens_seen": 250093350, + "step": 11589, + "time_per_iteration": 2.60819935798645 + }, + { + "auxiliary_loss_clip": 0.01077765, + "auxiliary_loss_mlp": 0.01032116, + "balance_loss_clip": 1.03392828, + "balance_loss_mlp": 1.02128601, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.8256499177934504, + "language_loss": 0.63966179, + "learning_rate": 8.890012116726012e-07, + "loss": 0.66076064, + "num_input_tokens_seen": 250114170, + "step": 11590, + "time_per_iteration": 2.690410614013672 + }, + { + "auxiliary_loss_clip": 0.00989395, + "auxiliary_loss_mlp": 0.01001004, + "balance_loss_clip": 1.01434577, + "balance_loss_mlp": 0.99977583, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7583223271912399, + "language_loss": 0.61264378, + "learning_rate": 8.88677388753248e-07, + "loss": 0.6325478, + "num_input_tokens_seen": 250178250, + "step": 11591, + "time_per_iteration": 3.42181658744812 + }, + { + "auxiliary_loss_clip": 0.01058177, + "auxiliary_loss_mlp": 0.0074935, + "balance_loss_clip": 1.04276001, + "balance_loss_mlp": 1.0001905, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 1.5684865337207197, + "language_loss": 0.69224405, + "learning_rate": 8.883536079753582e-07, + "loss": 0.71031934, + "num_input_tokens_seen": 250198420, + "step": 11592, + "time_per_iteration": 3.045463800430298 + }, + { + "auxiliary_loss_clip": 0.01059338, + "auxiliary_loss_mlp": 0.01025895, + "balance_loss_clip": 1.0314914, + "balance_loss_mlp": 1.01511872, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.6027461934641738, + "language_loss": 0.62611485, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64696717, + "num_input_tokens_seen": 250220650, + "step": 11593, + "time_per_iteration": 4.239361047744751 + }, + { + "auxiliary_loss_clip": 0.01073238, + "auxiliary_loss_mlp": 0.01024822, + "balance_loss_clip": 1.03330469, + "balance_loss_mlp": 1.01462936, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 2.335273635029342, + "language_loss": 0.54184258, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56282318, + "num_input_tokens_seen": 250241750, + "step": 11594, + "time_per_iteration": 2.689987897872925 + }, + { + "auxiliary_loss_clip": 0.01087202, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.03379703, + "balance_loss_mlp": 1.02086866, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 3.3069760331739935, + "language_loss": 0.7657972, + "learning_rate": 8.87382518613248e-07, + "loss": 0.78698528, + "num_input_tokens_seen": 250259445, + "step": 11595, + "time_per_iteration": 2.5949392318725586 + }, + { + "auxiliary_loss_clip": 0.01080288, + "auxiliary_loss_mlp": 0.00749484, + "balance_loss_clip": 1.03666365, + "balance_loss_mlp": 1.00020468, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.1552332500257436, + "language_loss": 0.71788412, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73618186, + "num_input_tokens_seen": 250275640, + "step": 11596, + "time_per_iteration": 2.618590831756592 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.03789377, + "balance_loss_mlp": 1.02091026, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.8280697533390018, + "language_loss": 0.76240408, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78375614, + "num_input_tokens_seen": 250296435, + "step": 11597, + "time_per_iteration": 4.035759449005127 + }, + { + "auxiliary_loss_clip": 0.01087114, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.03327131, + "balance_loss_mlp": 1.02182126, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.8012014344445266, + "language_loss": 0.74745941, + "learning_rate": 8.864118089662267e-07, + "loss": 0.7686584, + "num_input_tokens_seen": 250314035, + "step": 11598, + "time_per_iteration": 2.559265613555908 + }, + { + "auxiliary_loss_clip": 0.01082914, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.01948881, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 2.0179126097850837, + "language_loss": 0.89429104, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91543382, + "num_input_tokens_seen": 250332995, + "step": 11599, + "time_per_iteration": 2.664210319519043 + }, + { + "auxiliary_loss_clip": 0.01096942, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.03818798, + "balance_loss_mlp": 1.02496243, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.1809204264489974, + "language_loss": 0.69773769, + "learning_rate": 8.85764880317974e-07, + "loss": 0.71908247, + "num_input_tokens_seen": 250352120, + "step": 11600, + "time_per_iteration": 2.639413356781006 + }, + { + "auxiliary_loss_clip": 0.01057244, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.03025115, + "balance_loss_mlp": 1.02428746, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.7162996563369584, + "language_loss": 0.7666105, + "learning_rate": 8.854414793655771e-07, + "loss": 0.78753835, + "num_input_tokens_seen": 250371705, + "step": 11601, + "time_per_iteration": 2.673325538635254 + }, + { + "auxiliary_loss_clip": 0.01082566, + "auxiliary_loss_mlp": 0.00749164, + "balance_loss_clip": 1.03296757, + "balance_loss_mlp": 1.00018835, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.9036878776111466, + "language_loss": 0.72583145, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74414873, + "num_input_tokens_seen": 250390485, + "step": 11602, + "time_per_iteration": 2.5918421745300293 + }, + { + "auxiliary_loss_clip": 0.01078603, + "auxiliary_loss_mlp": 0.00749223, + "balance_loss_clip": 1.03438878, + "balance_loss_mlp": 1.00018764, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.083410235835613, + "language_loss": 0.76195341, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78023171, + "num_input_tokens_seen": 250407020, + "step": 11603, + "time_per_iteration": 2.6372272968292236 + }, + { + "auxiliary_loss_clip": 0.01049781, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.03059542, + "balance_loss_mlp": 1.01967311, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 1.4569533793259777, + "language_loss": 0.61764514, + "learning_rate": 8.844715301424557e-07, + "loss": 0.63844895, + "num_input_tokens_seen": 250425880, + "step": 11604, + "time_per_iteration": 2.660684823989868 + }, + { + "auxiliary_loss_clip": 0.01082187, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.03410161, + "balance_loss_mlp": 1.02030444, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.6682066346987554, + "language_loss": 0.81658363, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83772993, + "num_input_tokens_seen": 250442925, + "step": 11605, + "time_per_iteration": 2.613247871398926 + }, + { + "auxiliary_loss_clip": 0.01087368, + "auxiliary_loss_mlp": 0.01034171, + "balance_loss_clip": 1.03303123, + "balance_loss_mlp": 1.02309072, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 20.765601435385904, + "language_loss": 0.7047655, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72598088, + "num_input_tokens_seen": 250461220, + "step": 11606, + "time_per_iteration": 2.612873077392578 + }, + { + "auxiliary_loss_clip": 0.01079844, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.03503287, + "balance_loss_mlp": 1.01949012, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 1.9059073169069056, + "language_loss": 0.82443136, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84553641, + "num_input_tokens_seen": 250480975, + "step": 11607, + "time_per_iteration": 2.7244553565979004 + }, + { + "auxiliary_loss_clip": 0.01080159, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.03563166, + "balance_loss_mlp": 1.01895308, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 2.1578427605196584, + "language_loss": 0.78864586, + "learning_rate": 8.831788567821265e-07, + "loss": 0.80975682, + "num_input_tokens_seen": 250497980, + "step": 11608, + "time_per_iteration": 4.1101393699646 + }, + { + "auxiliary_loss_clip": 0.01079007, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.03281558, + "balance_loss_mlp": 1.01942933, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 1.7998850355232925, + "language_loss": 0.8965016, + "learning_rate": 8.828557942863357e-07, + "loss": 0.91759872, + "num_input_tokens_seen": 250511910, + "step": 11609, + "time_per_iteration": 2.5890073776245117 + }, + { + "auxiliary_loss_clip": 0.01067433, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.0341363, + "balance_loss_mlp": 1.01949263, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 2.1202368063095864, + "language_loss": 0.63843906, + "learning_rate": 8.82532774152765e-07, + "loss": 0.65942824, + "num_input_tokens_seen": 250531090, + "step": 11610, + "time_per_iteration": 2.630181074142456 + }, + { + "auxiliary_loss_clip": 0.01068103, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.03444386, + "balance_loss_mlp": 1.02254725, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 2.394417275879046, + "language_loss": 0.8444671, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86547852, + "num_input_tokens_seen": 250551565, + "step": 11611, + "time_per_iteration": 2.7403931617736816 + }, + { + "auxiliary_loss_clip": 0.01092621, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.03589487, + "balance_loss_mlp": 1.02088916, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 2.221779976860103, + "language_loss": 0.71163505, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73288321, + "num_input_tokens_seen": 250569625, + "step": 11612, + "time_per_iteration": 2.5440638065338135 + }, + { + "auxiliary_loss_clip": 0.01078386, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.03162491, + "balance_loss_mlp": 1.02017105, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 2.1009254345278197, + "language_loss": 0.80960602, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83071047, + "num_input_tokens_seen": 250586960, + "step": 11613, + "time_per_iteration": 2.550565004348755 + }, + { + "auxiliary_loss_clip": 0.01086699, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.03321707, + "balance_loss_mlp": 1.02298367, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 2.296328824875702, + "language_loss": 0.7506175, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77182233, + "num_input_tokens_seen": 250605080, + "step": 11614, + "time_per_iteration": 2.6036531925201416 + }, + { + "auxiliary_loss_clip": 0.01041048, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.03748298, + "balance_loss_mlp": 1.01904035, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.3313101834384544, + "language_loss": 0.76775694, + "learning_rate": 8.809183093468746e-07, + "loss": 0.78847569, + "num_input_tokens_seen": 250623965, + "step": 11615, + "time_per_iteration": 2.817256450653076 + }, + { + "auxiliary_loss_clip": 0.01071571, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.03324032, + "balance_loss_mlp": 1.01953602, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.247338234389012, + "language_loss": 0.73141694, + "learning_rate": 8.80595543643797e-07, + "loss": 0.75243604, + "num_input_tokens_seen": 250640675, + "step": 11616, + "time_per_iteration": 2.7410829067230225 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.03565502, + "balance_loss_mlp": 1.02643394, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.8307051541423287, + "language_loss": 0.84329307, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86465824, + "num_input_tokens_seen": 250660295, + "step": 11617, + "time_per_iteration": 2.5678551197052 + }, + { + "auxiliary_loss_clip": 0.01057375, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_clip": 1.03160322, + "balance_loss_mlp": 1.03052855, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.4735444517442136, + "language_loss": 0.59879845, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61980021, + "num_input_tokens_seen": 250678155, + "step": 11618, + "time_per_iteration": 2.611726760864258 + }, + { + "auxiliary_loss_clip": 0.01077587, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03517389, + "balance_loss_mlp": 1.01998889, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 1.7348978530560413, + "language_loss": 0.82788479, + "learning_rate": 8.796275012710903e-07, + "loss": 0.84897256, + "num_input_tokens_seen": 250697230, + "step": 11619, + "time_per_iteration": 2.706381320953369 + }, + { + "auxiliary_loss_clip": 0.01082587, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.03193736, + "balance_loss_mlp": 1.0196172, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 5.491280735308203, + "language_loss": 0.67648423, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69760561, + "num_input_tokens_seen": 250719865, + "step": 11620, + "time_per_iteration": 2.785764217376709 + }, + { + "auxiliary_loss_clip": 0.01047908, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.0323019, + "balance_loss_mlp": 1.01688468, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 2.199672448876675, + "language_loss": 0.72626734, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74703121, + "num_input_tokens_seen": 250736565, + "step": 11621, + "time_per_iteration": 2.7176284790039062 + }, + { + "auxiliary_loss_clip": 0.01044397, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.03175485, + "balance_loss_mlp": 1.02850986, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.8443445808100467, + "language_loss": 0.68528235, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70612955, + "num_input_tokens_seen": 250757235, + "step": 11622, + "time_per_iteration": 2.676520586013794 + }, + { + "auxiliary_loss_clip": 0.01031118, + "auxiliary_loss_mlp": 0.01026755, + "balance_loss_clip": 1.0299226, + "balance_loss_mlp": 1.01624084, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.7724589971481253, + "language_loss": 0.62551779, + "learning_rate": 8.783373729494721e-07, + "loss": 0.64609647, + "num_input_tokens_seen": 250775585, + "step": 11623, + "time_per_iteration": 2.7442498207092285 + }, + { + "auxiliary_loss_clip": 0.01101295, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.03333294, + "balance_loss_mlp": 1.01634622, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.880513711093002, + "language_loss": 0.6054064, + "learning_rate": 8.780149471723932e-07, + "loss": 0.62670147, + "num_input_tokens_seen": 250795725, + "step": 11624, + "time_per_iteration": 4.1834917068481445 + }, + { + "auxiliary_loss_clip": 0.01089976, + "auxiliary_loss_mlp": 0.01040968, + "balance_loss_clip": 1.03279364, + "balance_loss_mlp": 1.02900577, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.629404051523804, + "language_loss": 0.78091615, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80222559, + "num_input_tokens_seen": 250814555, + "step": 11625, + "time_per_iteration": 2.5779929161071777 + }, + { + "auxiliary_loss_clip": 0.0105921, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.03036034, + "balance_loss_mlp": 1.01925468, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 1.8537573767999942, + "language_loss": 0.66165686, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68254399, + "num_input_tokens_seen": 250833105, + "step": 11626, + "time_per_iteration": 2.647240400314331 + }, + { + "auxiliary_loss_clip": 0.0108061, + "auxiliary_loss_mlp": 0.00749445, + "balance_loss_clip": 1.03627849, + "balance_loss_mlp": 1.00023091, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 3.2555477385435725, + "language_loss": 0.70548964, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72379017, + "num_input_tokens_seen": 250852570, + "step": 11627, + "time_per_iteration": 2.637277841567993 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.03530431, + "balance_loss_mlp": 1.01929033, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.8932103250126135, + "language_loss": 0.62221491, + "learning_rate": 8.767256696441768e-07, + "loss": 0.6434769, + "num_input_tokens_seen": 250870500, + "step": 11628, + "time_per_iteration": 2.494364023208618 + }, + { + "auxiliary_loss_clip": 0.01089642, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.0329628, + "balance_loss_mlp": 1.02306771, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.6431013686705844, + "language_loss": 0.67881691, + "learning_rate": 8.764034567182581e-07, + "loss": 0.7000643, + "num_input_tokens_seen": 250892745, + "step": 11629, + "time_per_iteration": 2.740403652191162 + }, + { + "auxiliary_loss_clip": 0.01100584, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.03595698, + "balance_loss_mlp": 1.02119875, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.8534178977896691, + "language_loss": 0.72637331, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74770498, + "num_input_tokens_seen": 250910225, + "step": 11630, + "time_per_iteration": 2.576423168182373 + }, + { + "auxiliary_loss_clip": 0.01099792, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.03654563, + "balance_loss_mlp": 1.02268398, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.6590768169271803, + "language_loss": 0.74277103, + "learning_rate": 8.757591586993196e-07, + "loss": 0.7641046, + "num_input_tokens_seen": 250929715, + "step": 11631, + "time_per_iteration": 2.5037600994110107 + }, + { + "auxiliary_loss_clip": 0.01093093, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.03730726, + "balance_loss_mlp": 1.01866496, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 3.986575345756714, + "language_loss": 0.89144671, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91268617, + "num_input_tokens_seen": 250944230, + "step": 11632, + "time_per_iteration": 2.5550451278686523 + }, + { + "auxiliary_loss_clip": 0.01082495, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.03709102, + "balance_loss_mlp": 1.02542925, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.4707345949183592, + "language_loss": 0.80111521, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82230729, + "num_input_tokens_seen": 250961865, + "step": 11633, + "time_per_iteration": 4.116192817687988 + }, + { + "auxiliary_loss_clip": 0.01103213, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.0354259, + "balance_loss_mlp": 1.02323234, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 2.1238143956147795, + "language_loss": 0.67042106, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69181263, + "num_input_tokens_seen": 250982025, + "step": 11634, + "time_per_iteration": 2.5306947231292725 + }, + { + "auxiliary_loss_clip": 0.01005357, + "auxiliary_loss_mlp": 0.0100408, + "balance_loss_clip": 1.01229191, + "balance_loss_mlp": 1.00296521, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.7268865933139355, + "language_loss": 0.53138936, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55148375, + "num_input_tokens_seen": 251046900, + "step": 11635, + "time_per_iteration": 3.27073073387146 + }, + { + "auxiliary_loss_clip": 0.01074767, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.03425562, + "balance_loss_mlp": 1.01573944, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.8015954863440478, + "language_loss": 0.82134581, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84236598, + "num_input_tokens_seen": 251065050, + "step": 11636, + "time_per_iteration": 2.601003885269165 + }, + { + "auxiliary_loss_clip": 0.01100643, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.03504944, + "balance_loss_mlp": 1.01873946, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 3.115635364850995, + "language_loss": 0.83124483, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85255176, + "num_input_tokens_seen": 251083355, + "step": 11637, + "time_per_iteration": 4.174327611923218 + }, + { + "auxiliary_loss_clip": 0.0104843, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.03140926, + "balance_loss_mlp": 1.02173603, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 1.8973038753765303, + "language_loss": 0.67644757, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69726288, + "num_input_tokens_seen": 251096420, + "step": 11638, + "time_per_iteration": 2.680941343307495 + }, + { + "auxiliary_loss_clip": 0.01093446, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.0347681, + "balance_loss_mlp": 1.02043414, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 5.494158902030951, + "language_loss": 0.77844608, + "learning_rate": 8.731836728534459e-07, + "loss": 0.79970938, + "num_input_tokens_seen": 251115410, + "step": 11639, + "time_per_iteration": 2.602210283279419 + }, + { + "auxiliary_loss_clip": 0.01074933, + "auxiliary_loss_mlp": 0.0103574, + "balance_loss_clip": 1.03408599, + "balance_loss_mlp": 1.02370024, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.1054761046007564, + "language_loss": 0.82137454, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84248126, + "num_input_tokens_seen": 251133530, + "step": 11640, + "time_per_iteration": 2.6217150688171387 + }, + { + "auxiliary_loss_clip": 0.01067391, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.03282905, + "balance_loss_mlp": 1.01909268, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.7381927022666561, + "language_loss": 0.75622517, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77719975, + "num_input_tokens_seen": 251153985, + "step": 11641, + "time_per_iteration": 2.733583450317383 + }, + { + "auxiliary_loss_clip": 0.01076212, + "auxiliary_loss_mlp": 0.01022343, + "balance_loss_clip": 1.03474474, + "balance_loss_mlp": 1.01031518, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 2.067274267469923, + "language_loss": 0.77657259, + "learning_rate": 8.722185703539022e-07, + "loss": 0.79755807, + "num_input_tokens_seen": 251173225, + "step": 11642, + "time_per_iteration": 2.568542957305908 + }, + { + "auxiliary_loss_clip": 0.01098478, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.03805864, + "balance_loss_mlp": 1.01948595, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 2.550216577282053, + "language_loss": 0.74473095, + "learning_rate": 8.718969550356266e-07, + "loss": 0.76604861, + "num_input_tokens_seen": 251192485, + "step": 11643, + "time_per_iteration": 2.636115312576294 + }, + { + "auxiliary_loss_clip": 0.01066607, + "auxiliary_loss_mlp": 0.0102898, + "balance_loss_clip": 1.033741, + "balance_loss_mlp": 1.01747632, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.4380532814983835, + "language_loss": 0.60423434, + "learning_rate": 8.715753824951315e-07, + "loss": 0.6251902, + "num_input_tokens_seen": 251214965, + "step": 11644, + "time_per_iteration": 2.6957061290740967 + }, + { + "auxiliary_loss_clip": 0.0108468, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.03189647, + "balance_loss_mlp": 1.01835918, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.6624616344715228, + "language_loss": 0.81792849, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83906782, + "num_input_tokens_seen": 251234500, + "step": 11645, + "time_per_iteration": 2.6124870777130127 + }, + { + "auxiliary_loss_clip": 0.01088841, + "auxiliary_loss_mlp": 0.01027782, + "balance_loss_clip": 1.03435123, + "balance_loss_mlp": 1.01621246, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 5.095156452961516, + "language_loss": 0.68237305, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70353925, + "num_input_tokens_seen": 251254360, + "step": 11646, + "time_per_iteration": 2.5534915924072266 + }, + { + "auxiliary_loss_clip": 0.01081829, + "auxiliary_loss_mlp": 0.01036808, + "balance_loss_clip": 1.03333783, + "balance_loss_mlp": 1.02476764, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.648263658776379, + "language_loss": 0.7113694, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73255575, + "num_input_tokens_seen": 251274790, + "step": 11647, + "time_per_iteration": 4.177159786224365 + }, + { + "auxiliary_loss_clip": 0.0109178, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.03604674, + "balance_loss_mlp": 1.02156484, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.7252545225827236, + "language_loss": 0.71623993, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73749065, + "num_input_tokens_seen": 251296275, + "step": 11648, + "time_per_iteration": 2.6827657222747803 + }, + { + "auxiliary_loss_clip": 0.01038908, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.02654016, + "balance_loss_mlp": 1.01897526, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 2.0684177042800735, + "language_loss": 0.77483034, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79552585, + "num_input_tokens_seen": 251317375, + "step": 11649, + "time_per_iteration": 2.7722911834716797 + }, + { + "auxiliary_loss_clip": 0.0107543, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.03216577, + "balance_loss_mlp": 1.02260971, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 1.7805972918889432, + "language_loss": 0.78435177, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80544752, + "num_input_tokens_seen": 251333570, + "step": 11650, + "time_per_iteration": 2.583803415298462 + }, + { + "auxiliary_loss_clip": 0.01076703, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.03341031, + "balance_loss_mlp": 1.01909804, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 1.9871523593319724, + "language_loss": 0.78705585, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80812603, + "num_input_tokens_seen": 251351070, + "step": 11651, + "time_per_iteration": 2.598720073699951 + }, + { + "auxiliary_loss_clip": 0.01060887, + "auxiliary_loss_mlp": 0.01040108, + "balance_loss_clip": 1.03194547, + "balance_loss_mlp": 1.02771008, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.5424477259104614, + "language_loss": 0.69138062, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71239054, + "num_input_tokens_seen": 251370005, + "step": 11652, + "time_per_iteration": 2.724555492401123 + }, + { + "auxiliary_loss_clip": 0.01089921, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.03515768, + "balance_loss_mlp": 1.01875997, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.6014919301968977, + "language_loss": 0.74292231, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76412445, + "num_input_tokens_seen": 251391210, + "step": 11653, + "time_per_iteration": 2.5942604541778564 + }, + { + "auxiliary_loss_clip": 0.01072286, + "auxiliary_loss_mlp": 0.01031716, + "balance_loss_clip": 1.03458929, + "balance_loss_mlp": 1.01933622, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.2794082706059884, + "language_loss": 0.70841098, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72945094, + "num_input_tokens_seen": 251411505, + "step": 11654, + "time_per_iteration": 2.5385141372680664 + }, + { + "auxiliary_loss_clip": 0.01053245, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.03195906, + "balance_loss_mlp": 1.02022183, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 6.990837771911821, + "language_loss": 0.73172283, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75258303, + "num_input_tokens_seen": 251428975, + "step": 11655, + "time_per_iteration": 2.6076526641845703 + }, + { + "auxiliary_loss_clip": 0.01098718, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.03783035, + "balance_loss_mlp": 1.02447462, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.949790989883395, + "language_loss": 0.70659232, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72795379, + "num_input_tokens_seen": 251446940, + "step": 11656, + "time_per_iteration": 2.552405834197998 + }, + { + "auxiliary_loss_clip": 0.01052928, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.03193736, + "balance_loss_mlp": 1.01929903, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.769992801701089, + "language_loss": 0.78042459, + "learning_rate": 8.673988377928092e-07, + "loss": 0.80125153, + "num_input_tokens_seen": 251466205, + "step": 11657, + "time_per_iteration": 2.741978645324707 + }, + { + "auxiliary_loss_clip": 0.01104558, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.0361712, + "balance_loss_mlp": 1.02139556, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 1.9539622098778204, + "language_loss": 0.78015107, + "learning_rate": 8.670778654208797e-07, + "loss": 0.80153817, + "num_input_tokens_seen": 251484820, + "step": 11658, + "time_per_iteration": 2.4651997089385986 + }, + { + "auxiliary_loss_clip": 0.01069826, + "auxiliary_loss_mlp": 0.01025497, + "balance_loss_clip": 1.02941561, + "balance_loss_mlp": 1.0141778, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 2.2631467241114525, + "language_loss": 0.82853186, + "learning_rate": 8.667569360094713e-07, + "loss": 0.8494851, + "num_input_tokens_seen": 251502670, + "step": 11659, + "time_per_iteration": 2.5822389125823975 + }, + { + "auxiliary_loss_clip": 0.01053776, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.03190589, + "balance_loss_mlp": 1.01607656, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 3.720542130455544, + "language_loss": 0.69398642, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71479356, + "num_input_tokens_seen": 251521630, + "step": 11660, + "time_per_iteration": 2.7711243629455566 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.033885, + "balance_loss_mlp": 1.02226794, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 2.1191763804599746, + "language_loss": 0.80620134, + "learning_rate": 8.661152061168924e-07, + "loss": 0.82755697, + "num_input_tokens_seen": 251540105, + "step": 11661, + "time_per_iteration": 2.5572144985198975 + }, + { + "auxiliary_loss_clip": 0.01086525, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.03174806, + "balance_loss_mlp": 1.01776397, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.6422069256437413, + "language_loss": 0.79535186, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81650597, + "num_input_tokens_seen": 251560530, + "step": 11662, + "time_per_iteration": 2.625246286392212 + }, + { + "auxiliary_loss_clip": 0.01082898, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.03213024, + "balance_loss_mlp": 1.01954687, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 1.7168289951541826, + "language_loss": 0.83651412, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85767078, + "num_input_tokens_seen": 251577930, + "step": 11663, + "time_per_iteration": 2.593626022338867 + }, + { + "auxiliary_loss_clip": 0.01016455, + "auxiliary_loss_mlp": 0.01002927, + "balance_loss_clip": 1.00588429, + "balance_loss_mlp": 1.00191355, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8227639805724082, + "language_loss": 0.53712529, + "learning_rate": 8.651529337861209e-07, + "loss": 0.5573191, + "num_input_tokens_seen": 251638820, + "step": 11664, + "time_per_iteration": 4.577855587005615 + }, + { + "auxiliary_loss_clip": 0.01076451, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.03266525, + "balance_loss_mlp": 1.01940811, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 2.038364836190313, + "language_loss": 0.78780794, + "learning_rate": 8.64832262393344e-07, + "loss": 0.80888462, + "num_input_tokens_seen": 251658070, + "step": 11665, + "time_per_iteration": 2.636014699935913 + }, + { + "auxiliary_loss_clip": 0.01080773, + "auxiliary_loss_mlp": 0.01028827, + "balance_loss_clip": 1.03112197, + "balance_loss_mlp": 1.01746655, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.7722625619101633, + "language_loss": 0.76908082, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79017675, + "num_input_tokens_seen": 251671575, + "step": 11666, + "time_per_iteration": 2.614211320877075 + }, + { + "auxiliary_loss_clip": 0.01082438, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.03294778, + "balance_loss_mlp": 1.01886654, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 2.262847346482459, + "language_loss": 0.81092155, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83204734, + "num_input_tokens_seen": 251689350, + "step": 11667, + "time_per_iteration": 2.570767402648926 + }, + { + "auxiliary_loss_clip": 0.0106523, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.03167582, + "balance_loss_mlp": 1.02259731, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.029594989646587, + "language_loss": 0.64969331, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67068505, + "num_input_tokens_seen": 251704635, + "step": 11668, + "time_per_iteration": 2.668860912322998 + }, + { + "auxiliary_loss_clip": 0.01078415, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.032511, + "balance_loss_mlp": 1.01496565, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.195426987743117, + "language_loss": 0.76404661, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78509426, + "num_input_tokens_seen": 251723035, + "step": 11669, + "time_per_iteration": 2.624643564224243 + }, + { + "auxiliary_loss_clip": 0.01007063, + "auxiliary_loss_mlp": 0.00999208, + "balance_loss_clip": 1.00705385, + "balance_loss_mlp": 0.99820691, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6940138363937264, + "language_loss": 0.54495448, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56501716, + "num_input_tokens_seen": 251791630, + "step": 11670, + "time_per_iteration": 3.2328245639801025 + }, + { + "auxiliary_loss_clip": 0.01075464, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.03521252, + "balance_loss_mlp": 1.02637076, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.947640665675841, + "language_loss": 0.81519711, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83632922, + "num_input_tokens_seen": 251809840, + "step": 11671, + "time_per_iteration": 2.6022682189941406 + }, + { + "auxiliary_loss_clip": 0.01091692, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.03582358, + "balance_loss_mlp": 1.01750028, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 2.071563776840402, + "language_loss": 0.7513091, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77252126, + "num_input_tokens_seen": 251827550, + "step": 11672, + "time_per_iteration": 2.527268886566162 + }, + { + "auxiliary_loss_clip": 0.01079607, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.03143382, + "balance_loss_mlp": 1.01793003, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.729434518645247, + "language_loss": 0.86487496, + "learning_rate": 8.622684419164883e-07, + "loss": 0.88596952, + "num_input_tokens_seen": 251844880, + "step": 11673, + "time_per_iteration": 4.056189298629761 + }, + { + "auxiliary_loss_clip": 0.01081833, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.03206134, + "balance_loss_mlp": 1.01492894, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.7968482112757114, + "language_loss": 0.72903812, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75012362, + "num_input_tokens_seen": 251861025, + "step": 11674, + "time_per_iteration": 2.5698113441467285 + }, + { + "auxiliary_loss_clip": 0.01080022, + "auxiliary_loss_mlp": 0.00749098, + "balance_loss_clip": 1.03565121, + "balance_loss_mlp": 1.00018501, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.7519585175385648, + "language_loss": 0.72127163, + "learning_rate": 8.616279179832329e-07, + "loss": 0.73956281, + "num_input_tokens_seen": 251880175, + "step": 11675, + "time_per_iteration": 2.5972650051116943 + }, + { + "auxiliary_loss_clip": 0.010595, + "auxiliary_loss_mlp": 0.01027392, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.01534557, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 2.560500204464737, + "language_loss": 0.51167333, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53254223, + "num_input_tokens_seen": 251899005, + "step": 11676, + "time_per_iteration": 2.6436426639556885 + }, + { + "auxiliary_loss_clip": 0.01005607, + "auxiliary_loss_mlp": 0.00746594, + "balance_loss_clip": 1.00523543, + "balance_loss_mlp": 0.99983686, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7341625226852004, + "language_loss": 0.59220755, + "learning_rate": 8.609875667187079e-07, + "loss": 0.60972959, + "num_input_tokens_seen": 251966790, + "step": 11677, + "time_per_iteration": 4.721893787384033 + }, + { + "auxiliary_loss_clip": 0.0108206, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.03132653, + "balance_loss_mlp": 1.01961803, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 1.796913624355752, + "language_loss": 0.6229645, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64411235, + "num_input_tokens_seen": 251989315, + "step": 11678, + "time_per_iteration": 2.5868616104125977 + }, + { + "auxiliary_loss_clip": 0.01098812, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.03443837, + "balance_loss_mlp": 1.02041256, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.9641357424976702, + "language_loss": 0.79272717, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81403726, + "num_input_tokens_seen": 252006620, + "step": 11679, + "time_per_iteration": 2.4968490600585938 + }, + { + "auxiliary_loss_clip": 0.01067098, + "auxiliary_loss_mlp": 0.01041769, + "balance_loss_clip": 1.03154182, + "balance_loss_mlp": 1.02971697, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.194077226097413, + "language_loss": 0.7124241, + "learning_rate": 8.600273637882567e-07, + "loss": 0.73351276, + "num_input_tokens_seen": 252024570, + "step": 11680, + "time_per_iteration": 2.597168207168579 + }, + { + "auxiliary_loss_clip": 0.01055967, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.03020144, + "balance_loss_mlp": 1.01898909, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 2.117123119207314, + "language_loss": 0.7501415, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77101648, + "num_input_tokens_seen": 252042775, + "step": 11681, + "time_per_iteration": 2.610758066177368 + }, + { + "auxiliary_loss_clip": 0.01081546, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.0353744, + "balance_loss_mlp": 1.02026439, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.4614982172489581, + "language_loss": 0.7669239, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78804636, + "num_input_tokens_seen": 252063690, + "step": 11682, + "time_per_iteration": 2.6772685050964355 + }, + { + "auxiliary_loss_clip": 0.0106945, + "auxiliary_loss_mlp": 0.00749335, + "balance_loss_clip": 1.03567755, + "balance_loss_mlp": 1.00019979, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 2.1261345422746722, + "language_loss": 0.73846638, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75665414, + "num_input_tokens_seen": 252080335, + "step": 11683, + "time_per_iteration": 2.630676507949829 + }, + { + "auxiliary_loss_clip": 0.01065692, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.03393435, + "balance_loss_mlp": 1.01473522, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 2.0389353061975886, + "language_loss": 0.71351331, + "learning_rate": 8.587476984611976e-07, + "loss": 0.7344417, + "num_input_tokens_seen": 252101075, + "step": 11684, + "time_per_iteration": 2.664503574371338 + }, + { + "auxiliary_loss_clip": 0.01091434, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.03540456, + "balance_loss_mlp": 1.02002573, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.8147592493924078, + "language_loss": 0.71714616, + "learning_rate": 8.584278902901128e-07, + "loss": 0.73837948, + "num_input_tokens_seen": 252120510, + "step": 11685, + "time_per_iteration": 2.7287471294403076 + }, + { + "auxiliary_loss_clip": 0.010845, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.0318532, + "balance_loss_mlp": 1.0182538, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.7199246317777122, + "language_loss": 0.84417558, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86531246, + "num_input_tokens_seen": 252137590, + "step": 11686, + "time_per_iteration": 2.576439142227173 + }, + { + "auxiliary_loss_clip": 0.01016663, + "auxiliary_loss_mlp": 0.01006347, + "balance_loss_clip": 1.0082761, + "balance_loss_mlp": 1.0051012, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9797536948861865, + "language_loss": 0.69803506, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71826518, + "num_input_tokens_seen": 252199830, + "step": 11687, + "time_per_iteration": 3.289698600769043 + }, + { + "auxiliary_loss_clip": 0.01060576, + "auxiliary_loss_mlp": 0.01030042, + "balance_loss_clip": 1.03099847, + "balance_loss_mlp": 1.01822853, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.002903369934983, + "language_loss": 0.77106857, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79197472, + "num_input_tokens_seen": 252217200, + "step": 11688, + "time_per_iteration": 4.18194055557251 + }, + { + "auxiliary_loss_clip": 0.01100334, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.03472161, + "balance_loss_mlp": 1.01939642, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.629587440772158, + "language_loss": 0.68351811, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70482969, + "num_input_tokens_seen": 252236105, + "step": 11689, + "time_per_iteration": 2.5861642360687256 + }, + { + "auxiliary_loss_clip": 0.01079532, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.0338037, + "balance_loss_mlp": 1.02068961, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 2.5670589494946356, + "language_loss": 0.80055583, + "learning_rate": 8.568294990051086e-07, + "loss": 0.82168174, + "num_input_tokens_seen": 252253315, + "step": 11690, + "time_per_iteration": 2.6210639476776123 + }, + { + "auxiliary_loss_clip": 0.01101907, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.03639627, + "balance_loss_mlp": 1.02198672, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 2.308284864012828, + "language_loss": 0.75950563, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78085929, + "num_input_tokens_seen": 252272765, + "step": 11691, + "time_per_iteration": 2.545835494995117 + }, + { + "auxiliary_loss_clip": 0.01073105, + "auxiliary_loss_mlp": 0.01023284, + "balance_loss_clip": 1.0334568, + "balance_loss_mlp": 1.012555, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.8278517701624857, + "language_loss": 0.81881833, + "learning_rate": 8.561904458502429e-07, + "loss": 0.83978212, + "num_input_tokens_seen": 252290510, + "step": 11692, + "time_per_iteration": 2.6273152828216553 + }, + { + "auxiliary_loss_clip": 0.01075796, + "auxiliary_loss_mlp": 0.01028964, + "balance_loss_clip": 1.03421664, + "balance_loss_mlp": 1.01726413, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.5109476085699145, + "language_loss": 0.76743948, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78848708, + "num_input_tokens_seen": 252309365, + "step": 11693, + "time_per_iteration": 2.645970344543457 + }, + { + "auxiliary_loss_clip": 0.0107261, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.03495514, + "balance_loss_mlp": 1.02113724, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.463289443064594, + "language_loss": 0.68445843, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70551056, + "num_input_tokens_seen": 252333010, + "step": 11694, + "time_per_iteration": 2.780918598175049 + }, + { + "auxiliary_loss_clip": 0.01100328, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.0341351, + "balance_loss_mlp": 1.01701808, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.2383123514663614, + "language_loss": 0.76105928, + "learning_rate": 8.552321914485203e-07, + "loss": 0.7823475, + "num_input_tokens_seen": 252351330, + "step": 11695, + "time_per_iteration": 2.5517284870147705 + }, + { + "auxiliary_loss_clip": 0.01078692, + "auxiliary_loss_mlp": 0.01039224, + "balance_loss_clip": 1.0374285, + "balance_loss_mlp": 1.02712405, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 1.9042950635777158, + "language_loss": 0.7351321, + "learning_rate": 8.549128601178852e-07, + "loss": 0.7563113, + "num_input_tokens_seen": 252369580, + "step": 11696, + "time_per_iteration": 2.632317304611206 + }, + { + "auxiliary_loss_clip": 0.01083796, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.03553641, + "balance_loss_mlp": 1.01834929, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.6070350948774985, + "language_loss": 0.7527135, + "learning_rate": 8.545935722090693e-07, + "loss": 0.7738564, + "num_input_tokens_seen": 252390525, + "step": 11697, + "time_per_iteration": 2.69484543800354 + }, + { + "auxiliary_loss_clip": 0.01047021, + "auxiliary_loss_mlp": 0.01034128, + "balance_loss_clip": 1.03530407, + "balance_loss_mlp": 1.02048457, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 2.733542772244257, + "language_loss": 0.80635166, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82716316, + "num_input_tokens_seen": 252407470, + "step": 11698, + "time_per_iteration": 2.612133502960205 + }, + { + "auxiliary_loss_clip": 0.01070089, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.03223133, + "balance_loss_mlp": 1.02429128, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.5842994742055982, + "language_loss": 0.84778094, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86885285, + "num_input_tokens_seen": 252427025, + "step": 11699, + "time_per_iteration": 2.57454776763916 + }, + { + "auxiliary_loss_clip": 0.01081543, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.03376627, + "balance_loss_mlp": 1.0170114, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 1.8353129795088803, + "language_loss": 0.78919983, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81031328, + "num_input_tokens_seen": 252445410, + "step": 11700, + "time_per_iteration": 2.5681443214416504 + }, + { + "auxiliary_loss_clip": 0.01088774, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.0334785, + "balance_loss_mlp": 1.01596427, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.749584826013567, + "language_loss": 0.74224949, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76341814, + "num_input_tokens_seen": 252463905, + "step": 11701, + "time_per_iteration": 2.6923089027404785 + }, + { + "auxiliary_loss_clip": 0.01095179, + "auxiliary_loss_mlp": 0.01028526, + "balance_loss_clip": 1.03727865, + "balance_loss_mlp": 1.01599681, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.2358270388335626, + "language_loss": 0.84178287, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86301994, + "num_input_tokens_seen": 252478655, + "step": 11702, + "time_per_iteration": 2.5690348148345947 + }, + { + "auxiliary_loss_clip": 0.01099469, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.0336678, + "balance_loss_mlp": 1.02510595, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 2.6516066480648512, + "language_loss": 0.61111712, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63248491, + "num_input_tokens_seen": 252498740, + "step": 11703, + "time_per_iteration": 4.077753305435181 + }, + { + "auxiliary_loss_clip": 0.01098708, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.03297412, + "balance_loss_mlp": 1.01504469, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 1.86353167100017, + "language_loss": 0.60990375, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63116133, + "num_input_tokens_seen": 252517800, + "step": 11704, + "time_per_iteration": 2.581512451171875 + }, + { + "auxiliary_loss_clip": 0.0108209, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_clip": 1.03316498, + "balance_loss_mlp": 1.01688075, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 2.370377682251208, + "language_loss": 0.70745492, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72854614, + "num_input_tokens_seen": 252539620, + "step": 11705, + "time_per_iteration": 2.6265881061553955 + }, + { + "auxiliary_loss_clip": 0.01087545, + "auxiliary_loss_mlp": 0.01033681, + "balance_loss_clip": 1.0339129, + "balance_loss_mlp": 1.02189755, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 2.5285800108590575, + "language_loss": 0.61945724, + "learning_rate": 8.517219370087645e-07, + "loss": 0.64066947, + "num_input_tokens_seen": 252557300, + "step": 11706, + "time_per_iteration": 2.583555221557617 + }, + { + "auxiliary_loss_clip": 0.0109214, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.03507388, + "balance_loss_mlp": 1.01449347, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 1.9742030615386037, + "language_loss": 0.68008965, + "learning_rate": 8.514030839837756e-07, + "loss": 0.7012648, + "num_input_tokens_seen": 252576715, + "step": 11707, + "time_per_iteration": 2.5707924365997314 + }, + { + "auxiliary_loss_clip": 0.01096525, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.03344953, + "balance_loss_mlp": 1.01832128, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 2.0400464538275473, + "language_loss": 0.7621026, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78335977, + "num_input_tokens_seen": 252596190, + "step": 11708, + "time_per_iteration": 2.616262197494507 + }, + { + "auxiliary_loss_clip": 0.01072926, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.03182602, + "balance_loss_mlp": 1.01718855, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 2.2011564103406416, + "language_loss": 0.72328663, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74430025, + "num_input_tokens_seen": 252613410, + "step": 11709, + "time_per_iteration": 2.674410343170166 + }, + { + "auxiliary_loss_clip": 0.01086504, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.03321016, + "balance_loss_mlp": 1.0160141, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.194645383873997, + "language_loss": 0.78880537, + "learning_rate": 8.504467862866267e-07, + "loss": 0.80994129, + "num_input_tokens_seen": 252629150, + "step": 11710, + "time_per_iteration": 2.516737937927246 + }, + { + "auxiliary_loss_clip": 0.01089927, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.03474617, + "balance_loss_mlp": 1.02109921, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.5794229080231528, + "language_loss": 0.7737754, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79500437, + "num_input_tokens_seen": 252648225, + "step": 11711, + "time_per_iteration": 2.644322633743286 + }, + { + "auxiliary_loss_clip": 0.01060152, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.02988541, + "balance_loss_mlp": 1.01843989, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.6353093803361145, + "language_loss": 0.73988485, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76077318, + "num_input_tokens_seen": 252665380, + "step": 11712, + "time_per_iteration": 2.602154493331909 + }, + { + "auxiliary_loss_clip": 0.00986039, + "auxiliary_loss_mlp": 0.01002057, + "balance_loss_clip": 1.00671542, + "balance_loss_mlp": 1.00096631, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8826189861876093, + "language_loss": 0.64653027, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66641122, + "num_input_tokens_seen": 252727950, + "step": 11713, + "time_per_iteration": 4.8089141845703125 + }, + { + "auxiliary_loss_clip": 0.01080626, + "auxiliary_loss_mlp": 0.01024995, + "balance_loss_clip": 1.03043056, + "balance_loss_mlp": 1.01450431, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.0234316024179115, + "language_loss": 0.72970665, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75076288, + "num_input_tokens_seen": 252746770, + "step": 11714, + "time_per_iteration": 2.605501413345337 + }, + { + "auxiliary_loss_clip": 0.0107419, + "auxiliary_loss_mlp": 0.00749511, + "balance_loss_clip": 1.03322089, + "balance_loss_mlp": 1.0002805, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 2.2979085578949396, + "language_loss": 0.79333252, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81156957, + "num_input_tokens_seen": 252765610, + "step": 11715, + "time_per_iteration": 2.5799450874328613 + }, + { + "auxiliary_loss_clip": 0.01067688, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.03242648, + "balance_loss_mlp": 1.0204668, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.1646130929075555, + "language_loss": 0.71228898, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73329246, + "num_input_tokens_seen": 252781610, + "step": 11716, + "time_per_iteration": 2.5346052646636963 + }, + { + "auxiliary_loss_clip": 0.01068391, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.03952599, + "balance_loss_mlp": 1.01992226, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 1.8708588118220089, + "language_loss": 0.66679287, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68779266, + "num_input_tokens_seen": 252800600, + "step": 11717, + "time_per_iteration": 4.162331819534302 + }, + { + "auxiliary_loss_clip": 0.01100123, + "auxiliary_loss_mlp": 0.01029264, + "balance_loss_clip": 1.03505766, + "balance_loss_mlp": 1.01875544, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 2.181841657533062, + "language_loss": 0.74201053, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76330435, + "num_input_tokens_seen": 252822310, + "step": 11718, + "time_per_iteration": 2.5056099891662598 + }, + { + "auxiliary_loss_clip": 0.01082653, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.03207815, + "balance_loss_mlp": 1.02049959, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 3.9362625505304574, + "language_loss": 0.79962587, + "learning_rate": 8.475802484232606e-07, + "loss": 0.82076198, + "num_input_tokens_seen": 252842355, + "step": 11719, + "time_per_iteration": 2.5467891693115234 + }, + { + "auxiliary_loss_clip": 0.0108972, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.03570318, + "balance_loss_mlp": 1.02361655, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.9539420103051937, + "language_loss": 0.65831041, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67956007, + "num_input_tokens_seen": 252866785, + "step": 11720, + "time_per_iteration": 2.754823684692383 + }, + { + "auxiliary_loss_clip": 0.01078929, + "auxiliary_loss_mlp": 0.01028672, + "balance_loss_clip": 1.03380227, + "balance_loss_mlp": 1.01691794, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.1058225655129026, + "language_loss": 0.797916, + "learning_rate": 8.46943720397872e-07, + "loss": 0.81899202, + "num_input_tokens_seen": 252881870, + "step": 11721, + "time_per_iteration": 2.607590436935425 + }, + { + "auxiliary_loss_clip": 0.00997703, + "auxiliary_loss_mlp": 0.01013591, + "balance_loss_clip": 1.0089426, + "balance_loss_mlp": 1.01238728, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7746662044389667, + "language_loss": 0.64787173, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66798472, + "num_input_tokens_seen": 252951300, + "step": 11722, + "time_per_iteration": 3.306441307067871 + }, + { + "auxiliary_loss_clip": 0.01077881, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.03524256, + "balance_loss_mlp": 1.0197947, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.7623596720307015, + "language_loss": 0.65841973, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67950338, + "num_input_tokens_seen": 252971400, + "step": 11723, + "time_per_iteration": 2.6033430099487305 + }, + { + "auxiliary_loss_clip": 0.01068533, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.0334444, + "balance_loss_mlp": 1.01744843, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.9902651687314594, + "language_loss": 0.81101429, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83198935, + "num_input_tokens_seen": 252989475, + "step": 11724, + "time_per_iteration": 2.58708119392395 + }, + { + "auxiliary_loss_clip": 0.01086695, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.03180695, + "balance_loss_mlp": 1.02150559, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 1.6420037584317666, + "language_loss": 0.73123288, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75242883, + "num_input_tokens_seen": 253007220, + "step": 11725, + "time_per_iteration": 2.5231375694274902 + }, + { + "auxiliary_loss_clip": 0.01049112, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.02962065, + "balance_loss_mlp": 1.01568007, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.3753737744196854, + "language_loss": 0.7875607, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80833483, + "num_input_tokens_seen": 253025410, + "step": 11726, + "time_per_iteration": 2.600543260574341 + }, + { + "auxiliary_loss_clip": 0.01077538, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.03172803, + "balance_loss_mlp": 1.01832223, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.6746878688761233, + "language_loss": 0.70785272, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72891766, + "num_input_tokens_seen": 253043305, + "step": 11727, + "time_per_iteration": 2.5091819763183594 + }, + { + "auxiliary_loss_clip": 0.01090724, + "auxiliary_loss_mlp": 0.00749195, + "balance_loss_clip": 1.03083897, + "balance_loss_mlp": 1.00022125, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 2.179442638708102, + "language_loss": 0.69026726, + "learning_rate": 8.44717250248668e-07, + "loss": 0.70866644, + "num_input_tokens_seen": 253062790, + "step": 11728, + "time_per_iteration": 4.061559677124023 + }, + { + "auxiliary_loss_clip": 0.01064063, + "auxiliary_loss_mlp": 0.0074932, + "balance_loss_clip": 1.03194571, + "balance_loss_mlp": 1.00021279, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 3.1991350013641955, + "language_loss": 0.73405176, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75218558, + "num_input_tokens_seen": 253082055, + "step": 11729, + "time_per_iteration": 2.690115213394165 + }, + { + "auxiliary_loss_clip": 0.01088933, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.03929245, + "balance_loss_mlp": 1.01787353, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.6661246945555643, + "language_loss": 0.77971733, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80090821, + "num_input_tokens_seen": 253102575, + "step": 11730, + "time_per_iteration": 2.6459431648254395 + }, + { + "auxiliary_loss_clip": 0.01098405, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.03308582, + "balance_loss_mlp": 1.02118516, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 3.108889484404216, + "language_loss": 0.63741589, + "learning_rate": 8.437637056415359e-07, + "loss": 0.65872312, + "num_input_tokens_seen": 253121290, + "step": 11731, + "time_per_iteration": 2.5063517093658447 + }, + { + "auxiliary_loss_clip": 0.01041524, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.03155184, + "balance_loss_mlp": 1.01587903, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.9692067514022695, + "language_loss": 0.74137288, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76206803, + "num_input_tokens_seen": 253139720, + "step": 11732, + "time_per_iteration": 2.668665885925293 + }, + { + "auxiliary_loss_clip": 0.0108487, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.03346276, + "balance_loss_mlp": 1.01762378, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.4654766449743937, + "language_loss": 0.71025443, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73138773, + "num_input_tokens_seen": 253160250, + "step": 11733, + "time_per_iteration": 2.515716075897217 + }, + { + "auxiliary_loss_clip": 0.01062633, + "auxiliary_loss_mlp": 0.01033199, + "balance_loss_clip": 1.03321052, + "balance_loss_mlp": 1.02178431, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.6755770816318716, + "language_loss": 0.73440522, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75536358, + "num_input_tokens_seen": 253178710, + "step": 11734, + "time_per_iteration": 2.626925230026245 + }, + { + "auxiliary_loss_clip": 0.01062327, + "auxiliary_loss_mlp": 0.01037213, + "balance_loss_clip": 1.03182006, + "balance_loss_mlp": 1.02365875, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.669475344559657, + "language_loss": 0.6903199, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71131527, + "num_input_tokens_seen": 253194805, + "step": 11735, + "time_per_iteration": 2.5636167526245117 + }, + { + "auxiliary_loss_clip": 0.01068929, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.03160083, + "balance_loss_mlp": 1.02065063, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 1.9553393782179314, + "language_loss": 0.72801626, + "learning_rate": 8.421753416821933e-07, + "loss": 0.7490412, + "num_input_tokens_seen": 253213895, + "step": 11736, + "time_per_iteration": 2.573045015335083 + }, + { + "auxiliary_loss_clip": 0.01074122, + "auxiliary_loss_mlp": 0.01024736, + "balance_loss_clip": 1.0322597, + "balance_loss_mlp": 1.01454353, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 2.792017994740541, + "language_loss": 0.68848401, + "learning_rate": 8.41857800556629e-07, + "loss": 0.70947266, + "num_input_tokens_seen": 253231620, + "step": 11737, + "time_per_iteration": 2.558624505996704 + }, + { + "auxiliary_loss_clip": 0.01067967, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.0353229, + "balance_loss_mlp": 1.0219698, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 3.5113693352004605, + "language_loss": 0.67285311, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69387472, + "num_input_tokens_seen": 253249590, + "step": 11738, + "time_per_iteration": 2.5525007247924805 + }, + { + "auxiliary_loss_clip": 0.01100156, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.03514266, + "balance_loss_mlp": 1.01841998, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.7214380985565567, + "language_loss": 0.74884295, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77014822, + "num_input_tokens_seen": 253273870, + "step": 11739, + "time_per_iteration": 2.7748606204986572 + }, + { + "auxiliary_loss_clip": 0.01068526, + "auxiliary_loss_mlp": 0.00749368, + "balance_loss_clip": 1.03169882, + "balance_loss_mlp": 1.00020361, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.5851084316322432, + "language_loss": 0.7144295, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73260844, + "num_input_tokens_seen": 253293720, + "step": 11740, + "time_per_iteration": 2.605905771255493 + }, + { + "auxiliary_loss_clip": 0.01066175, + "auxiliary_loss_mlp": 0.01025193, + "balance_loss_clip": 1.03347695, + "balance_loss_mlp": 1.01490533, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6337270681496427, + "language_loss": 0.82019031, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84110391, + "num_input_tokens_seen": 253313700, + "step": 11741, + "time_per_iteration": 2.6607165336608887 + }, + { + "auxiliary_loss_clip": 0.01074938, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.0317297, + "balance_loss_mlp": 1.01803696, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 2.0938643452469505, + "language_loss": 0.77552521, + "learning_rate": 8.402707539225993e-07, + "loss": 0.79657006, + "num_input_tokens_seen": 253332425, + "step": 11742, + "time_per_iteration": 2.588153600692749 + }, + { + "auxiliary_loss_clip": 0.0110174, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.03478074, + "balance_loss_mlp": 1.01952755, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 2.0265326037176696, + "language_loss": 0.64194345, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66327292, + "num_input_tokens_seen": 253353620, + "step": 11743, + "time_per_iteration": 4.108758926391602 + }, + { + "auxiliary_loss_clip": 0.01068331, + "auxiliary_loss_mlp": 0.01029386, + "balance_loss_clip": 1.02980983, + "balance_loss_mlp": 1.01680374, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 2.076515530320493, + "language_loss": 0.65538168, + "learning_rate": 8.396362430240902e-07, + "loss": 0.67635888, + "num_input_tokens_seen": 253370930, + "step": 11744, + "time_per_iteration": 2.554056406021118 + }, + { + "auxiliary_loss_clip": 0.01084123, + "auxiliary_loss_mlp": 0.0103489, + "balance_loss_clip": 1.03147006, + "balance_loss_mlp": 1.02370787, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 1.8708575561234577, + "language_loss": 0.63559449, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65678465, + "num_input_tokens_seen": 253389810, + "step": 11745, + "time_per_iteration": 2.5539183616638184 + }, + { + "auxiliary_loss_clip": 0.01060244, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.03009784, + "balance_loss_mlp": 1.02127028, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.6736483398726694, + "language_loss": 0.71596014, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73688889, + "num_input_tokens_seen": 253408685, + "step": 11746, + "time_per_iteration": 2.7033443450927734 + }, + { + "auxiliary_loss_clip": 0.01045601, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.03489304, + "balance_loss_mlp": 1.01955128, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.9846908183280048, + "language_loss": 0.79143822, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81220675, + "num_input_tokens_seen": 253429685, + "step": 11747, + "time_per_iteration": 2.7240631580352783 + }, + { + "auxiliary_loss_clip": 0.01083038, + "auxiliary_loss_mlp": 0.01028644, + "balance_loss_clip": 1.03192377, + "balance_loss_mlp": 1.0177002, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 3.344789007981107, + "language_loss": 0.65204, + "learning_rate": 8.383677493366031e-07, + "loss": 0.6731568, + "num_input_tokens_seen": 253448260, + "step": 11748, + "time_per_iteration": 2.5634593963623047 + }, + { + "auxiliary_loss_clip": 0.01055033, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.03088951, + "balance_loss_mlp": 1.02522707, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 1.6495051241345684, + "language_loss": 0.78963542, + "learning_rate": 8.380507360077003e-07, + "loss": 0.81055391, + "num_input_tokens_seen": 253467725, + "step": 11749, + "time_per_iteration": 2.6463301181793213 + }, + { + "auxiliary_loss_clip": 0.0102519, + "auxiliary_loss_mlp": 0.01005538, + "balance_loss_clip": 1.00469327, + "balance_loss_mlp": 1.00443482, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7911119506173063, + "language_loss": 0.54055262, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56085992, + "num_input_tokens_seen": 253526940, + "step": 11750, + "time_per_iteration": 3.016751527786255 + }, + { + "auxiliary_loss_clip": 0.01077856, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.03391695, + "balance_loss_mlp": 1.02070594, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 1.6577665452031305, + "language_loss": 0.78445327, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80555731, + "num_input_tokens_seen": 253546160, + "step": 11751, + "time_per_iteration": 2.6323771476745605 + }, + { + "auxiliary_loss_clip": 0.01061383, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.03094935, + "balance_loss_mlp": 1.01701093, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 2.1579152316239076, + "language_loss": 0.67676461, + "learning_rate": 8.370999604364634e-07, + "loss": 0.69765192, + "num_input_tokens_seen": 253565505, + "step": 11752, + "time_per_iteration": 2.6333835124969482 + }, + { + "auxiliary_loss_clip": 0.0104349, + "auxiliary_loss_mlp": 0.0074943, + "balance_loss_clip": 1.03291678, + "balance_loss_mlp": 1.00022137, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 3.246279640334918, + "language_loss": 0.76586086, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78378999, + "num_input_tokens_seen": 253585125, + "step": 11753, + "time_per_iteration": 4.252341270446777 + }, + { + "auxiliary_loss_clip": 0.01065886, + "auxiliary_loss_mlp": 0.00749109, + "balance_loss_clip": 1.03229284, + "balance_loss_mlp": 1.00024915, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.4667603134491958, + "language_loss": 0.7113688, + "learning_rate": 8.364663305220405e-07, + "loss": 0.72951871, + "num_input_tokens_seen": 253604815, + "step": 11754, + "time_per_iteration": 2.612483263015747 + }, + { + "auxiliary_loss_clip": 0.01055899, + "auxiliary_loss_mlp": 0.0103615, + "balance_loss_clip": 1.03202522, + "balance_loss_mlp": 1.02312684, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 4.66471519337322, + "language_loss": 0.89468515, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91560566, + "num_input_tokens_seen": 253622855, + "step": 11755, + "time_per_iteration": 2.5967466831207275 + }, + { + "auxiliary_loss_clip": 0.01076176, + "auxiliary_loss_mlp": 0.00749356, + "balance_loss_clip": 1.03385687, + "balance_loss_mlp": 1.00025344, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.5431386053645024, + "language_loss": 0.79744673, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81570208, + "num_input_tokens_seen": 253642760, + "step": 11756, + "time_per_iteration": 2.5772976875305176 + }, + { + "auxiliary_loss_clip": 0.00989468, + "auxiliary_loss_mlp": 0.01001027, + "balance_loss_clip": 1.00767672, + "balance_loss_mlp": 0.99988854, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8285150164587755, + "language_loss": 0.60425752, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62416244, + "num_input_tokens_seen": 253695685, + "step": 11757, + "time_per_iteration": 4.460303783416748 + }, + { + "auxiliary_loss_clip": 0.01065567, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.03373289, + "balance_loss_mlp": 1.02090573, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 1.6973344569636977, + "language_loss": 0.80170655, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82268405, + "num_input_tokens_seen": 253713305, + "step": 11758, + "time_per_iteration": 2.765810966491699 + }, + { + "auxiliary_loss_clip": 0.01055865, + "auxiliary_loss_mlp": 0.0074937, + "balance_loss_clip": 1.0303477, + "balance_loss_mlp": 1.00026965, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 1.7392698560345496, + "language_loss": 0.7747649, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79281729, + "num_input_tokens_seen": 253736100, + "step": 11759, + "time_per_iteration": 2.8712732791900635 + }, + { + "auxiliary_loss_clip": 0.01088507, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.0332191, + "balance_loss_mlp": 1.01685381, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 2.127831879188152, + "language_loss": 0.67878431, + "learning_rate": 8.34566500074583e-07, + "loss": 0.69995487, + "num_input_tokens_seen": 253757350, + "step": 11760, + "time_per_iteration": 2.5744106769561768 + }, + { + "auxiliary_loss_clip": 0.01061592, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.03337705, + "balance_loss_mlp": 1.01935554, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.837119306349754, + "language_loss": 0.80362821, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82454562, + "num_input_tokens_seen": 253772855, + "step": 11761, + "time_per_iteration": 2.5645291805267334 + }, + { + "auxiliary_loss_clip": 0.01067908, + "auxiliary_loss_mlp": 0.01039114, + "balance_loss_clip": 1.02972484, + "balance_loss_mlp": 1.02470195, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.343811141280805, + "language_loss": 0.74933702, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77040732, + "num_input_tokens_seen": 253790360, + "step": 11762, + "time_per_iteration": 2.5246589183807373 + }, + { + "auxiliary_loss_clip": 0.0107269, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.03164673, + "balance_loss_mlp": 1.02059925, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 2.8608629201825435, + "language_loss": 0.76884627, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78988475, + "num_input_tokens_seen": 253810585, + "step": 11763, + "time_per_iteration": 2.625964879989624 + }, + { + "auxiliary_loss_clip": 0.01069278, + "auxiliary_loss_mlp": 0.00749354, + "balance_loss_clip": 1.03404617, + "balance_loss_mlp": 1.0002315, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.2995330793046795, + "language_loss": 0.79097378, + "learning_rate": 8.333008301499453e-07, + "loss": 0.80916011, + "num_input_tokens_seen": 253829080, + "step": 11764, + "time_per_iteration": 2.702700614929199 + }, + { + "auxiliary_loss_clip": 0.01052758, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.03190637, + "balance_loss_mlp": 1.02184105, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.5879383233420303, + "language_loss": 0.79565465, + "learning_rate": 8.32984523242167e-07, + "loss": 0.8165192, + "num_input_tokens_seen": 253846780, + "step": 11765, + "time_per_iteration": 2.60005259513855 + }, + { + "auxiliary_loss_clip": 0.01096177, + "auxiliary_loss_mlp": 0.01027365, + "balance_loss_clip": 1.03381383, + "balance_loss_mlp": 1.01714861, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 2.306573165004428, + "language_loss": 0.68402761, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70526308, + "num_input_tokens_seen": 253867075, + "step": 11766, + "time_per_iteration": 2.6008434295654297 + }, + { + "auxiliary_loss_clip": 0.01075373, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.03097212, + "balance_loss_mlp": 1.0229274, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 2.0057408219667296, + "language_loss": 0.638381, + "learning_rate": 8.323520421986352e-07, + "loss": 0.65947866, + "num_input_tokens_seen": 253885790, + "step": 11767, + "time_per_iteration": 2.605499505996704 + }, + { + "auxiliary_loss_clip": 0.01086388, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.03222525, + "balance_loss_mlp": 1.01545215, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.784376638268657, + "language_loss": 0.52838552, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54951614, + "num_input_tokens_seen": 253907070, + "step": 11768, + "time_per_iteration": 4.0488128662109375 + }, + { + "auxiliary_loss_clip": 0.01074893, + "auxiliary_loss_mlp": 0.00749378, + "balance_loss_clip": 1.03298044, + "balance_loss_mlp": 1.00025141, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.698876323442641, + "language_loss": 0.75892788, + "learning_rate": 8.317197382644119e-07, + "loss": 0.77717054, + "num_input_tokens_seen": 253927290, + "step": 11769, + "time_per_iteration": 2.5902719497680664 + }, + { + "auxiliary_loss_clip": 0.0100599, + "auxiliary_loss_mlp": 0.01013734, + "balance_loss_clip": 1.00567937, + "balance_loss_mlp": 1.01239252, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8668966026025143, + "language_loss": 0.62029976, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64049697, + "num_input_tokens_seen": 253983440, + "step": 11770, + "time_per_iteration": 3.067113161087036 + }, + { + "auxiliary_loss_clip": 0.01062639, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.03004551, + "balance_loss_mlp": 1.02719772, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.707160619546261, + "language_loss": 0.76379919, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78481054, + "num_input_tokens_seen": 254003825, + "step": 11771, + "time_per_iteration": 2.6502859592437744 + }, + { + "auxiliary_loss_clip": 0.01083337, + "auxiliary_loss_mlp": 0.01024741, + "balance_loss_clip": 1.03187513, + "balance_loss_mlp": 1.01466203, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.7110899518019715, + "language_loss": 0.71288037, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73396111, + "num_input_tokens_seen": 254023345, + "step": 11772, + "time_per_iteration": 2.538564920425415 + }, + { + "auxiliary_loss_clip": 0.01051079, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.03298068, + "balance_loss_mlp": 1.01897287, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 2.0941015799654106, + "language_loss": 0.69818318, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71900266, + "num_input_tokens_seen": 254041815, + "step": 11773, + "time_per_iteration": 2.6785495281219482 + }, + { + "auxiliary_loss_clip": 0.01090461, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03505647, + "balance_loss_mlp": 1.01873398, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.452654605724828, + "language_loss": 0.70089495, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72209823, + "num_input_tokens_seen": 254062065, + "step": 11774, + "time_per_iteration": 2.561755895614624 + }, + { + "auxiliary_loss_clip": 0.01068343, + "auxiliary_loss_mlp": 0.01025256, + "balance_loss_clip": 1.03571439, + "balance_loss_mlp": 1.01457524, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.4961310284614813, + "language_loss": 0.74683321, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76776922, + "num_input_tokens_seen": 254080605, + "step": 11775, + "time_per_iteration": 2.658773899078369 + }, + { + "auxiliary_loss_clip": 0.01059198, + "auxiliary_loss_mlp": 0.00749302, + "balance_loss_clip": 1.03308094, + "balance_loss_mlp": 1.00023603, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.7338877872031435, + "language_loss": 0.86999261, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88807768, + "num_input_tokens_seen": 254098710, + "step": 11776, + "time_per_iteration": 2.6360669136047363 + }, + { + "auxiliary_loss_clip": 0.01077767, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03105569, + "balance_loss_mlp": 1.01862669, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.501172089872521, + "language_loss": 0.75058877, + "learning_rate": 8.291922955383641e-07, + "loss": 0.77165997, + "num_input_tokens_seen": 254117200, + "step": 11777, + "time_per_iteration": 2.5221996307373047 + }, + { + "auxiliary_loss_clip": 0.01084144, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.03765273, + "balance_loss_mlp": 1.01895249, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.5338827094351557, + "language_loss": 0.82322758, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84437597, + "num_input_tokens_seen": 254132115, + "step": 11778, + "time_per_iteration": 2.5865848064422607 + }, + { + "auxiliary_loss_clip": 0.01069997, + "auxiliary_loss_mlp": 0.01031165, + "balance_loss_clip": 1.03143239, + "balance_loss_mlp": 1.02115154, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.5078265723167883, + "language_loss": 0.84885931, + "learning_rate": 8.285608785887673e-07, + "loss": 0.8698709, + "num_input_tokens_seen": 254152285, + "step": 11779, + "time_per_iteration": 2.581634521484375 + }, + { + "auxiliary_loss_clip": 0.01075645, + "auxiliary_loss_mlp": 0.01034574, + "balance_loss_clip": 1.0339067, + "balance_loss_mlp": 1.02369583, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.3959098240442422, + "language_loss": 0.71685487, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73795706, + "num_input_tokens_seen": 254172805, + "step": 11780, + "time_per_iteration": 2.701509952545166 + }, + { + "auxiliary_loss_clip": 0.01044221, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.03083277, + "balance_loss_mlp": 1.01872814, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.5176845436641606, + "language_loss": 0.72967064, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75040889, + "num_input_tokens_seen": 254191890, + "step": 11781, + "time_per_iteration": 2.6412696838378906 + }, + { + "auxiliary_loss_clip": 0.01086301, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.03405869, + "balance_loss_mlp": 1.0199337, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.6206871769515814, + "language_loss": 0.77407622, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79524004, + "num_input_tokens_seen": 254210150, + "step": 11782, + "time_per_iteration": 2.5327725410461426 + }, + { + "auxiliary_loss_clip": 0.01072843, + "auxiliary_loss_mlp": 0.0102415, + "balance_loss_clip": 1.03164744, + "balance_loss_mlp": 1.01389802, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 1.5144199422024087, + "language_loss": 0.69742966, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71839958, + "num_input_tokens_seen": 254233015, + "step": 11783, + "time_per_iteration": 4.12218976020813 + }, + { + "auxiliary_loss_clip": 0.01060454, + "auxiliary_loss_mlp": 0.0103093, + "balance_loss_clip": 1.03377783, + "balance_loss_mlp": 1.01986134, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.6939356176721656, + "language_loss": 0.79011607, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81102991, + "num_input_tokens_seen": 254251345, + "step": 11784, + "time_per_iteration": 2.7402162551879883 + }, + { + "auxiliary_loss_clip": 0.01097435, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.0340637, + "balance_loss_mlp": 1.01816154, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 1.711109630061907, + "language_loss": 0.7728318, + "learning_rate": 8.266676942290609e-07, + "loss": 0.7940954, + "num_input_tokens_seen": 254269905, + "step": 11785, + "time_per_iteration": 2.478766441345215 + }, + { + "auxiliary_loss_clip": 0.01070404, + "auxiliary_loss_mlp": 0.01029412, + "balance_loss_clip": 1.03285897, + "balance_loss_mlp": 1.01798582, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.8254033759900827, + "language_loss": 0.78073144, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80172956, + "num_input_tokens_seen": 254289990, + "step": 11786, + "time_per_iteration": 2.6278417110443115 + }, + { + "auxiliary_loss_clip": 0.01100192, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.03446484, + "balance_loss_mlp": 1.01642263, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.361844237875026, + "language_loss": 0.79282784, + "learning_rate": 8.260369885912526e-07, + "loss": 0.81410497, + "num_input_tokens_seen": 254309085, + "step": 11787, + "time_per_iteration": 2.5763626098632812 + }, + { + "auxiliary_loss_clip": 0.01088902, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.03456426, + "balance_loss_mlp": 1.01683033, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 2.256951824139161, + "language_loss": 0.76772988, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78889394, + "num_input_tokens_seen": 254327045, + "step": 11788, + "time_per_iteration": 2.581967353820801 + }, + { + "auxiliary_loss_clip": 0.01055465, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.02993512, + "balance_loss_mlp": 1.02367473, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 2.0526767716667704, + "language_loss": 0.67690432, + "learning_rate": 8.254064610206212e-07, + "loss": 0.69783711, + "num_input_tokens_seen": 254344585, + "step": 11789, + "time_per_iteration": 2.603168487548828 + }, + { + "auxiliary_loss_clip": 0.01046253, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.0334425, + "balance_loss_mlp": 1.0185864, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 2.403606529739425, + "language_loss": 0.77672553, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79749697, + "num_input_tokens_seen": 254362470, + "step": 11790, + "time_per_iteration": 2.660647392272949 + }, + { + "auxiliary_loss_clip": 0.01078345, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.032902, + "balance_loss_mlp": 1.01690543, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 2.0557201078527605, + "language_loss": 0.7065751, + "learning_rate": 8.247761116128085e-07, + "loss": 0.72764385, + "num_input_tokens_seen": 254383190, + "step": 11791, + "time_per_iteration": 2.6227946281433105 + }, + { + "auxiliary_loss_clip": 0.01088222, + "auxiliary_loss_mlp": 0.01029708, + "balance_loss_clip": 1.03424954, + "balance_loss_mlp": 1.01784086, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 2.251215748367411, + "language_loss": 0.82103276, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84221208, + "num_input_tokens_seen": 254403115, + "step": 11792, + "time_per_iteration": 2.537130117416382 + }, + { + "auxiliary_loss_clip": 0.0105694, + "auxiliary_loss_mlp": 0.0102573, + "balance_loss_clip": 1.03329182, + "balance_loss_mlp": 1.01424968, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 1.823092404769671, + "language_loss": 0.64071345, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66154015, + "num_input_tokens_seen": 254421875, + "step": 11793, + "time_per_iteration": 4.140415668487549 + }, + { + "auxiliary_loss_clip": 0.01081221, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.03282595, + "balance_loss_mlp": 1.01857686, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.2467675311680257, + "language_loss": 0.70751399, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72862852, + "num_input_tokens_seen": 254440765, + "step": 11794, + "time_per_iteration": 2.541116237640381 + }, + { + "auxiliary_loss_clip": 0.01077275, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.03462601, + "balance_loss_mlp": 1.02261174, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.650176559668068, + "language_loss": 0.75765252, + "learning_rate": 8.23515947668052e-07, + "loss": 0.77875841, + "num_input_tokens_seen": 254459480, + "step": 11795, + "time_per_iteration": 2.5429718494415283 + }, + { + "auxiliary_loss_clip": 0.01062624, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.03305745, + "balance_loss_mlp": 1.01916003, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.340805154535586, + "language_loss": 0.74760777, + "learning_rate": 8.232010181829838e-07, + "loss": 0.76853049, + "num_input_tokens_seen": 254473985, + "step": 11796, + "time_per_iteration": 2.584411382675171 + }, + { + "auxiliary_loss_clip": 0.0108728, + "auxiliary_loss_mlp": 0.01038558, + "balance_loss_clip": 1.03479803, + "balance_loss_mlp": 1.02394855, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.6608214345199872, + "language_loss": 0.74140674, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76266515, + "num_input_tokens_seen": 254492135, + "step": 11797, + "time_per_iteration": 4.034473180770874 + }, + { + "auxiliary_loss_clip": 0.01054076, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.03322816, + "balance_loss_mlp": 1.01835728, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.4803099016007433, + "language_loss": 0.79334718, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81417942, + "num_input_tokens_seen": 254512865, + "step": 11798, + "time_per_iteration": 2.7305214405059814 + }, + { + "auxiliary_loss_clip": 0.01060651, + "auxiliary_loss_mlp": 0.01036132, + "balance_loss_clip": 1.0286212, + "balance_loss_mlp": 1.02403224, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.9121852127548675, + "language_loss": 0.66670239, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68767023, + "num_input_tokens_seen": 254532605, + "step": 11799, + "time_per_iteration": 2.553664445877075 + }, + { + "auxiliary_loss_clip": 0.01098962, + "auxiliary_loss_mlp": 0.01028628, + "balance_loss_clip": 1.03409827, + "balance_loss_mlp": 1.016904, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.666649227853805, + "language_loss": 0.8168627, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83813858, + "num_input_tokens_seen": 254553780, + "step": 11800, + "time_per_iteration": 2.5640363693237305 + }, + { + "auxiliary_loss_clip": 0.01072633, + "auxiliary_loss_mlp": 0.01026291, + "balance_loss_clip": 1.03100777, + "balance_loss_mlp": 1.01618218, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 2.013579519839003, + "language_loss": 0.86139357, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88238275, + "num_input_tokens_seen": 254567510, + "step": 11801, + "time_per_iteration": 2.487604856491089 + }, + { + "auxiliary_loss_clip": 0.01098837, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.03395796, + "balance_loss_mlp": 1.02349269, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 2.0580798458900023, + "language_loss": 0.76147729, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78280783, + "num_input_tokens_seen": 254585565, + "step": 11802, + "time_per_iteration": 2.493446111679077 + }, + { + "auxiliary_loss_clip": 0.01081543, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.03336024, + "balance_loss_mlp": 1.02752137, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 2.288613096984773, + "language_loss": 0.81860203, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83980417, + "num_input_tokens_seen": 254603465, + "step": 11803, + "time_per_iteration": 2.5205891132354736 + }, + { + "auxiliary_loss_clip": 0.01100056, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.03331304, + "balance_loss_mlp": 1.01913595, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.26400947529986, + "language_loss": 0.67486995, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69618452, + "num_input_tokens_seen": 254620500, + "step": 11804, + "time_per_iteration": 2.4531233310699463 + }, + { + "auxiliary_loss_clip": 0.01083119, + "auxiliary_loss_mlp": 0.01026509, + "balance_loss_clip": 1.03181672, + "balance_loss_mlp": 1.01638806, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.8089946857786505, + "language_loss": 0.78200769, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80310404, + "num_input_tokens_seen": 254638565, + "step": 11805, + "time_per_iteration": 2.6134181022644043 + }, + { + "auxiliary_loss_clip": 0.01075843, + "auxiliary_loss_mlp": 0.00749474, + "balance_loss_clip": 1.03160548, + "balance_loss_mlp": 1.00026631, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 2.4437614769089664, + "language_loss": 0.78948903, + "learning_rate": 8.200541796403667e-07, + "loss": 0.80774224, + "num_input_tokens_seen": 254657505, + "step": 11806, + "time_per_iteration": 2.6003100872039795 + }, + { + "auxiliary_loss_clip": 0.01067194, + "auxiliary_loss_mlp": 0.01035105, + "balance_loss_clip": 1.03181243, + "balance_loss_mlp": 1.02295721, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 3.680761872714906, + "language_loss": 0.56534159, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58636457, + "num_input_tokens_seen": 254674730, + "step": 11807, + "time_per_iteration": 2.731840133666992 + }, + { + "auxiliary_loss_clip": 0.01100903, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.03254461, + "balance_loss_mlp": 1.02173376, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 1.867766557547277, + "language_loss": 0.68426204, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70560086, + "num_input_tokens_seen": 254691665, + "step": 11808, + "time_per_iteration": 4.017885684967041 + }, + { + "auxiliary_loss_clip": 0.01086084, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.03327262, + "balance_loss_mlp": 1.02006316, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 2.3859371156081854, + "language_loss": 0.7162751, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73744416, + "num_input_tokens_seen": 254711610, + "step": 11809, + "time_per_iteration": 2.594116449356079 + }, + { + "auxiliary_loss_clip": 0.01024728, + "auxiliary_loss_mlp": 0.01000469, + "balance_loss_clip": 1.0041976, + "balance_loss_mlp": 0.99937838, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.748020851546644, + "language_loss": 0.59498262, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61523461, + "num_input_tokens_seen": 254772615, + "step": 11810, + "time_per_iteration": 3.2350172996520996 + }, + { + "auxiliary_loss_clip": 0.01013859, + "auxiliary_loss_mlp": 0.01043875, + "balance_loss_clip": 1.02764714, + "balance_loss_mlp": 1.03172755, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.7574851817491788, + "language_loss": 0.73798525, + "learning_rate": 8.18482437510784e-07, + "loss": 0.75856256, + "num_input_tokens_seen": 254791375, + "step": 11811, + "time_per_iteration": 2.861020088195801 + }, + { + "auxiliary_loss_clip": 0.01057246, + "auxiliary_loss_mlp": 0.01025791, + "balance_loss_clip": 1.03313041, + "balance_loss_mlp": 1.01506805, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.9558184549261433, + "language_loss": 0.83682597, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85765636, + "num_input_tokens_seen": 254809300, + "step": 11812, + "time_per_iteration": 2.6847970485687256 + }, + { + "auxiliary_loss_clip": 0.01101788, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.03585601, + "balance_loss_mlp": 1.01934409, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.5667728009210522, + "language_loss": 0.70039135, + "learning_rate": 8.178540541983716e-07, + "loss": 0.72171783, + "num_input_tokens_seen": 254829325, + "step": 11813, + "time_per_iteration": 2.6037938594818115 + }, + { + "auxiliary_loss_clip": 0.01094487, + "auxiliary_loss_mlp": 0.01023957, + "balance_loss_clip": 1.0315218, + "balance_loss_mlp": 1.01338291, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 2.1247848394603017, + "language_loss": 0.81494254, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83612698, + "num_input_tokens_seen": 254847690, + "step": 11814, + "time_per_iteration": 2.534571886062622 + }, + { + "auxiliary_loss_clip": 0.01098713, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.03492081, + "balance_loss_mlp": 1.01626897, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 1.8499323675453194, + "language_loss": 0.75972283, + "learning_rate": 8.172258501943301e-07, + "loss": 0.78098524, + "num_input_tokens_seen": 254865960, + "step": 11815, + "time_per_iteration": 2.5394976139068604 + }, + { + "auxiliary_loss_clip": 0.01055347, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.03299963, + "balance_loss_mlp": 1.01778054, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.7689442343839827, + "language_loss": 0.7851671, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80600947, + "num_input_tokens_seen": 254882815, + "step": 11816, + "time_per_iteration": 2.5972206592559814 + }, + { + "auxiliary_loss_clip": 0.01072181, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.0334996, + "balance_loss_mlp": 1.02266586, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.6811592722385207, + "language_loss": 0.86775321, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88881218, + "num_input_tokens_seen": 254898705, + "step": 11817, + "time_per_iteration": 2.5146238803863525 + }, + { + "auxiliary_loss_clip": 0.01049555, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.03281403, + "balance_loss_mlp": 1.02090085, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 2.1998335196417913, + "language_loss": 0.84624702, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86705738, + "num_input_tokens_seen": 254913665, + "step": 11818, + "time_per_iteration": 2.6074185371398926 + }, + { + "auxiliary_loss_clip": 0.01097549, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.03211951, + "balance_loss_mlp": 1.01596403, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 3.3688557962377472, + "language_loss": 0.75412613, + "learning_rate": 8.159699804924709e-07, + "loss": 0.77537847, + "num_input_tokens_seen": 254932140, + "step": 11819, + "time_per_iteration": 2.586987018585205 + }, + { + "auxiliary_loss_clip": 0.01053937, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.03154135, + "balance_loss_mlp": 1.01453137, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.6741395714067457, + "language_loss": 0.70891905, + "learning_rate": 8.156561252835883e-07, + "loss": 0.72972542, + "num_input_tokens_seen": 254951580, + "step": 11820, + "time_per_iteration": 2.6946864128112793 + }, + { + "auxiliary_loss_clip": 0.01085579, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.03363943, + "balance_loss_mlp": 1.01616549, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.937320451589911, + "language_loss": 0.74997962, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77110338, + "num_input_tokens_seen": 254969425, + "step": 11821, + "time_per_iteration": 2.588721990585327 + }, + { + "auxiliary_loss_clip": 0.00979599, + "auxiliary_loss_mlp": 0.01007617, + "balance_loss_clip": 1.01182055, + "balance_loss_mlp": 1.0064671, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.770007362522753, + "language_loss": 0.55034733, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57021952, + "num_input_tokens_seen": 255032680, + "step": 11822, + "time_per_iteration": 3.364675521850586 + }, + { + "auxiliary_loss_clip": 0.01081565, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.03247201, + "balance_loss_mlp": 1.01825416, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 2.107328160619185, + "language_loss": 0.60267711, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62378323, + "num_input_tokens_seen": 255054400, + "step": 11823, + "time_per_iteration": 3.066333293914795 + }, + { + "auxiliary_loss_clip": 0.01086368, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.03332567, + "balance_loss_mlp": 1.02140784, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 1.8606865665656496, + "language_loss": 0.71334028, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73452073, + "num_input_tokens_seen": 255072785, + "step": 11824, + "time_per_iteration": 4.07299280166626 + }, + { + "auxiliary_loss_clip": 0.01059444, + "auxiliary_loss_mlp": 0.00749439, + "balance_loss_clip": 1.02870703, + "balance_loss_mlp": 1.00020874, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 2.1954688782756415, + "language_loss": 0.72773093, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74581981, + "num_input_tokens_seen": 255091820, + "step": 11825, + "time_per_iteration": 2.560575485229492 + }, + { + "auxiliary_loss_clip": 0.01067279, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.03240418, + "balance_loss_mlp": 1.01868343, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.7459919118300116, + "language_loss": 0.79375356, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81471926, + "num_input_tokens_seen": 255111720, + "step": 11826, + "time_per_iteration": 2.6358771324157715 + }, + { + "auxiliary_loss_clip": 0.01084463, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.03285491, + "balance_loss_mlp": 1.02060103, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.4854541398803127, + "language_loss": 0.82873404, + "learning_rate": 8.134603969799527e-07, + "loss": 0.84988856, + "num_input_tokens_seen": 255133495, + "step": 11827, + "time_per_iteration": 2.737868309020996 + }, + { + "auxiliary_loss_clip": 0.0106579, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.0331527, + "balance_loss_mlp": 1.01889074, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.4957862715006986, + "language_loss": 0.62025058, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64121044, + "num_input_tokens_seen": 255156880, + "step": 11828, + "time_per_iteration": 2.660114288330078 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.03413475, + "balance_loss_mlp": 1.01967585, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.476506852462178, + "language_loss": 0.72069156, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74197483, + "num_input_tokens_seen": 255178920, + "step": 11829, + "time_per_iteration": 2.56534481048584 + }, + { + "auxiliary_loss_clip": 0.01096474, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.03389406, + "balance_loss_mlp": 1.0180068, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.788295334678601, + "language_loss": 0.80402744, + "learning_rate": 8.125200452317697e-07, + "loss": 0.8252759, + "num_input_tokens_seen": 255198095, + "step": 11830, + "time_per_iteration": 2.5310988426208496 + }, + { + "auxiliary_loss_clip": 0.01082287, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.03187168, + "balance_loss_mlp": 1.02112305, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 2.231825005222072, + "language_loss": 0.84532809, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86646706, + "num_input_tokens_seen": 255215860, + "step": 11831, + "time_per_iteration": 2.5811588764190674 + }, + { + "auxiliary_loss_clip": 0.01074227, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.03204453, + "balance_loss_mlp": 1.01876521, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.027943750194562, + "language_loss": 0.77099657, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79203475, + "num_input_tokens_seen": 255235425, + "step": 11832, + "time_per_iteration": 2.5858917236328125 + }, + { + "auxiliary_loss_clip": 0.01015531, + "auxiliary_loss_mlp": 0.00998216, + "balance_loss_clip": 1.00466633, + "balance_loss_mlp": 0.9972443, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7495612497870556, + "language_loss": 0.56582212, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58595955, + "num_input_tokens_seen": 255291680, + "step": 11833, + "time_per_iteration": 4.510669946670532 + }, + { + "auxiliary_loss_clip": 0.0104878, + "auxiliary_loss_mlp": 0.01035246, + "balance_loss_clip": 1.02959025, + "balance_loss_mlp": 1.02467179, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 2.164127160210594, + "language_loss": 0.71014118, + "learning_rate": 8.11266873367315e-07, + "loss": 0.73098135, + "num_input_tokens_seen": 255313880, + "step": 11834, + "time_per_iteration": 2.723304510116577 + }, + { + "auxiliary_loss_clip": 0.01099649, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03439116, + "balance_loss_mlp": 1.01977539, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 1.8274651519015301, + "language_loss": 0.79561985, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81692833, + "num_input_tokens_seen": 255332390, + "step": 11835, + "time_per_iteration": 2.6148369312286377 + }, + { + "auxiliary_loss_clip": 0.01083494, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.03154242, + "balance_loss_mlp": 1.01771522, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.5343499426854175, + "language_loss": 0.75890839, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78002238, + "num_input_tokens_seen": 255354025, + "step": 11836, + "time_per_iteration": 4.13025426864624 + }, + { + "auxiliary_loss_clip": 0.01025974, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.0312835, + "balance_loss_mlp": 1.01575089, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.6838366406241134, + "language_loss": 0.70192748, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72245431, + "num_input_tokens_seen": 255371400, + "step": 11837, + "time_per_iteration": 2.8607943058013916 + }, + { + "auxiliary_loss_clip": 0.01089906, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.03463471, + "balance_loss_mlp": 1.02113581, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.951351379038921, + "language_loss": 0.6149385, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63616747, + "num_input_tokens_seen": 255390710, + "step": 11838, + "time_per_iteration": 2.762878656387329 + }, + { + "auxiliary_loss_clip": 0.01087587, + "auxiliary_loss_mlp": 0.01028191, + "balance_loss_clip": 1.03461552, + "balance_loss_mlp": 1.01726532, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.9158527659843743, + "language_loss": 0.67446011, + "learning_rate": 8.097014228555426e-07, + "loss": 0.69561791, + "num_input_tokens_seen": 255408790, + "step": 11839, + "time_per_iteration": 2.569208860397339 + }, + { + "auxiliary_loss_clip": 0.01099372, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.03465247, + "balance_loss_mlp": 1.02204728, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 2.1383313178650702, + "language_loss": 0.8415525, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86287522, + "num_input_tokens_seen": 255426280, + "step": 11840, + "time_per_iteration": 2.5288751125335693 + }, + { + "auxiliary_loss_clip": 0.01074113, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.03357339, + "balance_loss_mlp": 1.01832509, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 2.2304575983013573, + "language_loss": 0.76569045, + "learning_rate": 8.090755585214277e-07, + "loss": 0.78672719, + "num_input_tokens_seen": 255442935, + "step": 11841, + "time_per_iteration": 2.579241991043091 + }, + { + "auxiliary_loss_clip": 0.01083616, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.03682923, + "balance_loss_mlp": 1.01858091, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 1.9167844027751526, + "language_loss": 0.74936521, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77049905, + "num_input_tokens_seen": 255460925, + "step": 11842, + "time_per_iteration": 2.571012020111084 + }, + { + "auxiliary_loss_clip": 0.0101945, + "auxiliary_loss_mlp": 0.01003277, + "balance_loss_clip": 1.01399112, + "balance_loss_mlp": 1.00227571, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.787587622933137, + "language_loss": 0.61615145, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63637871, + "num_input_tokens_seen": 255521360, + "step": 11843, + "time_per_iteration": 3.1166062355041504 + }, + { + "auxiliary_loss_clip": 0.01095165, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.03266549, + "balance_loss_mlp": 1.01898718, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.7158837605586776, + "language_loss": 0.80101776, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82226318, + "num_input_tokens_seen": 255541435, + "step": 11844, + "time_per_iteration": 2.6049602031707764 + }, + { + "auxiliary_loss_clip": 0.01038906, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.02707696, + "balance_loss_mlp": 1.02180862, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.607590679574073, + "language_loss": 0.78989565, + "learning_rate": 8.078243718677873e-07, + "loss": 0.81062192, + "num_input_tokens_seen": 255558505, + "step": 11845, + "time_per_iteration": 2.6370606422424316 + }, + { + "auxiliary_loss_clip": 0.01078311, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.03317952, + "balance_loss_mlp": 1.02110982, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 1.999354466449443, + "language_loss": 0.77510643, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79621446, + "num_input_tokens_seen": 255577815, + "step": 11846, + "time_per_iteration": 2.590942144393921 + }, + { + "auxiliary_loss_clip": 0.01085109, + "auxiliary_loss_mlp": 0.01032939, + "balance_loss_clip": 1.03281462, + "balance_loss_mlp": 1.02188802, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.9364594434815932, + "language_loss": 0.59273338, + "learning_rate": 8.071990497380421e-07, + "loss": 0.61391389, + "num_input_tokens_seen": 255595885, + "step": 11847, + "time_per_iteration": 2.5644948482513428 + }, + { + "auxiliary_loss_clip": 0.01082598, + "auxiliary_loss_mlp": 0.00749111, + "balance_loss_clip": 1.03262258, + "balance_loss_mlp": 1.00022411, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.732426894404334, + "language_loss": 0.7136308, + "learning_rate": 8.068864565139395e-07, + "loss": 0.7319479, + "num_input_tokens_seen": 255616750, + "step": 11848, + "time_per_iteration": 4.141420125961304 + }, + { + "auxiliary_loss_clip": 0.01014979, + "auxiliary_loss_mlp": 0.01007388, + "balance_loss_clip": 1.00426435, + "balance_loss_mlp": 1.00639212, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8090801544049517, + "language_loss": 0.63002229, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65024602, + "num_input_tokens_seen": 255677900, + "step": 11849, + "time_per_iteration": 3.096078634262085 + }, + { + "auxiliary_loss_clip": 0.01069518, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.03128314, + "balance_loss_mlp": 1.0223887, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.5240733787409386, + "language_loss": 0.64071536, + "learning_rate": 8.0626140580654e-07, + "loss": 0.66174543, + "num_input_tokens_seen": 255699140, + "step": 11850, + "time_per_iteration": 2.781045436859131 + }, + { + "auxiliary_loss_clip": 0.01086882, + "auxiliary_loss_mlp": 0.0103065, + "balance_loss_clip": 1.03239274, + "balance_loss_mlp": 1.01993942, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.9337190962965694, + "language_loss": 0.70143282, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72260815, + "num_input_tokens_seen": 255719640, + "step": 11851, + "time_per_iteration": 2.6205344200134277 + }, + { + "auxiliary_loss_clip": 0.01089489, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.03604531, + "balance_loss_mlp": 1.02175188, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.7662962820961248, + "language_loss": 0.83168161, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85289288, + "num_input_tokens_seen": 255740450, + "step": 11852, + "time_per_iteration": 2.6493093967437744 + }, + { + "auxiliary_loss_clip": 0.01084275, + "auxiliary_loss_mlp": 0.00749528, + "balance_loss_clip": 1.03181076, + "balance_loss_mlp": 1.00026941, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.872476035244644, + "language_loss": 0.73095262, + "learning_rate": 8.053241692752126e-07, + "loss": 0.74929065, + "num_input_tokens_seen": 255758070, + "step": 11853, + "time_per_iteration": 2.7184627056121826 + }, + { + "auxiliary_loss_clip": 0.01054159, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.02961159, + "balance_loss_mlp": 1.02259731, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 1.9491951081630388, + "language_loss": 0.92204195, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94291735, + "num_input_tokens_seen": 255775685, + "step": 11854, + "time_per_iteration": 2.620934009552002 + }, + { + "auxiliary_loss_clip": 0.01086807, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.03445733, + "balance_loss_mlp": 1.01972878, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.9950905444444371, + "language_loss": 0.79678369, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81795257, + "num_input_tokens_seen": 255794750, + "step": 11855, + "time_per_iteration": 2.5876033306121826 + }, + { + "auxiliary_loss_clip": 0.0104562, + "auxiliary_loss_mlp": 0.01035493, + "balance_loss_clip": 1.02930629, + "balance_loss_mlp": 1.02315474, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 2.065277894516427, + "language_loss": 0.73120505, + "learning_rate": 8.043873404639192e-07, + "loss": 0.75201613, + "num_input_tokens_seen": 255813325, + "step": 11856, + "time_per_iteration": 2.6689107418060303 + }, + { + "auxiliary_loss_clip": 0.01090435, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.03573799, + "balance_loss_mlp": 1.01671612, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.5768585590726762, + "language_loss": 0.69446433, + "learning_rate": 8.040751548532046e-07, + "loss": 0.71564692, + "num_input_tokens_seen": 255832470, + "step": 11857, + "time_per_iteration": 2.6201446056365967 + }, + { + "auxiliary_loss_clip": 0.01083839, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.03234613, + "balance_loss_mlp": 1.01805639, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.2105040101394167, + "language_loss": 0.84534466, + "learning_rate": 8.03763014592081e-07, + "loss": 0.86647272, + "num_input_tokens_seen": 255849740, + "step": 11858, + "time_per_iteration": 2.5665218830108643 + }, + { + "auxiliary_loss_clip": 0.01104222, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.03649175, + "balance_loss_mlp": 1.01610827, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 13.423728993266154, + "language_loss": 0.80282634, + "learning_rate": 8.034509196923829e-07, + "loss": 0.8241421, + "num_input_tokens_seen": 255866975, + "step": 11859, + "time_per_iteration": 2.602879524230957 + }, + { + "auxiliary_loss_clip": 0.01074549, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.03262329, + "balance_loss_mlp": 1.02144933, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 2.206355173336151, + "language_loss": 0.68902636, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71009547, + "num_input_tokens_seen": 255892915, + "step": 11860, + "time_per_iteration": 3.0203115940093994 + }, + { + "auxiliary_loss_clip": 0.01086944, + "auxiliary_loss_mlp": 0.01028574, + "balance_loss_clip": 1.03283811, + "balance_loss_mlp": 1.01661086, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.9854392933485963, + "language_loss": 0.64451414, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66566932, + "num_input_tokens_seen": 255911480, + "step": 11861, + "time_per_iteration": 2.5431101322174072 + }, + { + "auxiliary_loss_clip": 0.01079473, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.03334987, + "balance_loss_mlp": 1.0178566, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.6295287535007708, + "language_loss": 0.67152715, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69261712, + "num_input_tokens_seen": 255931140, + "step": 11862, + "time_per_iteration": 4.272973537445068 + }, + { + "auxiliary_loss_clip": 0.01067873, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.03139639, + "balance_loss_mlp": 1.02525091, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 1.9776074820752076, + "language_loss": 0.67078686, + "learning_rate": 8.022029939445214e-07, + "loss": 0.69182044, + "num_input_tokens_seen": 255951665, + "step": 11863, + "time_per_iteration": 2.693143844604492 + }, + { + "auxiliary_loss_clip": 0.01055484, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.03395569, + "balance_loss_mlp": 1.02736807, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 2.1156564495650407, + "language_loss": 0.65681428, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67777622, + "num_input_tokens_seen": 255970055, + "step": 11864, + "time_per_iteration": 2.7551615238189697 + }, + { + "auxiliary_loss_clip": 0.01087915, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.03332782, + "balance_loss_mlp": 1.02105594, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 3.70218251891051, + "language_loss": 0.86026525, + "learning_rate": 8.015793035467697e-07, + "loss": 0.8814683, + "num_input_tokens_seen": 255987720, + "step": 11865, + "time_per_iteration": 2.5928359031677246 + }, + { + "auxiliary_loss_clip": 0.01052149, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.02799225, + "balance_loss_mlp": 1.01699185, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 1.8216646482133356, + "language_loss": 0.74773949, + "learning_rate": 8.012675265083304e-07, + "loss": 0.76855612, + "num_input_tokens_seen": 256005490, + "step": 11866, + "time_per_iteration": 2.6351747512817383 + }, + { + "auxiliary_loss_clip": 0.01057815, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.03149533, + "balance_loss_mlp": 1.02164841, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 3.539358278104304, + "language_loss": 0.70518774, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72610474, + "num_input_tokens_seen": 256026030, + "step": 11867, + "time_per_iteration": 2.6665725708007812 + }, + { + "auxiliary_loss_clip": 0.01086406, + "auxiliary_loss_mlp": 0.01027288, + "balance_loss_clip": 1.03429341, + "balance_loss_mlp": 1.01708937, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 2.90543797111461, + "language_loss": 0.71699798, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73813492, + "num_input_tokens_seen": 256043680, + "step": 11868, + "time_per_iteration": 2.64554762840271 + }, + { + "auxiliary_loss_clip": 0.0104926, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.03269887, + "balance_loss_mlp": 1.01803708, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.667367919869266, + "language_loss": 0.65927386, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68008405, + "num_input_tokens_seen": 256059705, + "step": 11869, + "time_per_iteration": 2.651637077331543 + }, + { + "auxiliary_loss_clip": 0.01068469, + "auxiliary_loss_mlp": 0.0102537, + "balance_loss_clip": 1.02910781, + "balance_loss_mlp": 1.01432538, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5029769620449571, + "language_loss": 0.77729034, + "learning_rate": 8.000208730333298e-07, + "loss": 0.79822874, + "num_input_tokens_seen": 256079785, + "step": 11870, + "time_per_iteration": 2.6453301906585693 + }, + { + "auxiliary_loss_clip": 0.01045076, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.03153551, + "balance_loss_mlp": 1.01806569, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.9146546292873408, + "language_loss": 0.80903077, + "learning_rate": 7.997093233933597e-07, + "loss": 0.82978034, + "num_input_tokens_seen": 256099000, + "step": 11871, + "time_per_iteration": 2.6899216175079346 + }, + { + "auxiliary_loss_clip": 0.01065791, + "auxiliary_loss_mlp": 0.01036849, + "balance_loss_clip": 1.03128958, + "balance_loss_mlp": 1.0252316, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.640460811250977, + "language_loss": 0.79154444, + "learning_rate": 7.993978192685331e-07, + "loss": 0.81257081, + "num_input_tokens_seen": 256117985, + "step": 11872, + "time_per_iteration": 2.6129825115203857 + }, + { + "auxiliary_loss_clip": 0.01089255, + "auxiliary_loss_mlp": 0.01025324, + "balance_loss_clip": 1.03395534, + "balance_loss_mlp": 1.01393938, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.390860248130159, + "language_loss": 0.83755994, + "learning_rate": 7.990863606706606e-07, + "loss": 0.85870576, + "num_input_tokens_seen": 256134350, + "step": 11873, + "time_per_iteration": 4.202666997909546 + }, + { + "auxiliary_loss_clip": 0.01056704, + "auxiliary_loss_mlp": 0.01028539, + "balance_loss_clip": 1.02888536, + "balance_loss_mlp": 1.01840019, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 1.9311290068669054, + "language_loss": 0.85606098, + "learning_rate": 7.987749476115539e-07, + "loss": 0.87691337, + "num_input_tokens_seen": 256150610, + "step": 11874, + "time_per_iteration": 2.560797691345215 + }, + { + "auxiliary_loss_clip": 0.01088705, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.033319, + "balance_loss_mlp": 1.01902616, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 1.9312717858512274, + "language_loss": 0.83321881, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85440683, + "num_input_tokens_seen": 256168620, + "step": 11875, + "time_per_iteration": 2.5566468238830566 + }, + { + "auxiliary_loss_clip": 0.01075153, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.03238511, + "balance_loss_mlp": 1.02102041, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.7835306952432368, + "language_loss": 0.69530642, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71641147, + "num_input_tokens_seen": 256186700, + "step": 11876, + "time_per_iteration": 4.126229763031006 + }, + { + "auxiliary_loss_clip": 0.01101199, + "auxiliary_loss_mlp": 0.01030405, + "balance_loss_clip": 1.03515029, + "balance_loss_mlp": 1.01907456, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 2.1706468186629473, + "language_loss": 0.77308679, + "learning_rate": 7.978409817849079e-07, + "loss": 0.79440278, + "num_input_tokens_seen": 256205390, + "step": 11877, + "time_per_iteration": 2.501309394836426 + }, + { + "auxiliary_loss_clip": 0.01086749, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.03388906, + "balance_loss_mlp": 1.02198815, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 1.8159734234966738, + "language_loss": 0.6999132, + "learning_rate": 7.97529750998934e-07, + "loss": 0.7211076, + "num_input_tokens_seen": 256224575, + "step": 11878, + "time_per_iteration": 2.5929317474365234 + }, + { + "auxiliary_loss_clip": 0.01060873, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.0216316, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 2.87835426111677, + "language_loss": 0.67498815, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69591373, + "num_input_tokens_seen": 256242130, + "step": 11879, + "time_per_iteration": 2.658238172531128 + }, + { + "auxiliary_loss_clip": 0.01038057, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.03134859, + "balance_loss_mlp": 1.02324104, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 2.394923478910248, + "language_loss": 0.69176483, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71251166, + "num_input_tokens_seen": 256261920, + "step": 11880, + "time_per_iteration": 2.6906659603118896 + }, + { + "auxiliary_loss_clip": 0.01068177, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.0304569, + "balance_loss_mlp": 1.02266479, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.746099157770999, + "language_loss": 0.80633569, + "learning_rate": 7.965963322749674e-07, + "loss": 0.8273626, + "num_input_tokens_seen": 256277970, + "step": 11881, + "time_per_iteration": 2.6268937587738037 + }, + { + "auxiliary_loss_clip": 0.01055654, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.02942383, + "balance_loss_mlp": 1.01699996, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.6638680807243988, + "language_loss": 0.63434595, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65517306, + "num_input_tokens_seen": 256298205, + "step": 11882, + "time_per_iteration": 2.713308811187744 + }, + { + "auxiliary_loss_clip": 0.01101454, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.03562152, + "balance_loss_mlp": 1.01796055, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.8095827518514522, + "language_loss": 0.68700266, + "learning_rate": 7.959742812719304e-07, + "loss": 0.70830542, + "num_input_tokens_seen": 256316685, + "step": 11883, + "time_per_iteration": 2.538186550140381 + }, + { + "auxiliary_loss_clip": 0.01085163, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.03232026, + "balance_loss_mlp": 1.02562022, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.9599490428017625, + "language_loss": 0.77829778, + "learning_rate": 7.956633242496788e-07, + "loss": 0.79952216, + "num_input_tokens_seen": 256334205, + "step": 11884, + "time_per_iteration": 2.6150431632995605 + }, + { + "auxiliary_loss_clip": 0.01092419, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.03271461, + "balance_loss_mlp": 1.01658201, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.41524363790211, + "language_loss": 0.74104118, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76225877, + "num_input_tokens_seen": 256353340, + "step": 11885, + "time_per_iteration": 2.5949044227600098 + }, + { + "auxiliary_loss_clip": 0.01005344, + "auxiliary_loss_mlp": 0.00999441, + "balance_loss_clip": 1.00478482, + "balance_loss_mlp": 0.99839842, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8836268823113885, + "language_loss": 0.66309965, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68314755, + "num_input_tokens_seen": 256411550, + "step": 11886, + "time_per_iteration": 3.1358866691589355 + }, + { + "auxiliary_loss_clip": 0.01056719, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.0332725, + "balance_loss_mlp": 1.01440001, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.6414756302858462, + "language_loss": 0.7494368, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77026063, + "num_input_tokens_seen": 256430360, + "step": 11887, + "time_per_iteration": 2.654843330383301 + }, + { + "auxiliary_loss_clip": 0.01086753, + "auxiliary_loss_mlp": 0.0102672, + "balance_loss_clip": 1.03312147, + "balance_loss_mlp": 1.01608062, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.5415889028307477, + "language_loss": 0.71748376, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73861849, + "num_input_tokens_seen": 256449750, + "step": 11888, + "time_per_iteration": 2.54899525642395 + }, + { + "auxiliary_loss_clip": 0.01080213, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.03044176, + "balance_loss_mlp": 1.02014065, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 2.179401413144181, + "language_loss": 0.84834945, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86946994, + "num_input_tokens_seen": 256467330, + "step": 11889, + "time_per_iteration": 4.160475730895996 + }, + { + "auxiliary_loss_clip": 0.01049687, + "auxiliary_loss_mlp": 0.01027328, + "balance_loss_clip": 1.03260434, + "balance_loss_mlp": 1.01650953, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 2.078630075275788, + "language_loss": 0.75929987, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78007001, + "num_input_tokens_seen": 256485705, + "step": 11890, + "time_per_iteration": 2.6906542778015137 + }, + { + "auxiliary_loss_clip": 0.01051243, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.02897966, + "balance_loss_mlp": 1.02008915, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.794941756991225, + "language_loss": 0.73949552, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76031578, + "num_input_tokens_seen": 256504755, + "step": 11891, + "time_per_iteration": 2.7221031188964844 + }, + { + "auxiliary_loss_clip": 0.01054375, + "auxiliary_loss_mlp": 0.01034972, + "balance_loss_clip": 1.03243184, + "balance_loss_mlp": 1.02349842, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 1.7276729480053983, + "language_loss": 0.67986548, + "learning_rate": 7.931773131302211e-07, + "loss": 0.70075899, + "num_input_tokens_seen": 256523670, + "step": 11892, + "time_per_iteration": 2.6892192363739014 + }, + { + "auxiliary_loss_clip": 0.01063575, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.0336566, + "balance_loss_mlp": 1.01883101, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 1.8477771445077924, + "language_loss": 0.74062866, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76157761, + "num_input_tokens_seen": 256542225, + "step": 11893, + "time_per_iteration": 2.6559009552001953 + }, + { + "auxiliary_loss_clip": 0.01101796, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.03495657, + "balance_loss_mlp": 1.0182097, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.288370755869107, + "language_loss": 0.66232306, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68364084, + "num_input_tokens_seen": 256560730, + "step": 11894, + "time_per_iteration": 2.556020498275757 + }, + { + "auxiliary_loss_clip": 0.01065182, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.0350101, + "balance_loss_mlp": 1.02001143, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.6015107077111264, + "language_loss": 0.77678388, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79774511, + "num_input_tokens_seen": 256580505, + "step": 11895, + "time_per_iteration": 2.704890489578247 + }, + { + "auxiliary_loss_clip": 0.01089975, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.03421903, + "balance_loss_mlp": 1.01839542, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 3.0791586319225677, + "language_loss": 0.69675416, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71795827, + "num_input_tokens_seen": 256597330, + "step": 11896, + "time_per_iteration": 2.5877041816711426 + }, + { + "auxiliary_loss_clip": 0.01078079, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.03239441, + "balance_loss_mlp": 1.02738726, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 4.832062257172254, + "language_loss": 0.86536968, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88654667, + "num_input_tokens_seen": 256616030, + "step": 11897, + "time_per_iteration": 2.6647119522094727 + }, + { + "auxiliary_loss_clip": 0.0107888, + "auxiliary_loss_mlp": 0.01031352, + "balance_loss_clip": 1.0349493, + "balance_loss_mlp": 1.0205754, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 1.939129394351589, + "language_loss": 0.78177536, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80287766, + "num_input_tokens_seen": 256635570, + "step": 11898, + "time_per_iteration": 2.6482338905334473 + }, + { + "auxiliary_loss_clip": 0.01073102, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.0318867, + "balance_loss_mlp": 1.01775217, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.7047281175883902, + "language_loss": 0.73060775, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75164151, + "num_input_tokens_seen": 256655290, + "step": 11899, + "time_per_iteration": 2.641575574874878 + }, + { + "auxiliary_loss_clip": 0.01087117, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.03267622, + "balance_loss_mlp": 1.02188849, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 3.1962400177561805, + "language_loss": 0.76066363, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78187096, + "num_input_tokens_seen": 256671605, + "step": 11900, + "time_per_iteration": 2.652125597000122 + }, + { + "auxiliary_loss_clip": 0.01090069, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.03520346, + "balance_loss_mlp": 1.01832128, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 2.6164994338303957, + "language_loss": 0.80842263, + "learning_rate": 7.903840517773886e-07, + "loss": 0.82961613, + "num_input_tokens_seen": 256689680, + "step": 11901, + "time_per_iteration": 2.537036657333374 + }, + { + "auxiliary_loss_clip": 0.01066275, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.03324974, + "balance_loss_mlp": 1.02060413, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 2.0742166406335705, + "language_loss": 0.81135392, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83234656, + "num_input_tokens_seen": 256707760, + "step": 11902, + "time_per_iteration": 4.103976011276245 + }, + { + "auxiliary_loss_clip": 0.01054168, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.03097272, + "balance_loss_mlp": 1.0170548, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 1.6299617143469272, + "language_loss": 0.6773743, + "learning_rate": 7.897638312866785e-07, + "loss": 0.69820136, + "num_input_tokens_seen": 256724150, + "step": 11903, + "time_per_iteration": 2.5892977714538574 + }, + { + "auxiliary_loss_clip": 0.01055502, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.02862155, + "balance_loss_mlp": 1.01913333, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.5733271417599106, + "language_loss": 0.75996464, + "learning_rate": 7.894537898738589e-07, + "loss": 0.7808221, + "num_input_tokens_seen": 256742780, + "step": 11904, + "time_per_iteration": 2.649951934814453 + }, + { + "auxiliary_loss_clip": 0.01079636, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.03485262, + "balance_loss_mlp": 1.02181911, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 2.2488390147964727, + "language_loss": 0.72190046, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74303752, + "num_input_tokens_seen": 256761355, + "step": 11905, + "time_per_iteration": 2.617784261703491 + }, + { + "auxiliary_loss_clip": 0.01063828, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.0327251, + "balance_loss_mlp": 1.01676702, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.5368001309457857, + "language_loss": 0.77765739, + "learning_rate": 7.88833844772076e-07, + "loss": 0.79857129, + "num_input_tokens_seen": 256781335, + "step": 11906, + "time_per_iteration": 2.72560453414917 + }, + { + "auxiliary_loss_clip": 0.01004232, + "auxiliary_loss_mlp": 0.01006604, + "balance_loss_clip": 1.00420928, + "balance_loss_mlp": 1.0055846, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7422890897723833, + "language_loss": 0.55341512, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57352346, + "num_input_tokens_seen": 256838890, + "step": 11907, + "time_per_iteration": 3.0913662910461426 + }, + { + "auxiliary_loss_clip": 0.01080346, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.03193426, + "balance_loss_mlp": 1.02153039, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.9084131419181063, + "language_loss": 0.69282424, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71396023, + "num_input_tokens_seen": 256858145, + "step": 11908, + "time_per_iteration": 2.606835126876831 + }, + { + "auxiliary_loss_clip": 0.01040424, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.02790022, + "balance_loss_mlp": 1.01911008, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.860246069002492, + "language_loss": 0.71280825, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73353213, + "num_input_tokens_seen": 256878545, + "step": 11909, + "time_per_iteration": 2.683122158050537 + }, + { + "auxiliary_loss_clip": 0.01087355, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.03495669, + "balance_loss_mlp": 1.01732242, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.5926753983564559, + "language_loss": 0.75028408, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77144301, + "num_input_tokens_seen": 256899920, + "step": 11910, + "time_per_iteration": 2.625586748123169 + }, + { + "auxiliary_loss_clip": 0.01074364, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.03375924, + "balance_loss_mlp": 1.02151823, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.5477366168676139, + "language_loss": 0.76664656, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78770638, + "num_input_tokens_seen": 256918460, + "step": 11911, + "time_per_iteration": 2.655458927154541 + }, + { + "auxiliary_loss_clip": 0.01053128, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03199744, + "balance_loss_mlp": 1.02040482, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.636187607087067, + "language_loss": 0.58633143, + "learning_rate": 7.869751121037192e-07, + "loss": 0.607198, + "num_input_tokens_seen": 256942015, + "step": 11912, + "time_per_iteration": 4.544330835342407 + }, + { + "auxiliary_loss_clip": 0.01089744, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.03620601, + "balance_loss_mlp": 1.02102757, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 1.6193227449611383, + "language_loss": 0.77951646, + "learning_rate": 7.866654842502376e-07, + "loss": 0.80074084, + "num_input_tokens_seen": 256961065, + "step": 11913, + "time_per_iteration": 2.602853775024414 + }, + { + "auxiliary_loss_clip": 0.01075155, + "auxiliary_loss_mlp": 0.01025897, + "balance_loss_clip": 1.03248048, + "balance_loss_mlp": 1.0157342, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.799889933089531, + "language_loss": 0.74288088, + "learning_rate": 7.863559024065234e-07, + "loss": 0.7638914, + "num_input_tokens_seen": 256982165, + "step": 11914, + "time_per_iteration": 2.638140916824341 + }, + { + "auxiliary_loss_clip": 0.01061011, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.03107738, + "balance_loss_mlp": 1.01934671, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.7021650021823984, + "language_loss": 0.7378335, + "learning_rate": 7.860463665843143e-07, + "loss": 0.75874627, + "num_input_tokens_seen": 256999825, + "step": 11915, + "time_per_iteration": 2.655747890472412 + }, + { + "auxiliary_loss_clip": 0.01098987, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.03334105, + "balance_loss_mlp": 1.01802945, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.820355243788823, + "language_loss": 0.81195682, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83323491, + "num_input_tokens_seen": 257017450, + "step": 11916, + "time_per_iteration": 2.539072275161743 + }, + { + "auxiliary_loss_clip": 0.01023768, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.03239918, + "balance_loss_mlp": 1.02052832, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 2.083732018248425, + "language_loss": 0.68559802, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70614994, + "num_input_tokens_seen": 257035465, + "step": 11917, + "time_per_iteration": 4.284849405288696 + }, + { + "auxiliary_loss_clip": 0.01073311, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.03153229, + "balance_loss_mlp": 1.01825511, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 2.4994451589796594, + "language_loss": 0.76178509, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78281903, + "num_input_tokens_seen": 257053750, + "step": 11918, + "time_per_iteration": 2.6388590335845947 + }, + { + "auxiliary_loss_clip": 0.01006608, + "auxiliary_loss_mlp": 0.01008891, + "balance_loss_clip": 1.00588751, + "balance_loss_mlp": 1.00779438, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6297473473797192, + "language_loss": 0.53875822, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55891323, + "num_input_tokens_seen": 257121215, + "step": 11919, + "time_per_iteration": 3.231065034866333 + }, + { + "auxiliary_loss_clip": 0.01079435, + "auxiliary_loss_mlp": 0.01026006, + "balance_loss_clip": 1.03547418, + "balance_loss_mlp": 1.01526546, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 2.0307727823670976, + "language_loss": 0.69273996, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71379435, + "num_input_tokens_seen": 257143370, + "step": 11920, + "time_per_iteration": 2.7106804847717285 + }, + { + "auxiliary_loss_clip": 0.01078106, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.03205919, + "balance_loss_mlp": 1.02057052, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 1.833141270775189, + "language_loss": 0.7493251, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77042663, + "num_input_tokens_seen": 257162160, + "step": 11921, + "time_per_iteration": 2.7227189540863037 + }, + { + "auxiliary_loss_clip": 0.01065511, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.03411698, + "balance_loss_mlp": 1.01842594, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 2.167578797033097, + "language_loss": 0.75870478, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77967834, + "num_input_tokens_seen": 257179300, + "step": 11922, + "time_per_iteration": 2.640188694000244 + }, + { + "auxiliary_loss_clip": 0.01006137, + "auxiliary_loss_mlp": 0.01011391, + "balance_loss_clip": 1.00468731, + "balance_loss_mlp": 1.01036537, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7530650097098192, + "language_loss": 0.55159616, + "learning_rate": 7.83571738189001e-07, + "loss": 0.5717715, + "num_input_tokens_seen": 257235470, + "step": 11923, + "time_per_iteration": 2.995072603225708 + }, + { + "auxiliary_loss_clip": 0.01066046, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.03119802, + "balance_loss_mlp": 1.01923633, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.5150771794402502, + "language_loss": 0.77195024, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79291815, + "num_input_tokens_seen": 257255850, + "step": 11924, + "time_per_iteration": 2.6901490688323975 + }, + { + "auxiliary_loss_clip": 0.01064937, + "auxiliary_loss_mlp": 0.01026135, + "balance_loss_clip": 1.03254247, + "balance_loss_mlp": 1.01616907, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.8584447909413522, + "language_loss": 0.68336201, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70427275, + "num_input_tokens_seen": 257275425, + "step": 11925, + "time_per_iteration": 2.656834125518799 + }, + { + "auxiliary_loss_clip": 0.01067847, + "auxiliary_loss_mlp": 0.01026075, + "balance_loss_clip": 1.03278637, + "balance_loss_mlp": 1.01598978, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.6011299594893302, + "language_loss": 0.77444994, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79538912, + "num_input_tokens_seen": 257295740, + "step": 11926, + "time_per_iteration": 2.690606117248535 + }, + { + "auxiliary_loss_clip": 0.0108831, + "auxiliary_loss_mlp": 0.00749585, + "balance_loss_clip": 1.033108, + "balance_loss_mlp": 1.00029659, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 2.1033978527057515, + "language_loss": 0.77381837, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79219735, + "num_input_tokens_seen": 257315970, + "step": 11927, + "time_per_iteration": 2.6110594272613525 + }, + { + "auxiliary_loss_clip": 0.0108878, + "auxiliary_loss_mlp": 0.01025942, + "balance_loss_clip": 1.0354116, + "balance_loss_mlp": 1.01415825, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.6752845444994695, + "language_loss": 0.689623, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71077019, + "num_input_tokens_seen": 257334230, + "step": 11928, + "time_per_iteration": 2.618112802505493 + }, + { + "auxiliary_loss_clip": 0.01046465, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.03135705, + "balance_loss_mlp": 1.0159936, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.7474467216905691, + "language_loss": 0.65118009, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67191422, + "num_input_tokens_seen": 257352145, + "step": 11929, + "time_per_iteration": 4.2424046993255615 + }, + { + "auxiliary_loss_clip": 0.0107119, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.03131223, + "balance_loss_mlp": 1.01872802, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 1.938574027559075, + "language_loss": 0.6939559, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71497262, + "num_input_tokens_seen": 257371460, + "step": 11930, + "time_per_iteration": 2.654770851135254 + }, + { + "auxiliary_loss_clip": 0.01060413, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.02842128, + "balance_loss_mlp": 1.01523089, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.7004156948147868, + "language_loss": 0.80621237, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82708037, + "num_input_tokens_seen": 257390800, + "step": 11931, + "time_per_iteration": 2.6835625171661377 + }, + { + "auxiliary_loss_clip": 0.01087843, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.03406155, + "balance_loss_mlp": 1.02185965, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.1733525129294744, + "language_loss": 0.78395563, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80515474, + "num_input_tokens_seen": 257407495, + "step": 11932, + "time_per_iteration": 2.5752463340759277 + }, + { + "auxiliary_loss_clip": 0.01083744, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.03369164, + "balance_loss_mlp": 1.01603913, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.7357140090012237, + "language_loss": 0.75028563, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77139336, + "num_input_tokens_seen": 257429675, + "step": 11933, + "time_per_iteration": 2.668602705001831 + }, + { + "auxiliary_loss_clip": 0.01105755, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.03648853, + "balance_loss_mlp": 1.02075279, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 3.064039932185186, + "language_loss": 0.69639701, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71779895, + "num_input_tokens_seen": 257442765, + "step": 11934, + "time_per_iteration": 2.4989869594573975 + }, + { + "auxiliary_loss_clip": 0.01088206, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.0327934, + "balance_loss_mlp": 1.01882112, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.576764603833295, + "language_loss": 0.86167049, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88285315, + "num_input_tokens_seen": 257459310, + "step": 11935, + "time_per_iteration": 2.6281168460845947 + }, + { + "auxiliary_loss_clip": 0.01051096, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.02877784, + "balance_loss_mlp": 1.01702642, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.639430077956734, + "language_loss": 0.73778355, + "learning_rate": 7.795567660576388e-07, + "loss": 0.7585777, + "num_input_tokens_seen": 257484750, + "step": 11936, + "time_per_iteration": 2.7505030632019043 + }, + { + "auxiliary_loss_clip": 0.01024574, + "auxiliary_loss_mlp": 0.01003414, + "balance_loss_clip": 1.00395882, + "balance_loss_mlp": 1.00235355, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.757400064542863, + "language_loss": 0.55873668, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57901657, + "num_input_tokens_seen": 257543110, + "step": 11937, + "time_per_iteration": 3.077207088470459 + }, + { + "auxiliary_loss_clip": 0.010901, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.03441262, + "balance_loss_mlp": 1.02177763, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.8017620179819196, + "language_loss": 0.54679573, + "learning_rate": 7.789397715835542e-07, + "loss": 0.5680384, + "num_input_tokens_seen": 257567410, + "step": 11938, + "time_per_iteration": 2.6494174003601074 + }, + { + "auxiliary_loss_clip": 0.01085099, + "auxiliary_loss_mlp": 0.01024665, + "balance_loss_clip": 1.03271508, + "balance_loss_mlp": 1.01421666, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.6999717445700975, + "language_loss": 0.76583475, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78693241, + "num_input_tokens_seen": 257586270, + "step": 11939, + "time_per_iteration": 2.5597782135009766 + }, + { + "auxiliary_loss_clip": 0.01007994, + "auxiliary_loss_mlp": 0.00998846, + "balance_loss_clip": 1.0071528, + "balance_loss_mlp": 0.99785024, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7557060156797207, + "language_loss": 0.61465019, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63471854, + "num_input_tokens_seen": 257647415, + "step": 11940, + "time_per_iteration": 3.147723436355591 + }, + { + "auxiliary_loss_clip": 0.01059371, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.03061056, + "balance_loss_mlp": 1.01994896, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.501461109238095, + "language_loss": 0.5903427, + "learning_rate": 7.780146271721097e-07, + "loss": 0.61124396, + "num_input_tokens_seen": 257669795, + "step": 11941, + "time_per_iteration": 2.705648422241211 + }, + { + "auxiliary_loss_clip": 0.0107728, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03488421, + "balance_loss_mlp": 1.01779115, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 2.1186066677325925, + "language_loss": 0.79073334, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81179684, + "num_input_tokens_seen": 257687415, + "step": 11942, + "time_per_iteration": 2.6587209701538086 + }, + { + "auxiliary_loss_clip": 0.01088453, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.03393722, + "balance_loss_mlp": 1.02472961, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 1.7198145140172176, + "language_loss": 0.66171539, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68296009, + "num_input_tokens_seen": 257706215, + "step": 11943, + "time_per_iteration": 4.062662601470947 + }, + { + "auxiliary_loss_clip": 0.01097885, + "auxiliary_loss_mlp": 0.01026262, + "balance_loss_clip": 1.03489923, + "balance_loss_mlp": 1.01510358, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 4.158781247289571, + "language_loss": 0.78618181, + "learning_rate": 7.770898998009254e-07, + "loss": 0.80742323, + "num_input_tokens_seen": 257724740, + "step": 11944, + "time_per_iteration": 2.542294979095459 + }, + { + "auxiliary_loss_clip": 0.01070964, + "auxiliary_loss_mlp": 0.00749498, + "balance_loss_clip": 1.03223372, + "balance_loss_mlp": 1.0002085, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.100994361164855, + "language_loss": 0.63144451, + "learning_rate": 7.767817500740277e-07, + "loss": 0.64964914, + "num_input_tokens_seen": 257742060, + "step": 11945, + "time_per_iteration": 2.7222585678100586 + }, + { + "auxiliary_loss_clip": 0.0101585, + "auxiliary_loss_mlp": 0.01003838, + "balance_loss_clip": 1.00784445, + "balance_loss_mlp": 1.00287867, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7056173800972134, + "language_loss": 0.51101071, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53120756, + "num_input_tokens_seen": 257802250, + "step": 11946, + "time_per_iteration": 3.0713493824005127 + }, + { + "auxiliary_loss_clip": 0.01067301, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.03413618, + "balance_loss_mlp": 1.02262139, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 2.271607425659996, + "language_loss": 0.74336594, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76439786, + "num_input_tokens_seen": 257821155, + "step": 11947, + "time_per_iteration": 2.7316932678222656 + }, + { + "auxiliary_loss_clip": 0.01050863, + "auxiliary_loss_mlp": 0.0074945, + "balance_loss_clip": 1.02832031, + "balance_loss_mlp": 1.00027609, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.6247300730398189, + "language_loss": 0.72535717, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74336034, + "num_input_tokens_seen": 257839905, + "step": 11948, + "time_per_iteration": 2.6805689334869385 + }, + { + "auxiliary_loss_clip": 0.0107651, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_clip": 1.03263175, + "balance_loss_mlp": 1.02973962, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 9.252118917321456, + "language_loss": 0.7134015, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73458827, + "num_input_tokens_seen": 257860055, + "step": 11949, + "time_per_iteration": 2.6568589210510254 + }, + { + "auxiliary_loss_clip": 0.0109799, + "auxiliary_loss_mlp": 0.00749388, + "balance_loss_clip": 1.03453469, + "balance_loss_mlp": 1.00017691, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 2.0074019406570875, + "language_loss": 0.76368445, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78215826, + "num_input_tokens_seen": 257879315, + "step": 11950, + "time_per_iteration": 2.5998761653900146 + }, + { + "auxiliary_loss_clip": 0.01102311, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.03587461, + "balance_loss_mlp": 1.02005899, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.584227586823718, + "language_loss": 0.67504358, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69639146, + "num_input_tokens_seen": 257896570, + "step": 11951, + "time_per_iteration": 2.6055195331573486 + }, + { + "auxiliary_loss_clip": 0.01077193, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.03376973, + "balance_loss_mlp": 1.02178097, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.9736151436068388, + "language_loss": 0.779163, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80028141, + "num_input_tokens_seen": 257916855, + "step": 11952, + "time_per_iteration": 4.094135761260986 + }, + { + "auxiliary_loss_clip": 0.01093274, + "auxiliary_loss_mlp": 0.01034357, + "balance_loss_clip": 1.03551388, + "balance_loss_mlp": 1.02211392, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 3.021702376011194, + "language_loss": 0.74752736, + "learning_rate": 7.743182230841352e-07, + "loss": 0.76880372, + "num_input_tokens_seen": 257937140, + "step": 11953, + "time_per_iteration": 2.6120922565460205 + }, + { + "auxiliary_loss_clip": 0.0108869, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.03209233, + "balance_loss_mlp": 1.01961637, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.7600660965870576, + "language_loss": 0.73069286, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75189251, + "num_input_tokens_seen": 257956785, + "step": 11954, + "time_per_iteration": 2.669273614883423 + }, + { + "auxiliary_loss_clip": 0.01079902, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.03562331, + "balance_loss_mlp": 1.02606821, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.8903332983037064, + "language_loss": 0.74173969, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76291513, + "num_input_tokens_seen": 257975455, + "step": 11955, + "time_per_iteration": 2.583519697189331 + }, + { + "auxiliary_loss_clip": 0.01063641, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.03197145, + "balance_loss_mlp": 1.01720691, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.6767074948942788, + "language_loss": 0.73259777, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75351655, + "num_input_tokens_seen": 257996850, + "step": 11956, + "time_per_iteration": 4.213555574417114 + }, + { + "auxiliary_loss_clip": 0.01006038, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.02503002, + "balance_loss_mlp": 1.02358437, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.6055247291299395, + "language_loss": 0.70911503, + "learning_rate": 7.730875746869987e-07, + "loss": 0.72954893, + "num_input_tokens_seen": 258016145, + "step": 11957, + "time_per_iteration": 2.838550329208374 + }, + { + "auxiliary_loss_clip": 0.01046643, + "auxiliary_loss_mlp": 0.0104074, + "balance_loss_clip": 1.03048849, + "balance_loss_mlp": 1.02874732, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.9755100283664144, + "language_loss": 0.73139608, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75226986, + "num_input_tokens_seen": 258035420, + "step": 11958, + "time_per_iteration": 2.8449742794036865 + }, + { + "auxiliary_loss_clip": 0.01082227, + "auxiliary_loss_mlp": 0.01035686, + "balance_loss_clip": 1.03173721, + "balance_loss_mlp": 1.02377081, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 2.843661725642868, + "language_loss": 0.8392722, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86045134, + "num_input_tokens_seen": 258053520, + "step": 11959, + "time_per_iteration": 2.610877513885498 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.03869915, + "balance_loss_mlp": 1.0189954, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.5538825805215577, + "language_loss": 0.818735, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84011531, + "num_input_tokens_seen": 258073020, + "step": 11960, + "time_per_iteration": 2.6018340587615967 + }, + { + "auxiliary_loss_clip": 0.01070894, + "auxiliary_loss_mlp": 0.01036187, + "balance_loss_clip": 1.03220332, + "balance_loss_mlp": 1.02386069, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.588248201927918, + "language_loss": 0.77970111, + "learning_rate": 7.718576706841013e-07, + "loss": 0.80077189, + "num_input_tokens_seen": 258093155, + "step": 11961, + "time_per_iteration": 2.6877684593200684 + }, + { + "auxiliary_loss_clip": 0.01081899, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.03313422, + "balance_loss_mlp": 1.01913953, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.3884095283122577, + "language_loss": 0.74955034, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77066445, + "num_input_tokens_seen": 258113905, + "step": 11962, + "time_per_iteration": 2.711714029312134 + }, + { + "auxiliary_loss_clip": 0.01090145, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.03413606, + "balance_loss_mlp": 1.01905167, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.6932473132859656, + "language_loss": 0.75214237, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77336419, + "num_input_tokens_seen": 258132820, + "step": 11963, + "time_per_iteration": 2.591630458831787 + }, + { + "auxiliary_loss_clip": 0.01062635, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.0349009, + "balance_loss_mlp": 1.02637947, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 2.567312311005552, + "language_loss": 0.80202681, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82304156, + "num_input_tokens_seen": 258148055, + "step": 11964, + "time_per_iteration": 2.6803574562072754 + }, + { + "auxiliary_loss_clip": 0.01086693, + "auxiliary_loss_mlp": 0.01034568, + "balance_loss_clip": 1.0330987, + "balance_loss_mlp": 1.02277184, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.8272720232024264, + "language_loss": 0.75219202, + "learning_rate": 7.70628511821652e-07, + "loss": 0.7734046, + "num_input_tokens_seen": 258165995, + "step": 11965, + "time_per_iteration": 2.6352694034576416 + }, + { + "auxiliary_loss_clip": 0.01075047, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.01977921, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.6320771730164658, + "language_loss": 0.7745893, + "learning_rate": 7.703213386216377e-07, + "loss": 0.7956658, + "num_input_tokens_seen": 258186165, + "step": 11966, + "time_per_iteration": 2.624682664871216 + }, + { + "auxiliary_loss_clip": 0.01070307, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.03030562, + "balance_loss_mlp": 1.02166212, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 1.9415927770385553, + "language_loss": 0.73551679, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75656331, + "num_input_tokens_seen": 258204595, + "step": 11967, + "time_per_iteration": 2.5630767345428467 + }, + { + "auxiliary_loss_clip": 0.01071497, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.03420496, + "balance_loss_mlp": 1.0172658, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 1.6097220694407262, + "language_loss": 0.81543398, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83642197, + "num_input_tokens_seen": 258223110, + "step": 11968, + "time_per_iteration": 4.168830394744873 + }, + { + "auxiliary_loss_clip": 0.01077155, + "auxiliary_loss_mlp": 0.01026053, + "balance_loss_clip": 1.03491616, + "balance_loss_mlp": 1.01583111, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 1.7987977709148562, + "language_loss": 0.76293397, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78396606, + "num_input_tokens_seen": 258242660, + "step": 11969, + "time_per_iteration": 2.636509895324707 + }, + { + "auxiliary_loss_clip": 0.01052479, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.02718174, + "balance_loss_mlp": 1.01664555, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.5609610960213949, + "language_loss": 0.71038604, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73120224, + "num_input_tokens_seen": 258261850, + "step": 11970, + "time_per_iteration": 2.642857313156128 + }, + { + "auxiliary_loss_clip": 0.01003241, + "auxiliary_loss_mlp": 0.01006424, + "balance_loss_clip": 1.00265312, + "balance_loss_mlp": 1.00544631, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9180756656782398, + "language_loss": 0.60825229, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62834895, + "num_input_tokens_seen": 258312570, + "step": 11971, + "time_per_iteration": 3.048757314682007 + }, + { + "auxiliary_loss_clip": 0.01106504, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.03654587, + "balance_loss_mlp": 1.01779103, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 1.9813292267512708, + "language_loss": 0.8039583, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82532501, + "num_input_tokens_seen": 258331600, + "step": 11972, + "time_per_iteration": 2.555874824523926 + }, + { + "auxiliary_loss_clip": 0.01077111, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.0324738, + "balance_loss_mlp": 1.01958227, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.5100785077671721, + "language_loss": 0.75904465, + "learning_rate": 7.681724325006733e-07, + "loss": 0.78013396, + "num_input_tokens_seen": 258351785, + "step": 11973, + "time_per_iteration": 2.7575974464416504 + }, + { + "auxiliary_loss_clip": 0.00987614, + "auxiliary_loss_mlp": 0.00999245, + "balance_loss_clip": 1.00639021, + "balance_loss_mlp": 0.99829751, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8569257698071561, + "language_loss": 0.57287258, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59274119, + "num_input_tokens_seen": 258404035, + "step": 11974, + "time_per_iteration": 3.080624580383301 + }, + { + "auxiliary_loss_clip": 0.01073675, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03283548, + "balance_loss_mlp": 1.02165115, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 1.9174738840855574, + "language_loss": 0.61218005, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63325208, + "num_input_tokens_seen": 258424850, + "step": 11975, + "time_per_iteration": 2.665792465209961 + }, + { + "auxiliary_loss_clip": 0.01086052, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.02106833, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.8780387418738855, + "language_loss": 0.67682022, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69800341, + "num_input_tokens_seen": 258445485, + "step": 11976, + "time_per_iteration": 2.6488208770751953 + }, + { + "auxiliary_loss_clip": 0.01068681, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.0333606, + "balance_loss_mlp": 1.01585221, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.660197798939174, + "language_loss": 0.66898847, + "learning_rate": 7.669455135323004e-07, + "loss": 0.6899429, + "num_input_tokens_seen": 258464505, + "step": 11977, + "time_per_iteration": 2.662245273590088 + }, + { + "auxiliary_loss_clip": 0.01079027, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.03314209, + "balance_loss_mlp": 1.0255636, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.558581954040421, + "language_loss": 0.75628912, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77745128, + "num_input_tokens_seen": 258487190, + "step": 11978, + "time_per_iteration": 2.6644294261932373 + }, + { + "auxiliary_loss_clip": 0.01097468, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.03328323, + "balance_loss_mlp": 1.01804471, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 2.0255071289064244, + "language_loss": 0.78708684, + "learning_rate": 7.663323345468908e-07, + "loss": 0.80835831, + "num_input_tokens_seen": 258503790, + "step": 11979, + "time_per_iteration": 2.528731346130371 + }, + { + "auxiliary_loss_clip": 0.01086916, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.03306913, + "balance_loss_mlp": 1.01774025, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.5980273159597733, + "language_loss": 0.64904422, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67020655, + "num_input_tokens_seen": 258527335, + "step": 11980, + "time_per_iteration": 2.664757251739502 + }, + { + "auxiliary_loss_clip": 0.01092884, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.0360378, + "balance_loss_mlp": 1.02198362, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 1.9220466996848655, + "language_loss": 0.6664117, + "learning_rate": 7.657193426846871e-07, + "loss": 0.68769002, + "num_input_tokens_seen": 258546690, + "step": 11981, + "time_per_iteration": 2.595944881439209 + }, + { + "auxiliary_loss_clip": 0.01065242, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.03157401, + "balance_loss_mlp": 1.01966381, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.7782527496921088, + "language_loss": 0.73415244, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75512165, + "num_input_tokens_seen": 258566340, + "step": 11982, + "time_per_iteration": 4.03133487701416 + }, + { + "auxiliary_loss_clip": 0.01067637, + "auxiliary_loss_mlp": 0.00749582, + "balance_loss_clip": 1.03205001, + "balance_loss_mlp": 1.00025392, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 2.374379916370088, + "language_loss": 0.6600334, + "learning_rate": 7.65106538038665e-07, + "loss": 0.67820555, + "num_input_tokens_seen": 258584455, + "step": 11983, + "time_per_iteration": 2.599376678466797 + }, + { + "auxiliary_loss_clip": 0.01074116, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.03748274, + "balance_loss_mlp": 1.02189469, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.7090443116112328, + "language_loss": 0.66077358, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68185151, + "num_input_tokens_seen": 258604725, + "step": 11984, + "time_per_iteration": 2.6351120471954346 + }, + { + "auxiliary_loss_clip": 0.01087209, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.03441679, + "balance_loss_mlp": 1.01901782, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.9022596855759322, + "language_loss": 0.73922426, + "learning_rate": 7.644939207017771e-07, + "loss": 0.76041329, + "num_input_tokens_seen": 258622885, + "step": 11985, + "time_per_iteration": 2.5768394470214844 + }, + { + "auxiliary_loss_clip": 0.01090255, + "auxiliary_loss_mlp": 0.0102655, + "balance_loss_clip": 1.03603673, + "balance_loss_mlp": 1.01548195, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 2.4530809651770484, + "language_loss": 0.62502164, + "learning_rate": 7.641876823032977e-07, + "loss": 0.64618969, + "num_input_tokens_seen": 258644305, + "step": 11986, + "time_per_iteration": 2.6428377628326416 + }, + { + "auxiliary_loss_clip": 0.01081234, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.03521991, + "balance_loss_mlp": 1.02018034, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.6710830363759108, + "language_loss": 0.72489738, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74604398, + "num_input_tokens_seen": 258661775, + "step": 11987, + "time_per_iteration": 2.6486499309539795 + }, + { + "auxiliary_loss_clip": 0.0108203, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.03503811, + "balance_loss_mlp": 1.02246511, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.798480925196285, + "language_loss": 0.78399658, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80516315, + "num_input_tokens_seen": 258679830, + "step": 11988, + "time_per_iteration": 2.584641218185425 + }, + { + "auxiliary_loss_clip": 0.01100411, + "auxiliary_loss_mlp": 0.01030128, + "balance_loss_clip": 1.03512347, + "balance_loss_mlp": 1.01849961, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 1.7383709171637436, + "language_loss": 0.78737837, + "learning_rate": 7.632692483270618e-07, + "loss": 0.80868381, + "num_input_tokens_seen": 258697415, + "step": 11989, + "time_per_iteration": 2.536695718765259 + }, + { + "auxiliary_loss_clip": 0.01097928, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.03442109, + "balance_loss_mlp": 1.02038908, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 1.6702642623715365, + "language_loss": 0.82385862, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84515381, + "num_input_tokens_seen": 258716755, + "step": 11990, + "time_per_iteration": 2.5807929039001465 + }, + { + "auxiliary_loss_clip": 0.01070478, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.03388715, + "balance_loss_mlp": 1.02450681, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 1.886159108216978, + "language_loss": 0.76039481, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78145272, + "num_input_tokens_seen": 258733270, + "step": 11991, + "time_per_iteration": 2.585644006729126 + }, + { + "auxiliary_loss_clip": 0.01056145, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.03209758, + "balance_loss_mlp": 1.02155209, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 1.4810746523058353, + "language_loss": 0.72206897, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74297136, + "num_input_tokens_seen": 258755270, + "step": 11992, + "time_per_iteration": 4.2067060470581055 + }, + { + "auxiliary_loss_clip": 0.01087489, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.03176045, + "balance_loss_mlp": 1.01880026, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.9278261035941522, + "language_loss": 0.66290319, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68408048, + "num_input_tokens_seen": 258775340, + "step": 11993, + "time_per_iteration": 2.7972865104675293 + }, + { + "auxiliary_loss_clip": 0.01089302, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.03326786, + "balance_loss_mlp": 1.01940942, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 11.594648775836012, + "language_loss": 0.65643555, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67763335, + "num_input_tokens_seen": 258794580, + "step": 11994, + "time_per_iteration": 2.6525228023529053 + }, + { + "auxiliary_loss_clip": 0.01086162, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.03181076, + "balance_loss_mlp": 1.02021003, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 1.767316108106237, + "language_loss": 0.66811383, + "learning_rate": 7.614336469056172e-07, + "loss": 0.68930531, + "num_input_tokens_seen": 258812330, + "step": 11995, + "time_per_iteration": 2.5275449752807617 + }, + { + "auxiliary_loss_clip": 0.01070227, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.03152835, + "balance_loss_mlp": 1.01714659, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.8717112166775116, + "language_loss": 0.79632086, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81731677, + "num_input_tokens_seen": 258831770, + "step": 11996, + "time_per_iteration": 4.116582870483398 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01034405, + "balance_loss_clip": 1.0370003, + "balance_loss_mlp": 1.02309227, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 2.166873030368, + "language_loss": 0.81514835, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83651435, + "num_input_tokens_seen": 258849090, + "step": 11997, + "time_per_iteration": 2.5665552616119385 + }, + { + "auxiliary_loss_clip": 0.01101754, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.03465152, + "balance_loss_mlp": 1.02081823, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 3.7333737281380244, + "language_loss": 0.6637212, + "learning_rate": 7.605164800868646e-07, + "loss": 0.68507081, + "num_input_tokens_seen": 258868230, + "step": 11998, + "time_per_iteration": 2.6110880374908447 + }, + { + "auxiliary_loss_clip": 0.0109999, + "auxiliary_loss_mlp": 0.01029326, + "balance_loss_clip": 1.03573942, + "balance_loss_mlp": 1.01906824, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 2.5245289742279478, + "language_loss": 0.72365481, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74494797, + "num_input_tokens_seen": 258885525, + "step": 11999, + "time_per_iteration": 2.508824586868286 + }, + { + "auxiliary_loss_clip": 0.01081344, + "auxiliary_loss_mlp": 0.01023986, + "balance_loss_clip": 1.03506446, + "balance_loss_mlp": 1.01264966, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.7168205987017866, + "language_loss": 0.82949072, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85054404, + "num_input_tokens_seen": 258903245, + "step": 12000, + "time_per_iteration": 2.5524778366088867 + }, + { + "auxiliary_loss_clip": 0.01092415, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.03619623, + "balance_loss_mlp": 1.02257824, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 2.1521090082772547, + "language_loss": 0.77363116, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79489362, + "num_input_tokens_seen": 258921245, + "step": 12001, + "time_per_iteration": 2.5350067615509033 + }, + { + "auxiliary_loss_clip": 0.01081243, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.03428102, + "balance_loss_mlp": 1.02184534, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.7096249028644932, + "language_loss": 0.81549162, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83664286, + "num_input_tokens_seen": 258939425, + "step": 12002, + "time_per_iteration": 2.568725109100342 + }, + { + "auxiliary_loss_clip": 0.01090127, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.03483248, + "balance_loss_mlp": 1.01914847, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 1.9081077808951603, + "language_loss": 0.62266445, + "learning_rate": 7.589888089035462e-07, + "loss": 0.64387238, + "num_input_tokens_seen": 258960710, + "step": 12003, + "time_per_iteration": 2.675889253616333 + }, + { + "auxiliary_loss_clip": 0.01099737, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.02158368, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.144571068951648, + "language_loss": 0.68839121, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70972055, + "num_input_tokens_seen": 258978475, + "step": 12004, + "time_per_iteration": 2.483802318572998 + }, + { + "auxiliary_loss_clip": 0.01007685, + "auxiliary_loss_mlp": 0.00999378, + "balance_loss_clip": 1.0126785, + "balance_loss_mlp": 0.99826956, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8633666654622613, + "language_loss": 0.54147506, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56154573, + "num_input_tokens_seen": 259037520, + "step": 12005, + "time_per_iteration": 3.1107232570648193 + }, + { + "auxiliary_loss_clip": 0.01070572, + "auxiliary_loss_mlp": 0.01028856, + "balance_loss_clip": 1.03550756, + "balance_loss_mlp": 1.01689315, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.6213413294069983, + "language_loss": 0.63449764, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65549195, + "num_input_tokens_seen": 259061325, + "step": 12006, + "time_per_iteration": 2.714062213897705 + }, + { + "auxiliary_loss_clip": 0.01069881, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.03180599, + "balance_loss_mlp": 1.01939869, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.7199136343919144, + "language_loss": 0.91827452, + "learning_rate": 7.577675189541865e-07, + "loss": 0.93928164, + "num_input_tokens_seen": 259078135, + "step": 12007, + "time_per_iteration": 2.538933515548706 + }, + { + "auxiliary_loss_clip": 0.01058003, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.02911019, + "balance_loss_mlp": 1.02019238, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 1.7581647799484135, + "language_loss": 0.63945603, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66037273, + "num_input_tokens_seen": 259095910, + "step": 12008, + "time_per_iteration": 2.5522842407226562 + }, + { + "auxiliary_loss_clip": 0.01090778, + "auxiliary_loss_mlp": 0.01033212, + "balance_loss_clip": 1.03459907, + "balance_loss_mlp": 1.02138662, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 2.507477605235959, + "language_loss": 0.78695154, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80819148, + "num_input_tokens_seen": 259114225, + "step": 12009, + "time_per_iteration": 4.038757562637329 + }, + { + "auxiliary_loss_clip": 0.01090151, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.03540277, + "balance_loss_mlp": 1.02530789, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.7809846303324928, + "language_loss": 0.64149022, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66277266, + "num_input_tokens_seen": 259134660, + "step": 12010, + "time_per_iteration": 2.6207704544067383 + }, + { + "auxiliary_loss_clip": 0.01100146, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.03435469, + "balance_loss_mlp": 1.01840937, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 1.6965395934341954, + "language_loss": 0.77226424, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79356897, + "num_input_tokens_seen": 259153300, + "step": 12011, + "time_per_iteration": 2.538813591003418 + }, + { + "auxiliary_loss_clip": 0.01079636, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.03223848, + "balance_loss_mlp": 1.02074671, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.71379367765207, + "language_loss": 0.78736842, + "learning_rate": 7.56241966479781e-07, + "loss": 0.80848175, + "num_input_tokens_seen": 259172115, + "step": 12012, + "time_per_iteration": 2.578927993774414 + }, + { + "auxiliary_loss_clip": 0.01080918, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.03665781, + "balance_loss_mlp": 1.0190655, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.7578801970832034, + "language_loss": 0.755997, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77710688, + "num_input_tokens_seen": 259191345, + "step": 12013, + "time_per_iteration": 2.653578758239746 + }, + { + "auxiliary_loss_clip": 0.01098213, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.03383648, + "balance_loss_mlp": 1.01391172, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 4.939704302670989, + "language_loss": 0.75584543, + "learning_rate": 7.556320755530484e-07, + "loss": 0.77707607, + "num_input_tokens_seen": 259211700, + "step": 12014, + "time_per_iteration": 2.539391040802002 + }, + { + "auxiliary_loss_clip": 0.010903, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.03303313, + "balance_loss_mlp": 1.01812768, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.7487356761689254, + "language_loss": 0.86473906, + "learning_rate": 7.553272008637346e-07, + "loss": 0.88593554, + "num_input_tokens_seen": 259233825, + "step": 12015, + "time_per_iteration": 2.5908193588256836 + }, + { + "auxiliary_loss_clip": 0.01087195, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.03381324, + "balance_loss_mlp": 1.02379692, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.7539729691276749, + "language_loss": 0.78285611, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80407608, + "num_input_tokens_seen": 259253055, + "step": 12016, + "time_per_iteration": 2.5649566650390625 + }, + { + "auxiliary_loss_clip": 0.01048827, + "auxiliary_loss_mlp": 0.0104259, + "balance_loss_clip": 1.02905512, + "balance_loss_mlp": 1.0300014, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.4735974407654768, + "language_loss": 0.7767818, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79769599, + "num_input_tokens_seen": 259273420, + "step": 12017, + "time_per_iteration": 2.634787082672119 + }, + { + "auxiliary_loss_clip": 0.01096411, + "auxiliary_loss_mlp": 0.01027715, + "balance_loss_clip": 1.03413582, + "balance_loss_mlp": 1.01725471, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 1.7248569667786902, + "language_loss": 0.73740482, + "learning_rate": 7.54412860030732e-07, + "loss": 0.75864607, + "num_input_tokens_seen": 259291000, + "step": 12018, + "time_per_iteration": 2.5140810012817383 + }, + { + "auxiliary_loss_clip": 0.010709, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.03896105, + "balance_loss_mlp": 1.01755309, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 1.8505768370230207, + "language_loss": 0.77770007, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79868793, + "num_input_tokens_seen": 259312390, + "step": 12019, + "time_per_iteration": 2.654675245285034 + }, + { + "auxiliary_loss_clip": 0.01074526, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.03475857, + "balance_loss_mlp": 1.01608014, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.9406246335194741, + "language_loss": 0.73784328, + "learning_rate": 7.53803535620081e-07, + "loss": 0.7588616, + "num_input_tokens_seen": 259332645, + "step": 12020, + "time_per_iteration": 2.66619873046875 + }, + { + "auxiliary_loss_clip": 0.01083935, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.0338707, + "balance_loss_mlp": 1.02434242, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.6236808399230673, + "language_loss": 0.77286839, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79405683, + "num_input_tokens_seen": 259353810, + "step": 12021, + "time_per_iteration": 2.6082258224487305 + }, + { + "auxiliary_loss_clip": 0.01059639, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.03087592, + "balance_loss_mlp": 1.01919961, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 1.7853692188272636, + "language_loss": 0.67897755, + "learning_rate": 7.531944002330073e-07, + "loss": 0.69987917, + "num_input_tokens_seen": 259372460, + "step": 12022, + "time_per_iteration": 2.679617166519165 + }, + { + "auxiliary_loss_clip": 0.01090393, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 1.02058959, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7309503428698862, + "language_loss": 0.69630456, + "learning_rate": 7.528899034521858e-07, + "loss": 0.7175293, + "num_input_tokens_seen": 259393275, + "step": 12023, + "time_per_iteration": 4.130378246307373 + }, + { + "auxiliary_loss_clip": 0.01065708, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.02842307, + "balance_loss_mlp": 1.01727128, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.7122147709082614, + "language_loss": 0.7097733, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73072183, + "num_input_tokens_seen": 259416205, + "step": 12024, + "time_per_iteration": 2.589817762374878 + }, + { + "auxiliary_loss_clip": 0.0106602, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.03419411, + "balance_loss_mlp": 1.02095699, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 2.3238682198936114, + "language_loss": 0.75616419, + "learning_rate": 7.522810517737089e-07, + "loss": 0.7771399, + "num_input_tokens_seen": 259433115, + "step": 12025, + "time_per_iteration": 2.5919432640075684 + }, + { + "auxiliary_loss_clip": 0.01085869, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.03285062, + "balance_loss_mlp": 1.02066827, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 2.1414822455192173, + "language_loss": 0.76370907, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78487974, + "num_input_tokens_seen": 259450475, + "step": 12026, + "time_per_iteration": 2.5565738677978516 + }, + { + "auxiliary_loss_clip": 0.01089309, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.03388071, + "balance_loss_mlp": 1.02665329, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 2.024181514985078, + "language_loss": 0.66926396, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69053262, + "num_input_tokens_seen": 259469355, + "step": 12027, + "time_per_iteration": 2.5886993408203125 + }, + { + "auxiliary_loss_clip": 0.01052852, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.03587532, + "balance_loss_mlp": 1.01818275, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 1.9398541988737745, + "language_loss": 0.78694606, + "learning_rate": 7.513681291370469e-07, + "loss": 0.80777514, + "num_input_tokens_seen": 259486565, + "step": 12028, + "time_per_iteration": 2.686835765838623 + }, + { + "auxiliary_loss_clip": 0.01054944, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.03052938, + "balance_loss_mlp": 1.01577938, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.7878955830446148, + "language_loss": 0.82274139, + "learning_rate": 7.510639162726e-07, + "loss": 0.84357035, + "num_input_tokens_seen": 259505070, + "step": 12029, + "time_per_iteration": 2.5914742946624756 + }, + { + "auxiliary_loss_clip": 0.01007725, + "auxiliary_loss_mlp": 0.01003508, + "balance_loss_clip": 1.00898075, + "balance_loss_mlp": 1.00242949, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8076262162537917, + "language_loss": 0.61790919, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63802147, + "num_input_tokens_seen": 259569135, + "step": 12030, + "time_per_iteration": 3.2147529125213623 + }, + { + "auxiliary_loss_clip": 0.01078634, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.03090668, + "balance_loss_mlp": 1.01866567, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.731268657582758, + "language_loss": 0.77822006, + "learning_rate": 7.504556326345859e-07, + "loss": 0.7993198, + "num_input_tokens_seen": 259587035, + "step": 12031, + "time_per_iteration": 2.5632007122039795 + }, + { + "auxiliary_loss_clip": 0.01088715, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.03371167, + "balance_loss_mlp": 1.0184294, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 1.7355809307259453, + "language_loss": 0.81287706, + "learning_rate": 7.501515618840834e-07, + "loss": 0.8340649, + "num_input_tokens_seen": 259606140, + "step": 12032, + "time_per_iteration": 4.073545694351196 + }, + { + "auxiliary_loss_clip": 0.01056473, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.03003335, + "balance_loss_mlp": 1.0219382, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.9276698576380975, + "language_loss": 0.75430524, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77520472, + "num_input_tokens_seen": 259624275, + "step": 12033, + "time_per_iteration": 2.645308256149292 + }, + { + "auxiliary_loss_clip": 0.01062243, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.02995062, + "balance_loss_mlp": 1.01806712, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.8495284869555222, + "language_loss": 0.74708748, + "learning_rate": 7.495435625777423e-07, + "loss": 0.76799524, + "num_input_tokens_seen": 259643465, + "step": 12034, + "time_per_iteration": 2.6102497577667236 + }, + { + "auxiliary_loss_clip": 0.01076176, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.03262722, + "balance_loss_mlp": 1.02232337, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.849319236809795, + "language_loss": 0.8055566, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82664454, + "num_input_tokens_seen": 259662500, + "step": 12035, + "time_per_iteration": 2.5929038524627686 + }, + { + "auxiliary_loss_clip": 0.01036549, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.03322124, + "balance_loss_mlp": 1.01645899, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 3.8199818839745885, + "language_loss": 0.61018986, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63083243, + "num_input_tokens_seen": 259680140, + "step": 12036, + "time_per_iteration": 2.633420467376709 + }, + { + "auxiliary_loss_clip": 0.01085966, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.0339725, + "balance_loss_mlp": 1.02100182, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.7338075946474771, + "language_loss": 0.67368233, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69485039, + "num_input_tokens_seen": 259700160, + "step": 12037, + "time_per_iteration": 4.018120527267456 + }, + { + "auxiliary_loss_clip": 0.01098398, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.03413486, + "balance_loss_mlp": 1.02298236, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 1.733145722641854, + "language_loss": 0.7232542, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74458236, + "num_input_tokens_seen": 259720525, + "step": 12038, + "time_per_iteration": 2.607039451599121 + }, + { + "auxiliary_loss_clip": 0.0109983, + "auxiliary_loss_mlp": 0.01035581, + "balance_loss_clip": 1.0351212, + "balance_loss_mlp": 1.02298093, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.5785527363708673, + "language_loss": 0.71804094, + "learning_rate": 7.480243943186293e-07, + "loss": 0.73939508, + "num_input_tokens_seen": 259738680, + "step": 12039, + "time_per_iteration": 2.6115195751190186 + }, + { + "auxiliary_loss_clip": 0.01100946, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.03499627, + "balance_loss_mlp": 1.0212481, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 28.89893597570809, + "language_loss": 0.76206642, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78339207, + "num_input_tokens_seen": 259758790, + "step": 12040, + "time_per_iteration": 2.5475051403045654 + }, + { + "auxiliary_loss_clip": 0.0106572, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.03100514, + "balance_loss_mlp": 1.01916218, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 4.107484480322456, + "language_loss": 0.7644223, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78538424, + "num_input_tokens_seen": 259777370, + "step": 12041, + "time_per_iteration": 2.6073038578033447 + }, + { + "auxiliary_loss_clip": 0.01089268, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.03251028, + "balance_loss_mlp": 1.0190146, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.179637297629008, + "language_loss": 0.63611108, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65730464, + "num_input_tokens_seen": 259794665, + "step": 12042, + "time_per_iteration": 2.546219825744629 + }, + { + "auxiliary_loss_clip": 0.01065154, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.03449357, + "balance_loss_mlp": 1.01987743, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 1.8161711340472193, + "language_loss": 0.83634287, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85731202, + "num_input_tokens_seen": 259811110, + "step": 12043, + "time_per_iteration": 2.6109306812286377 + }, + { + "auxiliary_loss_clip": 0.01066155, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.0322535, + "balance_loss_mlp": 1.0181663, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 2.1484811398362327, + "language_loss": 0.64207172, + "learning_rate": 7.465064129354379e-07, + "loss": 0.6630398, + "num_input_tokens_seen": 259831080, + "step": 12044, + "time_per_iteration": 2.5922887325286865 + }, + { + "auxiliary_loss_clip": 0.0110171, + "auxiliary_loss_mlp": 0.0103357, + "balance_loss_clip": 1.03665841, + "balance_loss_mlp": 1.02166069, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.5706671036753577, + "language_loss": 0.81378913, + "learning_rate": 7.462029592105658e-07, + "loss": 0.8351419, + "num_input_tokens_seen": 259850135, + "step": 12045, + "time_per_iteration": 2.5146093368530273 + }, + { + "auxiliary_loss_clip": 0.01097195, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.03436899, + "balance_loss_mlp": 1.01991987, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.498435223631444, + "language_loss": 0.71652508, + "learning_rate": 7.458995530298034e-07, + "loss": 0.73780864, + "num_input_tokens_seen": 259868185, + "step": 12046, + "time_per_iteration": 2.479848861694336 + }, + { + "auxiliary_loss_clip": 0.01058905, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.02904034, + "balance_loss_mlp": 1.01654708, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 2.099733333858959, + "language_loss": 0.71001971, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73089588, + "num_input_tokens_seen": 259887055, + "step": 12047, + "time_per_iteration": 2.5893683433532715 + }, + { + "auxiliary_loss_clip": 0.01070375, + "auxiliary_loss_mlp": 0.01037989, + "balance_loss_clip": 1.03330112, + "balance_loss_mlp": 1.02617502, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.6147771558760948, + "language_loss": 0.69571924, + "learning_rate": 7.45292883346627e-07, + "loss": 0.71680284, + "num_input_tokens_seen": 259908295, + "step": 12048, + "time_per_iteration": 2.6110448837280273 + }, + { + "auxiliary_loss_clip": 0.01004943, + "auxiliary_loss_mlp": 0.0100961, + "balance_loss_clip": 1.00419784, + "balance_loss_mlp": 1.00847721, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.827395782984991, + "language_loss": 0.53722155, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55736697, + "num_input_tokens_seen": 259968475, + "step": 12049, + "time_per_iteration": 4.632451772689819 + }, + { + "auxiliary_loss_clip": 0.01074507, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.03290713, + "balance_loss_mlp": 1.01565766, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 1.9636984186263733, + "language_loss": 0.60464847, + "learning_rate": 7.446864039779258e-07, + "loss": 0.62568027, + "num_input_tokens_seen": 259984865, + "step": 12050, + "time_per_iteration": 2.5844521522521973 + }, + { + "auxiliary_loss_clip": 0.00986604, + "auxiliary_loss_mlp": 0.01009188, + "balance_loss_clip": 1.00575745, + "balance_loss_mlp": 1.00821018, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7118638072940425, + "language_loss": 0.53254032, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55249828, + "num_input_tokens_seen": 260046735, + "step": 12051, + "time_per_iteration": 3.215853691101074 + }, + { + "auxiliary_loss_clip": 0.01084075, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.03214002, + "balance_loss_mlp": 1.01999176, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.6367929185850703, + "language_loss": 0.72099638, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74214327, + "num_input_tokens_seen": 260067950, + "step": 12052, + "time_per_iteration": 2.5929272174835205 + }, + { + "auxiliary_loss_clip": 0.01083157, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.03181565, + "balance_loss_mlp": 1.01672006, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 2.212620112966422, + "language_loss": 0.74343568, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76455933, + "num_input_tokens_seen": 260087730, + "step": 12053, + "time_per_iteration": 2.5892553329467773 + }, + { + "auxiliary_loss_clip": 0.0105861, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.03135133, + "balance_loss_mlp": 1.02366006, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 3.61654987596386, + "language_loss": 0.78265929, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80360579, + "num_input_tokens_seen": 260107760, + "step": 12054, + "time_per_iteration": 2.6151959896087646 + }, + { + "auxiliary_loss_clip": 0.01064877, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.0330019, + "balance_loss_mlp": 1.0205934, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.5131019092629545, + "language_loss": 0.6867606, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70773721, + "num_input_tokens_seen": 260123660, + "step": 12055, + "time_per_iteration": 2.7712082862854004 + }, + { + "auxiliary_loss_clip": 0.01066111, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.03244805, + "balance_loss_mlp": 1.02022862, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.8347011313256998, + "language_loss": 0.7452724, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76624346, + "num_input_tokens_seen": 260142690, + "step": 12056, + "time_per_iteration": 2.6312472820281982 + }, + { + "auxiliary_loss_clip": 0.01095859, + "auxiliary_loss_mlp": 0.01024677, + "balance_loss_clip": 1.03334582, + "balance_loss_mlp": 1.01356649, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.503623175189415, + "language_loss": 0.70778036, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72898567, + "num_input_tokens_seen": 260162590, + "step": 12057, + "time_per_iteration": 2.556515693664551 + }, + { + "auxiliary_loss_clip": 0.01043248, + "auxiliary_loss_mlp": 0.01043542, + "balance_loss_clip": 1.03132772, + "balance_loss_mlp": 1.03026223, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 2.2528536564115322, + "language_loss": 0.62731826, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64818615, + "num_input_tokens_seen": 260181065, + "step": 12058, + "time_per_iteration": 2.620807409286499 + }, + { + "auxiliary_loss_clip": 0.01055137, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.03085756, + "balance_loss_mlp": 1.0182972, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.9269870329854517, + "language_loss": 0.74411303, + "learning_rate": 7.419596044262535e-07, + "loss": 0.76496935, + "num_input_tokens_seen": 260200330, + "step": 12059, + "time_per_iteration": 2.6916754245758057 + }, + { + "auxiliary_loss_clip": 0.01087149, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.03426015, + "balance_loss_mlp": 1.02174592, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.6731165596714261, + "language_loss": 0.79436946, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81555974, + "num_input_tokens_seen": 260219975, + "step": 12060, + "time_per_iteration": 2.672994613647461 + }, + { + "auxiliary_loss_clip": 0.01087762, + "auxiliary_loss_mlp": 0.01025668, + "balance_loss_clip": 1.03376985, + "balance_loss_mlp": 1.01359856, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 2.0238199829975723, + "language_loss": 0.76149386, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78262812, + "num_input_tokens_seen": 260242025, + "step": 12061, + "time_per_iteration": 2.5702731609344482 + }, + { + "auxiliary_loss_clip": 0.01098133, + "auxiliary_loss_mlp": 0.00749251, + "balance_loss_clip": 1.03551912, + "balance_loss_mlp": 1.00027978, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.6937468883145774, + "language_loss": 0.81379837, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83227217, + "num_input_tokens_seen": 260260015, + "step": 12062, + "time_per_iteration": 2.4399845600128174 + }, + { + "auxiliary_loss_clip": 0.0104432, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.03060317, + "balance_loss_mlp": 1.02023757, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 2.344346620835032, + "language_loss": 0.69132149, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71210217, + "num_input_tokens_seen": 260278635, + "step": 12063, + "time_per_iteration": 4.217488050460815 + }, + { + "auxiliary_loss_clip": 0.01061967, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.0318166, + "balance_loss_mlp": 1.02001333, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.5470262744189676, + "language_loss": 0.69774026, + "learning_rate": 7.40446384925973e-07, + "loss": 0.71867216, + "num_input_tokens_seen": 260298510, + "step": 12064, + "time_per_iteration": 2.6372971534729004 + }, + { + "auxiliary_loss_clip": 0.01078642, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.03447342, + "balance_loss_mlp": 1.02053738, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 8.363184358495124, + "language_loss": 0.90535045, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92645872, + "num_input_tokens_seen": 260317405, + "step": 12065, + "time_per_iteration": 2.5695888996124268 + }, + { + "auxiliary_loss_clip": 0.01017411, + "auxiliary_loss_mlp": 0.00999345, + "balance_loss_clip": 1.00663531, + "balance_loss_mlp": 0.99825448, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.651247620049174, + "language_loss": 0.56062245, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58079004, + "num_input_tokens_seen": 260388085, + "step": 12066, + "time_per_iteration": 3.2710251808166504 + }, + { + "auxiliary_loss_clip": 0.01052104, + "auxiliary_loss_mlp": 0.0102685, + "balance_loss_clip": 1.03267109, + "balance_loss_mlp": 1.01628768, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.8907613053892531, + "language_loss": 0.76770478, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78849429, + "num_input_tokens_seen": 260406165, + "step": 12067, + "time_per_iteration": 2.6990511417388916 + }, + { + "auxiliary_loss_clip": 0.01009021, + "auxiliary_loss_mlp": 0.01000841, + "balance_loss_clip": 1.00800467, + "balance_loss_mlp": 0.99973279, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7244764374200197, + "language_loss": 0.57004339, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59014201, + "num_input_tokens_seen": 260461365, + "step": 12068, + "time_per_iteration": 3.0003931522369385 + }, + { + "auxiliary_loss_clip": 0.00985905, + "auxiliary_loss_mlp": 0.01004854, + "balance_loss_clip": 1.00573754, + "balance_loss_mlp": 1.00355434, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6594361912680919, + "language_loss": 0.55415499, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57406259, + "num_input_tokens_seen": 260523795, + "step": 12069, + "time_per_iteration": 3.2160425186157227 + }, + { + "auxiliary_loss_clip": 0.01064872, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.03346944, + "balance_loss_mlp": 1.01682293, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 2.3819886137087343, + "language_loss": 0.79945242, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82037032, + "num_input_tokens_seen": 260544765, + "step": 12070, + "time_per_iteration": 2.6328747272491455 + }, + { + "auxiliary_loss_clip": 0.01069421, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.03150976, + "balance_loss_mlp": 1.01906061, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 1.7376051316400845, + "language_loss": 0.71899486, + "learning_rate": 7.383298839673197e-07, + "loss": 0.73998952, + "num_input_tokens_seen": 260564340, + "step": 12071, + "time_per_iteration": 2.6357028484344482 + }, + { + "auxiliary_loss_clip": 0.01097173, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.03429449, + "balance_loss_mlp": 1.02154994, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.8294635238411978, + "language_loss": 0.70585924, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72715127, + "num_input_tokens_seen": 260582565, + "step": 12072, + "time_per_iteration": 4.1061108112335205 + }, + { + "auxiliary_loss_clip": 0.01060473, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.02994001, + "balance_loss_mlp": 1.02059531, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 2.6338994003669383, + "language_loss": 0.78600442, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80693883, + "num_input_tokens_seen": 260601700, + "step": 12073, + "time_per_iteration": 2.609143018722534 + }, + { + "auxiliary_loss_clip": 0.01078393, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.03455365, + "balance_loss_mlp": 1.01738667, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.4746590791705283, + "language_loss": 0.70233214, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72340256, + "num_input_tokens_seen": 260623040, + "step": 12074, + "time_per_iteration": 2.6896984577178955 + }, + { + "auxiliary_loss_clip": 0.01078074, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.03294992, + "balance_loss_mlp": 1.01776397, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 6.431632220735201, + "language_loss": 0.7407096, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76178879, + "num_input_tokens_seen": 260642735, + "step": 12075, + "time_per_iteration": 2.61915922164917 + }, + { + "auxiliary_loss_clip": 0.01089014, + "auxiliary_loss_mlp": 0.01030408, + "balance_loss_clip": 1.03412557, + "balance_loss_mlp": 1.01815927, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.4464429997426578, + "language_loss": 0.63867772, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65987194, + "num_input_tokens_seen": 260669935, + "step": 12076, + "time_per_iteration": 2.881319284439087 + }, + { + "auxiliary_loss_clip": 0.01058853, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.03035378, + "balance_loss_mlp": 1.02014995, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 2.7393960676459113, + "language_loss": 0.78799552, + "learning_rate": 7.365176060028912e-07, + "loss": 0.80890572, + "num_input_tokens_seen": 260689605, + "step": 12077, + "time_per_iteration": 2.6793484687805176 + }, + { + "auxiliary_loss_clip": 0.01024657, + "auxiliary_loss_mlp": 0.00746622, + "balance_loss_clip": 1.00423121, + "balance_loss_mlp": 0.99978846, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.88118963887022, + "language_loss": 0.64960289, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66731572, + "num_input_tokens_seen": 260748265, + "step": 12078, + "time_per_iteration": 4.5528950691223145 + }, + { + "auxiliary_loss_clip": 0.0101627, + "auxiliary_loss_mlp": 0.00998793, + "balance_loss_clip": 1.00607073, + "balance_loss_mlp": 0.99777937, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.719838937276111, + "language_loss": 0.59250915, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61265981, + "num_input_tokens_seen": 260816715, + "step": 12079, + "time_per_iteration": 3.203298330307007 + }, + { + "auxiliary_loss_clip": 0.01053796, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.03190804, + "balance_loss_mlp": 1.01973236, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 1.8767868761043385, + "language_loss": 0.64485896, + "learning_rate": 7.356121136696895e-07, + "loss": 0.66570961, + "num_input_tokens_seen": 260836765, + "step": 12080, + "time_per_iteration": 2.677382230758667 + }, + { + "auxiliary_loss_clip": 0.0105241, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.03036928, + "balance_loss_mlp": 1.01801264, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 2.9576687352332196, + "language_loss": 0.69597006, + "learning_rate": 7.35310378768128e-07, + "loss": 0.71679199, + "num_input_tokens_seen": 260854610, + "step": 12081, + "time_per_iteration": 2.6186439990997314 + }, + { + "auxiliary_loss_clip": 0.01103513, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.03642035, + "balance_loss_mlp": 1.01913762, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 1.6871514160791632, + "language_loss": 0.81119049, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83253026, + "num_input_tokens_seen": 260871620, + "step": 12082, + "time_per_iteration": 2.495894193649292 + }, + { + "auxiliary_loss_clip": 0.01090545, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.03257108, + "balance_loss_mlp": 1.02374494, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.958278864292439, + "language_loss": 0.76760399, + "learning_rate": 7.347070528479158e-07, + "loss": 0.78887558, + "num_input_tokens_seen": 260890490, + "step": 12083, + "time_per_iteration": 2.61159086227417 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.03697813, + "balance_loss_mlp": 1.02030158, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.9658935111773452, + "language_loss": 0.72099853, + "learning_rate": 7.344054618521433e-07, + "loss": 0.74235535, + "num_input_tokens_seen": 260909700, + "step": 12084, + "time_per_iteration": 2.5384745597839355 + }, + { + "auxiliary_loss_clip": 0.01104422, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.03721368, + "balance_loss_mlp": 1.0214982, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.6729060585421973, + "language_loss": 0.77738971, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79876745, + "num_input_tokens_seen": 260929090, + "step": 12085, + "time_per_iteration": 2.633784055709839 + }, + { + "auxiliary_loss_clip": 0.01087927, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.03316736, + "balance_loss_mlp": 1.02363682, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.888196036786084, + "language_loss": 0.72372586, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74495339, + "num_input_tokens_seen": 260946615, + "step": 12086, + "time_per_iteration": 2.551391363143921 + }, + { + "auxiliary_loss_clip": 0.01057423, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.02944946, + "balance_loss_mlp": 1.02809119, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 1.9176255243214848, + "language_loss": 0.69466555, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71564937, + "num_input_tokens_seen": 260968515, + "step": 12087, + "time_per_iteration": 2.655305862426758 + }, + { + "auxiliary_loss_clip": 0.01102833, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03600621, + "balance_loss_mlp": 1.02061057, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 3.8582569079623137, + "language_loss": 0.78796887, + "learning_rate": 7.331995778981088e-07, + "loss": 0.80932277, + "num_input_tokens_seen": 260986790, + "step": 12088, + "time_per_iteration": 2.526005268096924 + }, + { + "auxiliary_loss_clip": 0.01086139, + "auxiliary_loss_mlp": 0.01037578, + "balance_loss_clip": 1.03294826, + "balance_loss_mlp": 1.02633703, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 2.207086280544058, + "language_loss": 0.73860008, + "learning_rate": 7.328982269740221e-07, + "loss": 0.75983727, + "num_input_tokens_seen": 261004925, + "step": 12089, + "time_per_iteration": 2.5494842529296875 + }, + { + "auxiliary_loss_clip": 0.01075823, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.03270614, + "balance_loss_mlp": 1.02322233, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.826100535630767, + "language_loss": 0.71153235, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73263979, + "num_input_tokens_seen": 261023895, + "step": 12090, + "time_per_iteration": 4.095763444900513 + }, + { + "auxiliary_loss_clip": 0.01047393, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.03126717, + "balance_loss_mlp": 1.01966059, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 2.0524342799794244, + "language_loss": 0.77108264, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79187649, + "num_input_tokens_seen": 261045445, + "step": 12091, + "time_per_iteration": 2.7933623790740967 + }, + { + "auxiliary_loss_clip": 0.0107916, + "auxiliary_loss_mlp": 0.007495, + "balance_loss_clip": 1.03076077, + "balance_loss_mlp": 1.00029361, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 2.002798316701555, + "language_loss": 0.71580708, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73409367, + "num_input_tokens_seen": 261064275, + "step": 12092, + "time_per_iteration": 2.5590603351593018 + }, + { + "auxiliary_loss_clip": 0.01089405, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.03497291, + "balance_loss_mlp": 1.01757765, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.8710994936442686, + "language_loss": 0.60993195, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63111329, + "num_input_tokens_seen": 261083310, + "step": 12093, + "time_per_iteration": 2.654977560043335 + }, + { + "auxiliary_loss_clip": 0.01077445, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.03390169, + "balance_loss_mlp": 1.0218395, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.7698569535936648, + "language_loss": 0.75367653, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77478325, + "num_input_tokens_seen": 261103460, + "step": 12094, + "time_per_iteration": 2.7590863704681396 + }, + { + "auxiliary_loss_clip": 0.01062631, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.0308975, + "balance_loss_mlp": 1.01965225, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 2.0317537148571483, + "language_loss": 0.84961367, + "learning_rate": 7.310911308504808e-07, + "loss": 0.87054139, + "num_input_tokens_seen": 261121375, + "step": 12095, + "time_per_iteration": 2.9284636974334717 + }, + { + "auxiliary_loss_clip": 0.01086715, + "auxiliary_loss_mlp": 0.01034817, + "balance_loss_clip": 1.03257775, + "balance_loss_mlp": 1.02319407, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.8047531523177895, + "language_loss": 0.77486736, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79608262, + "num_input_tokens_seen": 261141105, + "step": 12096, + "time_per_iteration": 2.5542330741882324 + }, + { + "auxiliary_loss_clip": 0.0110233, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.03704107, + "balance_loss_mlp": 1.02217329, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 1.8034860292228836, + "language_loss": 0.72326458, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74462008, + "num_input_tokens_seen": 261159255, + "step": 12097, + "time_per_iteration": 2.479034900665283 + }, + { + "auxiliary_loss_clip": 0.01089268, + "auxiliary_loss_mlp": 0.00749453, + "balance_loss_clip": 1.03469825, + "balance_loss_mlp": 1.00027609, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 1.8860539425063627, + "language_loss": 0.77140081, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78978801, + "num_input_tokens_seen": 261177960, + "step": 12098, + "time_per_iteration": 2.5469391345977783 + }, + { + "auxiliary_loss_clip": 0.01077962, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.0309999, + "balance_loss_mlp": 1.01895905, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.5665905140167267, + "language_loss": 0.67231214, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69339859, + "num_input_tokens_seen": 261205660, + "step": 12099, + "time_per_iteration": 2.978067636489868 + }, + { + "auxiliary_loss_clip": 0.01083975, + "auxiliary_loss_mlp": 0.0103885, + "balance_loss_clip": 1.03080857, + "balance_loss_mlp": 1.02430081, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 2.3369253496732756, + "language_loss": 0.72779536, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74902362, + "num_input_tokens_seen": 261225185, + "step": 12100, + "time_per_iteration": 2.548973560333252 + }, + { + "auxiliary_loss_clip": 0.01091161, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.03507519, + "balance_loss_mlp": 1.02262044, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.3479415979161782, + "language_loss": 0.74853259, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76977849, + "num_input_tokens_seen": 261247965, + "step": 12101, + "time_per_iteration": 2.607966899871826 + }, + { + "auxiliary_loss_clip": 0.01056981, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.03327203, + "balance_loss_mlp": 1.01885033, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 2.2115766847963374, + "language_loss": 0.82562077, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84648937, + "num_input_tokens_seen": 261267585, + "step": 12102, + "time_per_iteration": 2.604661226272583 + }, + { + "auxiliary_loss_clip": 0.01087787, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.03426933, + "balance_loss_mlp": 1.02083325, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.0660555006903647, + "language_loss": 0.81998897, + "learning_rate": 7.286843643386495e-07, + "loss": 0.84117764, + "num_input_tokens_seen": 261285200, + "step": 12103, + "time_per_iteration": 3.9539055824279785 + }, + { + "auxiliary_loss_clip": 0.01078425, + "auxiliary_loss_mlp": 0.01024845, + "balance_loss_clip": 1.03440738, + "balance_loss_mlp": 1.01335275, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.7135363626119349, + "language_loss": 0.66596693, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68699968, + "num_input_tokens_seen": 261303645, + "step": 12104, + "time_per_iteration": 2.5400335788726807 + }, + { + "auxiliary_loss_clip": 0.01067404, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.03420258, + "balance_loss_mlp": 1.02265525, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.879158956651321, + "language_loss": 0.65691578, + "learning_rate": 7.280831545667611e-07, + "loss": 0.67792666, + "num_input_tokens_seen": 261323265, + "step": 12105, + "time_per_iteration": 2.7194719314575195 + }, + { + "auxiliary_loss_clip": 0.01103351, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.03784406, + "balance_loss_mlp": 1.0195514, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.1054045007602014, + "language_loss": 0.7572062, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77855206, + "num_input_tokens_seen": 261339745, + "step": 12106, + "time_per_iteration": 2.452876091003418 + }, + { + "auxiliary_loss_clip": 0.01090414, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.03324401, + "balance_loss_mlp": 1.01892757, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.1661854145207644, + "language_loss": 0.70108616, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72229993, + "num_input_tokens_seen": 261359310, + "step": 12107, + "time_per_iteration": 2.625643730163574 + }, + { + "auxiliary_loss_clip": 0.01079143, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.03160632, + "balance_loss_mlp": 1.02248573, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.566861494537214, + "language_loss": 0.75103086, + "learning_rate": 7.271817016715205e-07, + "loss": 0.7721585, + "num_input_tokens_seen": 261384640, + "step": 12108, + "time_per_iteration": 2.9009110927581787 + }, + { + "auxiliary_loss_clip": 0.01098904, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.03326809, + "balance_loss_mlp": 1.01646519, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.6120114039765165, + "language_loss": 0.67147928, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69274741, + "num_input_tokens_seen": 261405290, + "step": 12109, + "time_per_iteration": 2.6032066345214844 + }, + { + "auxiliary_loss_clip": 0.01060814, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.032974, + "balance_loss_mlp": 1.01650786, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 2.462120797090744, + "language_loss": 0.63430667, + "learning_rate": 7.265809743826912e-07, + "loss": 0.6552006, + "num_input_tokens_seen": 261419710, + "step": 12110, + "time_per_iteration": 2.5891222953796387 + }, + { + "auxiliary_loss_clip": 0.01066951, + "auxiliary_loss_mlp": 0.01025571, + "balance_loss_clip": 1.0309422, + "balance_loss_mlp": 1.01354241, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.8363764996812884, + "language_loss": 0.58266771, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60359293, + "num_input_tokens_seen": 261442385, + "step": 12111, + "time_per_iteration": 4.293952941894531 + }, + { + "auxiliary_loss_clip": 0.01052245, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.03607237, + "balance_loss_mlp": 1.01931453, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 3.719341643432983, + "language_loss": 0.74262345, + "learning_rate": 7.259804402465677e-07, + "loss": 0.7634542, + "num_input_tokens_seen": 261459805, + "step": 12112, + "time_per_iteration": 2.663400650024414 + }, + { + "auxiliary_loss_clip": 0.0108494, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.03092957, + "balance_loss_mlp": 1.02068245, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.3748124121796215, + "language_loss": 0.66276622, + "learning_rate": 7.25680245639237e-07, + "loss": 0.68393248, + "num_input_tokens_seen": 261477175, + "step": 12113, + "time_per_iteration": 2.5444486141204834 + }, + { + "auxiliary_loss_clip": 0.01062042, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.03273916, + "balance_loss_mlp": 1.01632214, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 2.0660436182157134, + "language_loss": 0.7317602, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75266457, + "num_input_tokens_seen": 261494990, + "step": 12114, + "time_per_iteration": 2.661684989929199 + }, + { + "auxiliary_loss_clip": 0.01068353, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.0326333, + "balance_loss_mlp": 1.01720369, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.122550505142656, + "language_loss": 0.68106967, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70203894, + "num_input_tokens_seen": 261514445, + "step": 12115, + "time_per_iteration": 2.624509572982788 + }, + { + "auxiliary_loss_clip": 0.01100572, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.03361988, + "balance_loss_mlp": 1.01862359, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.6098131686453239, + "language_loss": 0.5977121, + "learning_rate": 7.247799517967674e-07, + "loss": 0.61902201, + "num_input_tokens_seen": 261533565, + "step": 12116, + "time_per_iteration": 2.501418113708496 + }, + { + "auxiliary_loss_clip": 0.01086863, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03366673, + "balance_loss_mlp": 1.01984918, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 9.626431596065286, + "language_loss": 0.72521424, + "learning_rate": 7.2447995054705e-07, + "loss": 0.74639553, + "num_input_tokens_seen": 261553795, + "step": 12117, + "time_per_iteration": 2.541508674621582 + }, + { + "auxiliary_loss_clip": 0.01083214, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.03344345, + "balance_loss_mlp": 1.01557398, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 4.965916074502727, + "language_loss": 0.69817156, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71927327, + "num_input_tokens_seen": 261572565, + "step": 12118, + "time_per_iteration": 4.003634452819824 + }, + { + "auxiliary_loss_clip": 0.01041744, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.03072667, + "balance_loss_mlp": 1.02477169, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.7491472880246006, + "language_loss": 0.84237254, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86314929, + "num_input_tokens_seen": 261590910, + "step": 12119, + "time_per_iteration": 2.6039040088653564 + }, + { + "auxiliary_loss_clip": 0.01099215, + "auxiliary_loss_mlp": 0.01025889, + "balance_loss_clip": 1.03409255, + "balance_loss_mlp": 1.01487947, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.3211200502043545, + "language_loss": 0.81852216, + "learning_rate": 7.235802370504831e-07, + "loss": 0.83977324, + "num_input_tokens_seen": 261606005, + "step": 12120, + "time_per_iteration": 2.481424570083618 + }, + { + "auxiliary_loss_clip": 0.01058781, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.03287578, + "balance_loss_mlp": 1.02413166, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 3.332669626267678, + "language_loss": 0.78915, + "learning_rate": 7.232804293403963e-07, + "loss": 0.81009185, + "num_input_tokens_seen": 261622305, + "step": 12121, + "time_per_iteration": 2.6062655448913574 + }, + { + "auxiliary_loss_clip": 0.01098728, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.03166986, + "balance_loss_mlp": 1.02460814, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.6608926467179002, + "language_loss": 0.68679726, + "learning_rate": 7.229806700436441e-07, + "loss": 0.70814723, + "num_input_tokens_seen": 261642465, + "step": 12122, + "time_per_iteration": 2.564962387084961 + }, + { + "auxiliary_loss_clip": 0.01049494, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.02950943, + "balance_loss_mlp": 1.02014208, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 2.0360396833351406, + "language_loss": 0.87161541, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89241552, + "num_input_tokens_seen": 261661420, + "step": 12123, + "time_per_iteration": 2.658475399017334 + }, + { + "auxiliary_loss_clip": 0.01063751, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.03159285, + "balance_loss_mlp": 1.0162878, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 1.9591127828765824, + "language_loss": 0.82546622, + "learning_rate": 7.223812967356065e-07, + "loss": 0.84637523, + "num_input_tokens_seen": 261680865, + "step": 12124, + "time_per_iteration": 2.672332525253296 + }, + { + "auxiliary_loss_clip": 0.01069712, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.0334276, + "balance_loss_mlp": 1.02011275, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.875605637522652, + "language_loss": 0.67252743, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69353247, + "num_input_tokens_seen": 261701455, + "step": 12125, + "time_per_iteration": 2.5947680473327637 + }, + { + "auxiliary_loss_clip": 0.01090869, + "auxiliary_loss_mlp": 0.01038952, + "balance_loss_clip": 1.03369522, + "balance_loss_mlp": 1.02649522, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.8456698096509228, + "language_loss": 0.74674976, + "learning_rate": 7.217821172172855e-07, + "loss": 0.76804799, + "num_input_tokens_seen": 261721260, + "step": 12126, + "time_per_iteration": 2.5841944217681885 + }, + { + "auxiliary_loss_clip": 0.01006307, + "auxiliary_loss_mlp": 0.01000707, + "balance_loss_clip": 1.00635576, + "balance_loss_mlp": 0.99972957, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.829284436811799, + "language_loss": 0.58630925, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60637939, + "num_input_tokens_seen": 261779370, + "step": 12127, + "time_per_iteration": 3.03777813911438 + }, + { + "auxiliary_loss_clip": 0.01061726, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.0311321, + "balance_loss_mlp": 1.0199784, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 2.032476909239938, + "language_loss": 0.68872023, + "learning_rate": 7.21183131579562e-07, + "loss": 0.70963776, + "num_input_tokens_seen": 261798050, + "step": 12128, + "time_per_iteration": 2.6725521087646484 + }, + { + "auxiliary_loss_clip": 0.01069552, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.03218508, + "balance_loss_mlp": 1.02051401, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 2.411699207181752, + "language_loss": 0.65590584, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67692602, + "num_input_tokens_seen": 261817660, + "step": 12129, + "time_per_iteration": 2.645768165588379 + }, + { + "auxiliary_loss_clip": 0.01097022, + "auxiliary_loss_mlp": 0.01026787, + "balance_loss_clip": 1.03409445, + "balance_loss_mlp": 1.01554012, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 2.5113087873253646, + "language_loss": 0.74427474, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76551288, + "num_input_tokens_seen": 261837935, + "step": 12130, + "time_per_iteration": 4.1195104122161865 + }, + { + "auxiliary_loss_clip": 0.01070418, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.03072572, + "balance_loss_mlp": 1.01811874, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.5563510277703914, + "language_loss": 0.69468176, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71568298, + "num_input_tokens_seen": 261857575, + "step": 12131, + "time_per_iteration": 2.604492425918579 + }, + { + "auxiliary_loss_clip": 0.01064728, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.03408587, + "balance_loss_mlp": 1.02008557, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.9319631540023559, + "language_loss": 0.77317637, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79412711, + "num_input_tokens_seen": 261877265, + "step": 12132, + "time_per_iteration": 2.6555514335632324 + }, + { + "auxiliary_loss_clip": 0.01089408, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.03464532, + "balance_loss_mlp": 1.02449405, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.1611493418186423, + "language_loss": 0.79070568, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81195045, + "num_input_tokens_seen": 261893695, + "step": 12133, + "time_per_iteration": 2.5463063716888428 + }, + { + "auxiliary_loss_clip": 0.01047052, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.02949262, + "balance_loss_mlp": 1.01893461, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.7921494910659523, + "language_loss": 0.71971422, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74048877, + "num_input_tokens_seen": 261911825, + "step": 12134, + "time_per_iteration": 2.6610779762268066 + }, + { + "auxiliary_loss_clip": 0.01078914, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.03400731, + "balance_loss_mlp": 1.02434778, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.6157036002229606, + "language_loss": 0.71179652, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73294288, + "num_input_tokens_seen": 261931190, + "step": 12135, + "time_per_iteration": 2.5798304080963135 + }, + { + "auxiliary_loss_clip": 0.01062503, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.03133607, + "balance_loss_mlp": 1.02090359, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.950862360066841, + "language_loss": 0.62274706, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64369261, + "num_input_tokens_seen": 261951240, + "step": 12136, + "time_per_iteration": 2.663153886795044 + }, + { + "auxiliary_loss_clip": 0.01081553, + "auxiliary_loss_mlp": 0.00749284, + "balance_loss_clip": 1.03155303, + "balance_loss_mlp": 1.0002234, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 2.3239880599954765, + "language_loss": 0.74377704, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76208544, + "num_input_tokens_seen": 261971605, + "step": 12137, + "time_per_iteration": 2.561296224594116 + }, + { + "auxiliary_loss_clip": 0.01092011, + "auxiliary_loss_mlp": 0.00749351, + "balance_loss_clip": 1.03644848, + "balance_loss_mlp": 1.00028443, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.6454718150974703, + "language_loss": 0.7391175, + "learning_rate": 7.181911147788069e-07, + "loss": 0.75753117, + "num_input_tokens_seen": 261990830, + "step": 12138, + "time_per_iteration": 2.6266636848449707 + }, + { + "auxiliary_loss_clip": 0.01063024, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.03231943, + "balance_loss_mlp": 1.01765752, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.368460030001133, + "language_loss": 0.71551096, + "learning_rate": 7.178921802463702e-07, + "loss": 0.73642242, + "num_input_tokens_seen": 262008190, + "step": 12139, + "time_per_iteration": 2.6408839225769043 + }, + { + "auxiliary_loss_clip": 0.01084639, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.0338819, + "balance_loss_mlp": 1.01638412, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.5022274737459662, + "language_loss": 0.73370951, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75482059, + "num_input_tokens_seen": 262030460, + "step": 12140, + "time_per_iteration": 2.60593843460083 + }, + { + "auxiliary_loss_clip": 0.01073684, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.03251588, + "balance_loss_mlp": 1.01956379, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.7001858357658586, + "language_loss": 0.55235338, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57340026, + "num_input_tokens_seen": 262050830, + "step": 12141, + "time_per_iteration": 2.6824915409088135 + }, + { + "auxiliary_loss_clip": 0.01057159, + "auxiliary_loss_mlp": 0.01026205, + "balance_loss_clip": 1.03020072, + "balance_loss_mlp": 1.01574993, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 3.1647208930742563, + "language_loss": 0.7266196, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74745321, + "num_input_tokens_seen": 262071245, + "step": 12142, + "time_per_iteration": 2.6519269943237305 + }, + { + "auxiliary_loss_clip": 0.01098011, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.03460431, + "balance_loss_mlp": 1.02487385, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.7091735867474696, + "language_loss": 0.73950851, + "learning_rate": 7.16696928406521e-07, + "loss": 0.76083899, + "num_input_tokens_seen": 262087525, + "step": 12143, + "time_per_iteration": 4.016369819641113 + }, + { + "auxiliary_loss_clip": 0.01058487, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.0313952, + "balance_loss_mlp": 1.02120566, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 2.1590989617008924, + "language_loss": 0.6670081, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68791682, + "num_input_tokens_seen": 262107355, + "step": 12144, + "time_per_iteration": 2.6932313442230225 + }, + { + "auxiliary_loss_clip": 0.01074365, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.03312325, + "balance_loss_mlp": 1.01780879, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.711021521771969, + "language_loss": 0.79059511, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81162459, + "num_input_tokens_seen": 262125645, + "step": 12145, + "time_per_iteration": 2.623610734939575 + }, + { + "auxiliary_loss_clip": 0.01056501, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.03151608, + "balance_loss_mlp": 1.02132654, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.756559220735825, + "language_loss": 0.91430593, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93519783, + "num_input_tokens_seen": 262144075, + "step": 12146, + "time_per_iteration": 2.6655688285827637 + }, + { + "auxiliary_loss_clip": 0.01095649, + "auxiliary_loss_mlp": 0.01026243, + "balance_loss_clip": 1.03433371, + "balance_loss_mlp": 1.01566291, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 1.796916706808513, + "language_loss": 0.61861658, + "learning_rate": 7.155024551743316e-07, + "loss": 0.63983548, + "num_input_tokens_seen": 262165940, + "step": 12147, + "time_per_iteration": 2.523723840713501 + }, + { + "auxiliary_loss_clip": 0.01101274, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.03599572, + "balance_loss_mlp": 1.02484393, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 3.62302456246213, + "language_loss": 0.75238067, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77375519, + "num_input_tokens_seen": 262184520, + "step": 12148, + "time_per_iteration": 2.536233425140381 + }, + { + "auxiliary_loss_clip": 0.01012742, + "auxiliary_loss_mlp": 0.00746554, + "balance_loss_clip": 1.01181328, + "balance_loss_mlp": 0.99993736, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.8518046970839294, + "language_loss": 0.5676285, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58522147, + "num_input_tokens_seen": 262247070, + "step": 12149, + "time_per_iteration": 3.1499900817871094 + }, + { + "auxiliary_loss_clip": 0.01077938, + "auxiliary_loss_mlp": 0.01031898, + "balance_loss_clip": 1.03234243, + "balance_loss_mlp": 1.0208174, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.7756883840755942, + "language_loss": 0.73960978, + "learning_rate": 7.146071116474451e-07, + "loss": 0.76070815, + "num_input_tokens_seen": 262266605, + "step": 12150, + "time_per_iteration": 2.5990147590637207 + }, + { + "auxiliary_loss_clip": 0.01099763, + "auxiliary_loss_mlp": 0.01031052, + "balance_loss_clip": 1.033144, + "balance_loss_mlp": 1.01950097, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 1.9054330161140647, + "language_loss": 0.84135878, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86266696, + "num_input_tokens_seen": 262283880, + "step": 12151, + "time_per_iteration": 3.985198736190796 + }, + { + "auxiliary_loss_clip": 0.01056139, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.03052688, + "balance_loss_mlp": 1.02252376, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.922906132526911, + "language_loss": 0.78054166, + "learning_rate": 7.14010459655127e-07, + "loss": 0.80144429, + "num_input_tokens_seen": 262304155, + "step": 12152, + "time_per_iteration": 2.6488633155822754 + }, + { + "auxiliary_loss_clip": 0.01064761, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03404438, + "balance_loss_mlp": 1.0206244, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.6395118810003722, + "language_loss": 0.79832232, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81928569, + "num_input_tokens_seen": 262325660, + "step": 12153, + "time_per_iteration": 2.673692464828491 + }, + { + "auxiliary_loss_clip": 0.01089272, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.03352976, + "balance_loss_mlp": 1.02045739, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.6202146290255504, + "language_loss": 0.67290807, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69411677, + "num_input_tokens_seen": 262344075, + "step": 12154, + "time_per_iteration": 2.5226821899414062 + }, + { + "auxiliary_loss_clip": 0.01050794, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.03316379, + "balance_loss_mlp": 1.02117062, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.7990021462531622, + "language_loss": 0.65996504, + "learning_rate": 7.131158474313128e-07, + "loss": 0.68079937, + "num_input_tokens_seen": 262363305, + "step": 12155, + "time_per_iteration": 2.7281622886657715 + }, + { + "auxiliary_loss_clip": 0.01077391, + "auxiliary_loss_mlp": 0.01029159, + "balance_loss_clip": 1.03427184, + "balance_loss_mlp": 1.01816809, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.687626784640937, + "language_loss": 0.81790614, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83897161, + "num_input_tokens_seen": 262380730, + "step": 12156, + "time_per_iteration": 2.7043306827545166 + }, + { + "auxiliary_loss_clip": 0.01055236, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.03045762, + "balance_loss_mlp": 1.02127218, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 1.9199517447515633, + "language_loss": 0.7517879, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77265966, + "num_input_tokens_seen": 262395480, + "step": 12157, + "time_per_iteration": 2.5768256187438965 + }, + { + "auxiliary_loss_clip": 0.01083722, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.03223181, + "balance_loss_mlp": 1.0183872, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 1.8922668080460872, + "language_loss": 0.73381257, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75492847, + "num_input_tokens_seen": 262413340, + "step": 12158, + "time_per_iteration": 2.534193515777588 + }, + { + "auxiliary_loss_clip": 0.01073712, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.03267848, + "balance_loss_mlp": 1.02076602, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 3.3927050987285683, + "language_loss": 0.85547048, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87652254, + "num_input_tokens_seen": 262433455, + "step": 12159, + "time_per_iteration": 4.053854703903198 + }, + { + "auxiliary_loss_clip": 0.01078326, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.03207994, + "balance_loss_mlp": 1.0197382, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.0891310696677805, + "language_loss": 0.73421752, + "learning_rate": 7.116258031844895e-07, + "loss": 0.7553131, + "num_input_tokens_seen": 262450335, + "step": 12160, + "time_per_iteration": 2.553208112716675 + }, + { + "auxiliary_loss_clip": 0.01090802, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.03488755, + "balance_loss_mlp": 1.02146244, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 2.41257163597822, + "language_loss": 0.72569084, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74692845, + "num_input_tokens_seen": 262468240, + "step": 12161, + "time_per_iteration": 2.5438106060028076 + }, + { + "auxiliary_loss_clip": 0.01066656, + "auxiliary_loss_mlp": 0.00749449, + "balance_loss_clip": 1.03182673, + "balance_loss_mlp": 1.00021839, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 1.783399608379028, + "language_loss": 0.69454098, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71270204, + "num_input_tokens_seen": 262487045, + "step": 12162, + "time_per_iteration": 2.6469717025756836 + }, + { + "auxiliary_loss_clip": 0.01091548, + "auxiliary_loss_mlp": 0.01028759, + "balance_loss_clip": 1.03638113, + "balance_loss_mlp": 1.01737452, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.7932646676315698, + "language_loss": 0.66559744, + "learning_rate": 7.107323628093382e-07, + "loss": 0.6868006, + "num_input_tokens_seen": 262504855, + "step": 12163, + "time_per_iteration": 2.581873655319214 + }, + { + "auxiliary_loss_clip": 0.01078728, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.03409338, + "balance_loss_mlp": 1.01882195, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.612584467132913, + "language_loss": 0.68521124, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70630097, + "num_input_tokens_seen": 262524920, + "step": 12164, + "time_per_iteration": 2.606719732284546 + }, + { + "auxiliary_loss_clip": 0.01047967, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.03255117, + "balance_loss_mlp": 1.02127147, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.6078724715350647, + "language_loss": 0.72983003, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75062549, + "num_input_tokens_seen": 262545725, + "step": 12165, + "time_per_iteration": 2.6408517360687256 + }, + { + "auxiliary_loss_clip": 0.01090898, + "auxiliary_loss_mlp": 0.01033881, + "balance_loss_clip": 1.03511536, + "balance_loss_mlp": 1.02242446, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.8300210128323824, + "language_loss": 0.76185912, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78310692, + "num_input_tokens_seen": 262565480, + "step": 12166, + "time_per_iteration": 2.576167583465576 + }, + { + "auxiliary_loss_clip": 0.01067556, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.0327543, + "balance_loss_mlp": 1.01675689, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 1.7262158757529245, + "language_loss": 0.79620647, + "learning_rate": 7.095417934766781e-07, + "loss": 0.8171581, + "num_input_tokens_seen": 262584145, + "step": 12167, + "time_per_iteration": 2.576068639755249 + }, + { + "auxiliary_loss_clip": 0.01087178, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.03388524, + "balance_loss_mlp": 1.02592111, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.9666701830315012, + "language_loss": 0.76928627, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79052222, + "num_input_tokens_seen": 262604045, + "step": 12168, + "time_per_iteration": 2.6264514923095703 + }, + { + "auxiliary_loss_clip": 0.01081167, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.03147483, + "balance_loss_mlp": 1.02648163, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 2.8507057466131718, + "language_loss": 0.81972682, + "learning_rate": 7.089468023710326e-07, + "loss": 0.84094059, + "num_input_tokens_seen": 262624540, + "step": 12169, + "time_per_iteration": 4.053924798965454 + }, + { + "auxiliary_loss_clip": 0.01085106, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.03303385, + "balance_loss_mlp": 1.02364278, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.8918711362423495, + "language_loss": 0.70384532, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72504723, + "num_input_tokens_seen": 262644545, + "step": 12170, + "time_per_iteration": 2.6766793727874756 + }, + { + "auxiliary_loss_clip": 0.01098123, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.0340122, + "balance_loss_mlp": 1.01885343, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.2212489241668445, + "language_loss": 0.69854045, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71982813, + "num_input_tokens_seen": 262662570, + "step": 12171, + "time_per_iteration": 2.5244522094726562 + }, + { + "auxiliary_loss_clip": 0.01100209, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03577018, + "balance_loss_mlp": 1.02014875, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 1.6969266403096854, + "language_loss": 0.65497649, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67629266, + "num_input_tokens_seen": 262683245, + "step": 12172, + "time_per_iteration": 2.589308977127075 + }, + { + "auxiliary_loss_clip": 0.01100433, + "auxiliary_loss_mlp": 0.01028726, + "balance_loss_clip": 1.0355649, + "balance_loss_mlp": 1.01738954, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.344799442897994, + "language_loss": 0.60951704, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63080859, + "num_input_tokens_seen": 262701585, + "step": 12173, + "time_per_iteration": 2.5076253414154053 + }, + { + "auxiliary_loss_clip": 0.01044258, + "auxiliary_loss_mlp": 0.01027034, + "balance_loss_clip": 1.03146124, + "balance_loss_mlp": 1.01635337, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 1.7549006698942367, + "language_loss": 0.73920953, + "learning_rate": 7.074601815494243e-07, + "loss": 0.7599225, + "num_input_tokens_seen": 262719295, + "step": 12174, + "time_per_iteration": 2.661018133163452 + }, + { + "auxiliary_loss_clip": 0.0109647, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.01482368, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.7650540760349456, + "language_loss": 0.80934173, + "learning_rate": 7.071630043797317e-07, + "loss": 0.83056045, + "num_input_tokens_seen": 262739995, + "step": 12175, + "time_per_iteration": 2.5832178592681885 + }, + { + "auxiliary_loss_clip": 0.01076994, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.03234231, + "balance_loss_mlp": 1.01791024, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 2.8424412528879994, + "language_loss": 0.76365542, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78471255, + "num_input_tokens_seen": 262757680, + "step": 12176, + "time_per_iteration": 2.550199270248413 + }, + { + "auxiliary_loss_clip": 0.01085965, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.03436363, + "balance_loss_mlp": 1.02051115, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 1.4412725126357293, + "language_loss": 0.76736104, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78853214, + "num_input_tokens_seen": 262776990, + "step": 12177, + "time_per_iteration": 2.513828992843628 + }, + { + "auxiliary_loss_clip": 0.01063892, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.031394, + "balance_loss_mlp": 1.01953673, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 4.141982404604496, + "language_loss": 0.74426174, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76519859, + "num_input_tokens_seen": 262795440, + "step": 12178, + "time_per_iteration": 2.5739190578460693 + }, + { + "auxiliary_loss_clip": 0.01079302, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.03327012, + "balance_loss_mlp": 1.01636147, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 2.5418837853936793, + "language_loss": 0.82189846, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84296763, + "num_input_tokens_seen": 262816385, + "step": 12179, + "time_per_iteration": 2.603893756866455 + }, + { + "auxiliary_loss_clip": 0.01057823, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.03076792, + "balance_loss_mlp": 1.02427793, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.6680065523839136, + "language_loss": 0.74593842, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76687944, + "num_input_tokens_seen": 262834955, + "step": 12180, + "time_per_iteration": 2.6851584911346436 + }, + { + "auxiliary_loss_clip": 0.01088235, + "auxiliary_loss_mlp": 0.007494, + "balance_loss_clip": 1.03151417, + "balance_loss_mlp": 1.00028276, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.882574020928822, + "language_loss": 0.79648554, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81486189, + "num_input_tokens_seen": 262853555, + "step": 12181, + "time_per_iteration": 2.6547279357910156 + }, + { + "auxiliary_loss_clip": 0.0109113, + "auxiliary_loss_mlp": 0.0074936, + "balance_loss_clip": 1.03475523, + "balance_loss_mlp": 1.00028563, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 2.7861324646188175, + "language_loss": 0.71683514, + "learning_rate": 7.050841375089506e-07, + "loss": 0.73523998, + "num_input_tokens_seen": 262870975, + "step": 12182, + "time_per_iteration": 4.0004801750183105 + }, + { + "auxiliary_loss_clip": 0.01100595, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.03525758, + "balance_loss_mlp": 1.01990962, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.5900381809262158, + "language_loss": 0.70970583, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73102021, + "num_input_tokens_seen": 262892635, + "step": 12183, + "time_per_iteration": 2.6354353427886963 + }, + { + "auxiliary_loss_clip": 0.01089367, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.03599036, + "balance_loss_mlp": 1.0206486, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 2.161745976522598, + "language_loss": 0.72397327, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74519479, + "num_input_tokens_seen": 262910725, + "step": 12184, + "time_per_iteration": 2.6886351108551025 + }, + { + "auxiliary_loss_clip": 0.01004315, + "auxiliary_loss_mlp": 0.00998799, + "balance_loss_clip": 1.00369608, + "balance_loss_mlp": 0.99787539, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.7538789197385789, + "language_loss": 0.65199107, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67202222, + "num_input_tokens_seen": 262974150, + "step": 12185, + "time_per_iteration": 3.109989881515503 + }, + { + "auxiliary_loss_clip": 0.01098254, + "auxiliary_loss_mlp": 0.01025478, + "balance_loss_clip": 1.03250659, + "balance_loss_mlp": 1.01413512, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 2.598626055565491, + "language_loss": 0.8035816, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82481897, + "num_input_tokens_seen": 262993370, + "step": 12186, + "time_per_iteration": 2.4984374046325684 + }, + { + "auxiliary_loss_clip": 0.01080818, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.03289986, + "balance_loss_mlp": 1.01965141, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.6179622760722554, + "language_loss": 0.73436874, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75548935, + "num_input_tokens_seen": 263012665, + "step": 12187, + "time_per_iteration": 2.5654616355895996 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.0355866, + "balance_loss_mlp": 1.02127051, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.7609922501074216, + "language_loss": 0.89218724, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91353089, + "num_input_tokens_seen": 263031475, + "step": 12188, + "time_per_iteration": 2.4999356269836426 + }, + { + "auxiliary_loss_clip": 0.0105275, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.03027821, + "balance_loss_mlp": 1.02275717, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 2.012598570182323, + "language_loss": 0.74726778, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76814443, + "num_input_tokens_seen": 263051445, + "step": 12189, + "time_per_iteration": 2.718414545059204 + }, + { + "auxiliary_loss_clip": 0.01065738, + "auxiliary_loss_mlp": 0.01028673, + "balance_loss_clip": 1.03185368, + "balance_loss_mlp": 1.01682413, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.578653668842749, + "language_loss": 0.82104039, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84198451, + "num_input_tokens_seen": 263070835, + "step": 12190, + "time_per_iteration": 2.6144587993621826 + }, + { + "auxiliary_loss_clip": 0.01059236, + "auxiliary_loss_mlp": 0.01038063, + "balance_loss_clip": 1.03294635, + "balance_loss_mlp": 1.02565384, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.7001620007686409, + "language_loss": 0.71724641, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73821938, + "num_input_tokens_seen": 263090070, + "step": 12191, + "time_per_iteration": 2.6486198902130127 + }, + { + "auxiliary_loss_clip": 0.01100809, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.03558457, + "balance_loss_mlp": 1.02652311, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.581814825415136, + "language_loss": 0.68956673, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71095634, + "num_input_tokens_seen": 263110030, + "step": 12192, + "time_per_iteration": 4.087995529174805 + }, + { + "auxiliary_loss_clip": 0.01086187, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.03326154, + "balance_loss_mlp": 1.01841784, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.696715959327114, + "language_loss": 0.7314446, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75260198, + "num_input_tokens_seen": 263129735, + "step": 12193, + "time_per_iteration": 2.5372183322906494 + }, + { + "auxiliary_loss_clip": 0.01081647, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.03303123, + "balance_loss_mlp": 1.01898026, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 1.7106068064638447, + "language_loss": 0.76975781, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79088026, + "num_input_tokens_seen": 263149100, + "step": 12194, + "time_per_iteration": 2.511897087097168 + }, + { + "auxiliary_loss_clip": 0.01088629, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.03571057, + "balance_loss_mlp": 1.01638961, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 2.067707534658556, + "language_loss": 0.70627618, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72744709, + "num_input_tokens_seen": 263166620, + "step": 12195, + "time_per_iteration": 2.529055595397949 + }, + { + "auxiliary_loss_clip": 0.01099344, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.03383505, + "balance_loss_mlp": 1.02569127, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.8312878312713095, + "language_loss": 0.7232008, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74456096, + "num_input_tokens_seen": 263184780, + "step": 12196, + "time_per_iteration": 2.4929606914520264 + }, + { + "auxiliary_loss_clip": 0.01099361, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.03540254, + "balance_loss_mlp": 1.02107191, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.914253239909325, + "language_loss": 0.71416414, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73549092, + "num_input_tokens_seen": 263204625, + "step": 12197, + "time_per_iteration": 2.5446791648864746 + }, + { + "auxiliary_loss_clip": 0.01045862, + "auxiliary_loss_mlp": 0.00749695, + "balance_loss_clip": 1.02936232, + "balance_loss_mlp": 1.00031757, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 1.8573017582330058, + "language_loss": 0.78224027, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80019593, + "num_input_tokens_seen": 263221565, + "step": 12198, + "time_per_iteration": 2.70326566696167 + }, + { + "auxiliary_loss_clip": 0.01042356, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.03223872, + "balance_loss_mlp": 1.01984978, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 2.366568010545421, + "language_loss": 0.74294257, + "learning_rate": 7.000454855504974e-07, + "loss": 0.7636739, + "num_input_tokens_seen": 263240620, + "step": 12199, + "time_per_iteration": 4.238009452819824 + }, + { + "auxiliary_loss_clip": 0.01082463, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.035043, + "balance_loss_mlp": 1.0194329, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.335607802006674, + "language_loss": 0.76733744, + "learning_rate": 6.997495373693729e-07, + "loss": 0.78847593, + "num_input_tokens_seen": 263254365, + "step": 12200, + "time_per_iteration": 2.5493216514587402 + }, + { + "auxiliary_loss_clip": 0.01061354, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.03353393, + "balance_loss_mlp": 1.01807332, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 1.9240404430998483, + "language_loss": 0.61417019, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63507342, + "num_input_tokens_seen": 263275880, + "step": 12201, + "time_per_iteration": 2.6580381393432617 + }, + { + "auxiliary_loss_clip": 0.01058569, + "auxiliary_loss_mlp": 0.00749344, + "balance_loss_clip": 1.03004289, + "balance_loss_mlp": 1.00026274, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.4995675073693173, + "language_loss": 0.5226528, + "learning_rate": 6.991577889352264e-07, + "loss": 0.54073197, + "num_input_tokens_seen": 263298315, + "step": 12202, + "time_per_iteration": 2.7961223125457764 + }, + { + "auxiliary_loss_clip": 0.01072115, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.03084683, + "balance_loss_mlp": 1.01621282, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.8481252328302156, + "language_loss": 0.68852293, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70951509, + "num_input_tokens_seen": 263318615, + "step": 12203, + "time_per_iteration": 2.5861778259277344 + }, + { + "auxiliary_loss_clip": 0.01074673, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.03378749, + "balance_loss_mlp": 1.0234611, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.832381230278726, + "language_loss": 0.66678554, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68788189, + "num_input_tokens_seen": 263336705, + "step": 12204, + "time_per_iteration": 2.6164941787719727 + }, + { + "auxiliary_loss_clip": 0.01075919, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.03503084, + "balance_loss_mlp": 1.02115047, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 1.8721267228510043, + "language_loss": 0.77290022, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79397714, + "num_input_tokens_seen": 263355065, + "step": 12205, + "time_per_iteration": 2.605501890182495 + }, + { + "auxiliary_loss_clip": 0.0104305, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.03144693, + "balance_loss_mlp": 1.01814866, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.5649094260218035, + "language_loss": 0.79571366, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81642938, + "num_input_tokens_seen": 263374460, + "step": 12206, + "time_per_iteration": 2.6628971099853516 + }, + { + "auxiliary_loss_clip": 0.01057192, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.03064561, + "balance_loss_mlp": 1.01606607, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.9281371362949686, + "language_loss": 0.71822178, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73907161, + "num_input_tokens_seen": 263393610, + "step": 12207, + "time_per_iteration": 2.7519278526306152 + }, + { + "auxiliary_loss_clip": 0.0100703, + "auxiliary_loss_mlp": 0.01004119, + "balance_loss_clip": 1.00639987, + "balance_loss_mlp": 1.00315332, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.8264877698938558, + "language_loss": 0.54802394, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56813538, + "num_input_tokens_seen": 263450340, + "step": 12208, + "time_per_iteration": 3.1803815364837646 + }, + { + "auxiliary_loss_clip": 0.01097453, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.03440881, + "balance_loss_mlp": 1.01703358, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.36801199307037, + "language_loss": 0.80342233, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82466692, + "num_input_tokens_seen": 263471735, + "step": 12209, + "time_per_iteration": 4.022647142410278 + }, + { + "auxiliary_loss_clip": 0.01095293, + "auxiliary_loss_mlp": 0.01028624, + "balance_loss_clip": 1.0319314, + "balance_loss_mlp": 1.01807439, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.4535245624191817, + "language_loss": 0.79152215, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81276131, + "num_input_tokens_seen": 263493245, + "step": 12210, + "time_per_iteration": 2.6142609119415283 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.034446, + "balance_loss_mlp": 1.01789117, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.715567011991498, + "language_loss": 0.76201737, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78328371, + "num_input_tokens_seen": 263511660, + "step": 12211, + "time_per_iteration": 2.482337713241577 + }, + { + "auxiliary_loss_clip": 0.01073954, + "auxiliary_loss_mlp": 0.01026011, + "balance_loss_clip": 1.03406835, + "balance_loss_mlp": 1.01453662, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 2.0194385747553096, + "language_loss": 0.72017682, + "learning_rate": 6.962020082425748e-07, + "loss": 0.74117649, + "num_input_tokens_seen": 263530875, + "step": 12212, + "time_per_iteration": 2.6894752979278564 + }, + { + "auxiliary_loss_clip": 0.01099963, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.03585911, + "balance_loss_mlp": 1.02385986, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.5146090087849216, + "language_loss": 0.68692827, + "learning_rate": 6.959067019092766e-07, + "loss": 0.70827723, + "num_input_tokens_seen": 263551585, + "step": 12213, + "time_per_iteration": 2.501757860183716 + }, + { + "auxiliary_loss_clip": 0.01025898, + "auxiliary_loss_mlp": 0.01002957, + "balance_loss_clip": 1.0057292, + "balance_loss_mlp": 1.00203919, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7254813218288197, + "language_loss": 0.5428623, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56315082, + "num_input_tokens_seen": 263609545, + "step": 12214, + "time_per_iteration": 2.9467201232910156 + }, + { + "auxiliary_loss_clip": 0.01100859, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.03398204, + "balance_loss_mlp": 1.01615226, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 1.9805938975659814, + "language_loss": 0.70323426, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72451472, + "num_input_tokens_seen": 263627880, + "step": 12215, + "time_per_iteration": 2.460956335067749 + }, + { + "auxiliary_loss_clip": 0.01067162, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.03168321, + "balance_loss_mlp": 1.01759171, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.6089394751261155, + "language_loss": 0.72441363, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74536347, + "num_input_tokens_seen": 263645665, + "step": 12216, + "time_per_iteration": 2.5698745250701904 + }, + { + "auxiliary_loss_clip": 0.01103896, + "auxiliary_loss_mlp": 0.01039091, + "balance_loss_clip": 1.03497338, + "balance_loss_mlp": 1.02586472, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.8686304628805968, + "language_loss": 0.7810362, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80246603, + "num_input_tokens_seen": 263668170, + "step": 12217, + "time_per_iteration": 2.497772693634033 + }, + { + "auxiliary_loss_clip": 0.01061301, + "auxiliary_loss_mlp": 0.01023702, + "balance_loss_clip": 1.03290904, + "balance_loss_mlp": 1.01392663, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 2.020179324923711, + "language_loss": 0.78125536, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80210537, + "num_input_tokens_seen": 263684190, + "step": 12218, + "time_per_iteration": 2.587956190109253 + }, + { + "auxiliary_loss_clip": 0.01044606, + "auxiliary_loss_mlp": 0.01035493, + "balance_loss_clip": 1.02840209, + "balance_loss_mlp": 1.02246356, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 1.8191528577198386, + "language_loss": 0.72027212, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74107313, + "num_input_tokens_seen": 263702095, + "step": 12219, + "time_per_iteration": 2.6012649536132812 + }, + { + "auxiliary_loss_clip": 0.01071734, + "auxiliary_loss_mlp": 0.0102639, + "balance_loss_clip": 1.03098238, + "balance_loss_mlp": 1.01561928, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.802571033615404, + "language_loss": 0.74886847, + "learning_rate": 6.938409428408061e-07, + "loss": 0.76984972, + "num_input_tokens_seen": 263721385, + "step": 12220, + "time_per_iteration": 2.620204448699951 + }, + { + "auxiliary_loss_clip": 0.01087365, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.03250122, + "balance_loss_mlp": 1.01751649, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.6827234704928464, + "language_loss": 0.66157204, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68273079, + "num_input_tokens_seen": 263737835, + "step": 12221, + "time_per_iteration": 2.4905238151550293 + }, + { + "auxiliary_loss_clip": 0.01069621, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.03067946, + "balance_loss_mlp": 1.02060866, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 2.0025534806927907, + "language_loss": 0.69692862, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71795487, + "num_input_tokens_seen": 263756480, + "step": 12222, + "time_per_iteration": 4.05775260925293 + }, + { + "auxiliary_loss_clip": 0.01050266, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.02958202, + "balance_loss_mlp": 1.02076221, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.582364866399794, + "language_loss": 0.65923566, + "learning_rate": 6.92956360247217e-07, + "loss": 0.68004632, + "num_input_tokens_seen": 263776440, + "step": 12223, + "time_per_iteration": 2.6443662643432617 + }, + { + "auxiliary_loss_clip": 0.01079605, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.03207099, + "balance_loss_mlp": 1.01725733, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 3.2677179218433596, + "language_loss": 0.72232741, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74340665, + "num_input_tokens_seen": 263793700, + "step": 12224, + "time_per_iteration": 2.5749385356903076 + }, + { + "auxiliary_loss_clip": 0.01070352, + "auxiliary_loss_mlp": 0.01026093, + "balance_loss_clip": 1.03467596, + "balance_loss_mlp": 1.01508999, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.8859813961085377, + "language_loss": 0.72511172, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74607611, + "num_input_tokens_seen": 263814620, + "step": 12225, + "time_per_iteration": 2.71822190284729 + }, + { + "auxiliary_loss_clip": 0.01101271, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.03400254, + "balance_loss_mlp": 1.01910639, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 4.105908174704904, + "language_loss": 0.76524568, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78657043, + "num_input_tokens_seen": 263832725, + "step": 12226, + "time_per_iteration": 2.523869276046753 + }, + { + "auxiliary_loss_clip": 0.01068028, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.03024578, + "balance_loss_mlp": 1.01728845, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 2.175859107275442, + "language_loss": 0.66876721, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68974727, + "num_input_tokens_seen": 263853850, + "step": 12227, + "time_per_iteration": 2.6946287155151367 + }, + { + "auxiliary_loss_clip": 0.01087104, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.03116488, + "balance_loss_mlp": 1.01972139, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.4722192995177172, + "language_loss": 0.63699043, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65816677, + "num_input_tokens_seen": 263874760, + "step": 12228, + "time_per_iteration": 2.5993919372558594 + }, + { + "auxiliary_loss_clip": 0.01076285, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.03228951, + "balance_loss_mlp": 1.02078998, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 1.6077836654960793, + "language_loss": 0.63519704, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65627062, + "num_input_tokens_seen": 263893390, + "step": 12229, + "time_per_iteration": 2.5777041912078857 + }, + { + "auxiliary_loss_clip": 0.01073722, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.03484273, + "balance_loss_mlp": 1.01953161, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.896454173442759, + "language_loss": 0.73435634, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75540525, + "num_input_tokens_seen": 263911180, + "step": 12230, + "time_per_iteration": 2.673476457595825 + }, + { + "auxiliary_loss_clip": 0.01044149, + "auxiliary_loss_mlp": 0.0103022, + "balance_loss_clip": 1.03060222, + "balance_loss_mlp": 1.0187695, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 2.0617624479332037, + "language_loss": 0.71686, + "learning_rate": 6.90599654932332e-07, + "loss": 0.73760366, + "num_input_tokens_seen": 263928975, + "step": 12231, + "time_per_iteration": 2.720724582672119 + }, + { + "auxiliary_loss_clip": 0.01090144, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.03458416, + "balance_loss_mlp": 1.02430272, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 2.3035584701705836, + "language_loss": 0.64049107, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66176277, + "num_input_tokens_seen": 263944495, + "step": 12232, + "time_per_iteration": 4.032205820083618 + }, + { + "auxiliary_loss_clip": 0.01079042, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.03350854, + "balance_loss_mlp": 1.01816916, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.7974906681210605, + "language_loss": 0.7533142, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77439946, + "num_input_tokens_seen": 263961325, + "step": 12233, + "time_per_iteration": 2.601663112640381 + }, + { + "auxiliary_loss_clip": 0.0109892, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.03388512, + "balance_loss_mlp": 1.01641226, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.7395597929471183, + "language_loss": 0.73322648, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75449181, + "num_input_tokens_seen": 263980445, + "step": 12234, + "time_per_iteration": 2.5208208560943604 + }, + { + "auxiliary_loss_clip": 0.01085433, + "auxiliary_loss_mlp": 0.01025661, + "balance_loss_clip": 1.03342652, + "balance_loss_mlp": 1.01438355, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 2.214547528670952, + "language_loss": 0.59722614, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61833704, + "num_input_tokens_seen": 263999330, + "step": 12235, + "time_per_iteration": 2.618180513381958 + }, + { + "auxiliary_loss_clip": 0.01073194, + "auxiliary_loss_mlp": 0.01024419, + "balance_loss_clip": 1.03192973, + "balance_loss_mlp": 1.01360714, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.310345772622339, + "language_loss": 0.86604762, + "learning_rate": 6.891283274567259e-07, + "loss": 0.88702381, + "num_input_tokens_seen": 264014150, + "step": 12236, + "time_per_iteration": 2.610501527786255 + }, + { + "auxiliary_loss_clip": 0.01087898, + "auxiliary_loss_mlp": 0.00749367, + "balance_loss_clip": 1.03317606, + "balance_loss_mlp": 1.00029731, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.9585991608337328, + "language_loss": 0.69755852, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71593118, + "num_input_tokens_seen": 264033140, + "step": 12237, + "time_per_iteration": 2.5483829975128174 + }, + { + "auxiliary_loss_clip": 0.01006925, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.02617908, + "balance_loss_mlp": 1.0162878, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.6887632606000724, + "language_loss": 0.72098422, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74133003, + "num_input_tokens_seen": 264052105, + "step": 12238, + "time_per_iteration": 2.8272061347961426 + }, + { + "auxiliary_loss_clip": 0.01066867, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.03041959, + "balance_loss_mlp": 1.01843166, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.6828955562722265, + "language_loss": 0.72159755, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74256939, + "num_input_tokens_seen": 264070690, + "step": 12239, + "time_per_iteration": 4.581337213516235 + }, + { + "auxiliary_loss_clip": 0.01066571, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.0332458, + "balance_loss_mlp": 1.01948333, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.3282739347494712, + "language_loss": 0.78991282, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81088269, + "num_input_tokens_seen": 264094225, + "step": 12240, + "time_per_iteration": 2.7116804122924805 + }, + { + "auxiliary_loss_clip": 0.01085945, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.03281999, + "balance_loss_mlp": 1.021698, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 2.22491950077339, + "language_loss": 0.82921982, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85040921, + "num_input_tokens_seen": 264113190, + "step": 12241, + "time_per_iteration": 2.5696423053741455 + }, + { + "auxiliary_loss_clip": 0.01085166, + "auxiliary_loss_mlp": 0.01025513, + "balance_loss_clip": 1.03119588, + "balance_loss_mlp": 1.01461768, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 2.2218834970451358, + "language_loss": 0.78412509, + "learning_rate": 6.873643749852484e-07, + "loss": 0.80523187, + "num_input_tokens_seen": 264132050, + "step": 12242, + "time_per_iteration": 2.5465080738067627 + }, + { + "auxiliary_loss_clip": 0.01046226, + "auxiliary_loss_mlp": 0.01024896, + "balance_loss_clip": 1.03027892, + "balance_loss_mlp": 1.0137862, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.8546298203571476, + "language_loss": 0.79230243, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81301367, + "num_input_tokens_seen": 264152800, + "step": 12243, + "time_per_iteration": 2.718574285507202 + }, + { + "auxiliary_loss_clip": 0.01086181, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.03192973, + "balance_loss_mlp": 1.01897979, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 5.47876746645625, + "language_loss": 0.74645877, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76762831, + "num_input_tokens_seen": 264169650, + "step": 12244, + "time_per_iteration": 2.4927268028259277 + }, + { + "auxiliary_loss_clip": 0.01078804, + "auxiliary_loss_mlp": 0.01028529, + "balance_loss_clip": 1.03076303, + "balance_loss_mlp": 1.01680446, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 2.2510203079725892, + "language_loss": 0.6953727, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71644598, + "num_input_tokens_seen": 264190530, + "step": 12245, + "time_per_iteration": 2.5378754138946533 + }, + { + "auxiliary_loss_clip": 0.01060435, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.03126431, + "balance_loss_mlp": 1.01917255, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.8551075997800972, + "language_loss": 0.7316159, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75252759, + "num_input_tokens_seen": 264210820, + "step": 12246, + "time_per_iteration": 2.697568893432617 + }, + { + "auxiliary_loss_clip": 0.01070991, + "auxiliary_loss_mlp": 0.01020027, + "balance_loss_clip": 1.03050601, + "balance_loss_mlp": 1.00974512, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 1.9998252323717485, + "language_loss": 0.73420286, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75511312, + "num_input_tokens_seen": 264227430, + "step": 12247, + "time_per_iteration": 2.598539113998413 + }, + { + "auxiliary_loss_clip": 0.01086748, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.03613496, + "balance_loss_mlp": 1.01814437, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.7189456367682694, + "language_loss": 0.73894995, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76010567, + "num_input_tokens_seen": 264245230, + "step": 12248, + "time_per_iteration": 2.573387861251831 + }, + { + "auxiliary_loss_clip": 0.01076566, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.03171086, + "balance_loss_mlp": 1.02071834, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 8.10781769090583, + "language_loss": 0.72955924, + "learning_rate": 6.853086953788727e-07, + "loss": 0.75064516, + "num_input_tokens_seen": 264263945, + "step": 12249, + "time_per_iteration": 4.145690441131592 + }, + { + "auxiliary_loss_clip": 0.01075321, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.03238034, + "balance_loss_mlp": 1.01695967, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 2.7853440044453706, + "language_loss": 0.7698372, + "learning_rate": 6.850152261875189e-07, + "loss": 0.7908746, + "num_input_tokens_seen": 264281500, + "step": 12250, + "time_per_iteration": 2.579601764678955 + }, + { + "auxiliary_loss_clip": 0.01054527, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.03225839, + "balance_loss_mlp": 1.01699352, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.8200123917653315, + "language_loss": 0.70935309, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73018211, + "num_input_tokens_seen": 264301625, + "step": 12251, + "time_per_iteration": 2.617366075515747 + }, + { + "auxiliary_loss_clip": 0.0108397, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.03366625, + "balance_loss_mlp": 1.01991129, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.6766287814181686, + "language_loss": 0.65589333, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67704332, + "num_input_tokens_seen": 264323975, + "step": 12252, + "time_per_iteration": 2.875195264816284 + }, + { + "auxiliary_loss_clip": 0.01043763, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.03055406, + "balance_loss_mlp": 1.01875639, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.6738940139694396, + "language_loss": 0.79026997, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81101513, + "num_input_tokens_seen": 264343785, + "step": 12253, + "time_per_iteration": 2.716378688812256 + }, + { + "auxiliary_loss_clip": 0.01094537, + "auxiliary_loss_mlp": 0.0074929, + "balance_loss_clip": 1.03304064, + "balance_loss_mlp": 1.00026417, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.1435732517697264, + "language_loss": 0.75914061, + "learning_rate": 6.83841848176905e-07, + "loss": 0.77757883, + "num_input_tokens_seen": 264361130, + "step": 12254, + "time_per_iteration": 2.511631727218628 + }, + { + "auxiliary_loss_clip": 0.01068955, + "auxiliary_loss_mlp": 0.01038623, + "balance_loss_clip": 1.03129733, + "balance_loss_mlp": 1.02542675, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.649173095031129, + "language_loss": 0.69616497, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71724081, + "num_input_tokens_seen": 264376965, + "step": 12255, + "time_per_iteration": 2.613044023513794 + }, + { + "auxiliary_loss_clip": 0.01087421, + "auxiliary_loss_mlp": 0.01024543, + "balance_loss_clip": 1.03332376, + "balance_loss_mlp": 1.01292062, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 12.22533275155898, + "language_loss": 0.7553755, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77649522, + "num_input_tokens_seen": 264396310, + "step": 12256, + "time_per_iteration": 2.5749716758728027 + }, + { + "auxiliary_loss_clip": 0.01088347, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.03331149, + "balance_loss_mlp": 1.01931787, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.7449006101994968, + "language_loss": 0.73899239, + "learning_rate": 6.829623386729182e-07, + "loss": 0.76018173, + "num_input_tokens_seen": 264418085, + "step": 12257, + "time_per_iteration": 2.6542985439300537 + }, + { + "auxiliary_loss_clip": 0.01077701, + "auxiliary_loss_mlp": 0.01033703, + "balance_loss_clip": 1.03072691, + "balance_loss_mlp": 1.02266991, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.5700065762286242, + "language_loss": 0.77983201, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80094606, + "num_input_tokens_seen": 264437595, + "step": 12258, + "time_per_iteration": 2.5445611476898193 + }, + { + "auxiliary_loss_clip": 0.01090077, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.03467238, + "balance_loss_mlp": 1.02032423, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.4904573152254679, + "language_loss": 0.66309917, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68431234, + "num_input_tokens_seen": 264457385, + "step": 12259, + "time_per_iteration": 2.5472545623779297 + }, + { + "auxiliary_loss_clip": 0.0108743, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.0340451, + "balance_loss_mlp": 1.02131498, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.6613607177709173, + "language_loss": 0.73597872, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75718623, + "num_input_tokens_seen": 264477205, + "step": 12260, + "time_per_iteration": 2.557041883468628 + }, + { + "auxiliary_loss_clip": 0.01080428, + "auxiliary_loss_mlp": 0.01025767, + "balance_loss_clip": 1.032112, + "balance_loss_mlp": 1.01441836, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.5946037173320098, + "language_loss": 0.73572409, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75678605, + "num_input_tokens_seen": 264497195, + "step": 12261, + "time_per_iteration": 2.5706512928009033 + }, + { + "auxiliary_loss_clip": 0.01077151, + "auxiliary_loss_mlp": 0.01032137, + "balance_loss_clip": 1.03162456, + "balance_loss_mlp": 1.0200845, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 3.2265341901496907, + "language_loss": 0.66859317, + "learning_rate": 6.814974884917438e-07, + "loss": 0.68968606, + "num_input_tokens_seen": 264516950, + "step": 12262, + "time_per_iteration": 4.146305561065674 + }, + { + "auxiliary_loss_clip": 0.01098485, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.03376245, + "balance_loss_mlp": 1.01705146, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 2.1312686880248046, + "language_loss": 0.88831264, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90958643, + "num_input_tokens_seen": 264532675, + "step": 12263, + "time_per_iteration": 2.5733094215393066 + }, + { + "auxiliary_loss_clip": 0.01094328, + "auxiliary_loss_mlp": 0.01023301, + "balance_loss_clip": 1.03456926, + "balance_loss_mlp": 1.01373482, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.6844253904959139, + "language_loss": 0.67119575, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69237208, + "num_input_tokens_seen": 264555635, + "step": 12264, + "time_per_iteration": 2.5384011268615723 + }, + { + "auxiliary_loss_clip": 0.01093948, + "auxiliary_loss_mlp": 0.01026198, + "balance_loss_clip": 1.0322988, + "balance_loss_mlp": 1.01614833, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 1.6778340448234628, + "language_loss": 0.80205268, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82325411, + "num_input_tokens_seen": 264573140, + "step": 12265, + "time_per_iteration": 2.5529751777648926 + }, + { + "auxiliary_loss_clip": 0.01093102, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.03439522, + "balance_loss_mlp": 1.01835454, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.7606645357262771, + "language_loss": 0.74715644, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76838279, + "num_input_tokens_seen": 264591610, + "step": 12266, + "time_per_iteration": 2.5305731296539307 + }, + { + "auxiliary_loss_clip": 0.01090786, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.03602779, + "balance_loss_mlp": 1.02105176, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.684664595274157, + "language_loss": 0.73263848, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75387657, + "num_input_tokens_seen": 264611170, + "step": 12267, + "time_per_iteration": 2.623291254043579 + }, + { + "auxiliary_loss_clip": 0.01063902, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.03297424, + "balance_loss_mlp": 1.02098095, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 2.1506483297489956, + "language_loss": 0.82686347, + "learning_rate": 6.797413183219923e-07, + "loss": 0.84782237, + "num_input_tokens_seen": 264629365, + "step": 12268, + "time_per_iteration": 2.5664281845092773 + }, + { + "auxiliary_loss_clip": 0.01096953, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.03385139, + "balance_loss_mlp": 1.023808, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.8452568648560366, + "language_loss": 0.73314804, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75446695, + "num_input_tokens_seen": 264647915, + "step": 12269, + "time_per_iteration": 2.4614572525024414 + }, + { + "auxiliary_loss_clip": 0.01068282, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.02963412, + "balance_loss_mlp": 1.01839805, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 1.9366268326954417, + "language_loss": 0.70474136, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72573096, + "num_input_tokens_seen": 264669620, + "step": 12270, + "time_per_iteration": 2.7593700885772705 + }, + { + "auxiliary_loss_clip": 0.01081473, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.03143001, + "balance_loss_mlp": 1.01881766, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 2.0921451135060853, + "language_loss": 0.69294679, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71405143, + "num_input_tokens_seen": 264689345, + "step": 12271, + "time_per_iteration": 2.60513973236084 + }, + { + "auxiliary_loss_clip": 0.01066082, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.03380561, + "balance_loss_mlp": 1.02160239, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 2.2720759517375093, + "language_loss": 0.67707062, + "learning_rate": 6.785715393476586e-07, + "loss": 0.69806564, + "num_input_tokens_seen": 264707625, + "step": 12272, + "time_per_iteration": 4.091864109039307 + }, + { + "auxiliary_loss_clip": 0.01073017, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.03324389, + "balance_loss_mlp": 1.01893234, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 1.6417167957220344, + "language_loss": 0.78313291, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80415952, + "num_input_tokens_seen": 264725575, + "step": 12273, + "time_per_iteration": 2.542909622192383 + }, + { + "auxiliary_loss_clip": 0.01095677, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03175795, + "balance_loss_mlp": 1.02050686, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.9743033191192432, + "language_loss": 0.83711392, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85838503, + "num_input_tokens_seen": 264742855, + "step": 12274, + "time_per_iteration": 2.6645829677581787 + }, + { + "auxiliary_loss_clip": 0.01078176, + "auxiliary_loss_mlp": 0.00749665, + "balance_loss_clip": 1.03454578, + "balance_loss_mlp": 1.0003978, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.9327462777089812, + "language_loss": 0.73627269, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75455117, + "num_input_tokens_seen": 264761155, + "step": 12275, + "time_per_iteration": 2.6978886127471924 + }, + { + "auxiliary_loss_clip": 0.01061589, + "auxiliary_loss_mlp": 0.01043985, + "balance_loss_clip": 1.03097165, + "balance_loss_mlp": 1.03103876, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.9620249391850524, + "language_loss": 0.73444229, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75549805, + "num_input_tokens_seen": 264780660, + "step": 12276, + "time_per_iteration": 2.690232038497925 + }, + { + "auxiliary_loss_clip": 0.01099232, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.03378773, + "balance_loss_mlp": 1.01655602, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 2.211482377996453, + "language_loss": 0.77545369, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79672235, + "num_input_tokens_seen": 264798850, + "step": 12277, + "time_per_iteration": 2.5428552627563477 + }, + { + "auxiliary_loss_clip": 0.01097727, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.03469777, + "balance_loss_mlp": 1.02342153, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 3.1892919428831346, + "language_loss": 0.78262627, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80395114, + "num_input_tokens_seen": 264816795, + "step": 12278, + "time_per_iteration": 2.4774715900421143 + }, + { + "auxiliary_loss_clip": 0.01088274, + "auxiliary_loss_mlp": 0.00749335, + "balance_loss_clip": 1.03303981, + "balance_loss_mlp": 1.00025642, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.435183322638564, + "language_loss": 0.72192872, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74030477, + "num_input_tokens_seen": 264834105, + "step": 12279, + "time_per_iteration": 2.5252068042755127 + }, + { + "auxiliary_loss_clip": 0.01090007, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.03279614, + "balance_loss_mlp": 1.02216291, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.5953020917997964, + "language_loss": 0.85806727, + "learning_rate": 6.762343873257034e-07, + "loss": 0.87930858, + "num_input_tokens_seen": 264850895, + "step": 12280, + "time_per_iteration": 3.985835075378418 + }, + { + "auxiliary_loss_clip": 0.01056662, + "auxiliary_loss_mlp": 0.01027149, + "balance_loss_clip": 1.03062356, + "balance_loss_mlp": 1.01529372, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 2.0070198998743427, + "language_loss": 0.72529805, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74613607, + "num_input_tokens_seen": 264869505, + "step": 12281, + "time_per_iteration": 2.6114885807037354 + }, + { + "auxiliary_loss_clip": 0.01054235, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.03051174, + "balance_loss_mlp": 1.02012324, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.6844627783783466, + "language_loss": 0.60529864, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62615579, + "num_input_tokens_seen": 264886915, + "step": 12282, + "time_per_iteration": 2.6168205738067627 + }, + { + "auxiliary_loss_clip": 0.01064935, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.0324651, + "balance_loss_mlp": 1.01799095, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 2.124811642998631, + "language_loss": 0.67739648, + "learning_rate": 6.753587832687632e-07, + "loss": 0.69834083, + "num_input_tokens_seen": 264910350, + "step": 12283, + "time_per_iteration": 2.7829477787017822 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.0074948, + "balance_loss_clip": 1.03534055, + "balance_loss_mlp": 1.00026953, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.594020222649908, + "language_loss": 0.75673175, + "learning_rate": 6.750670156960832e-07, + "loss": 0.77521867, + "num_input_tokens_seen": 264930705, + "step": 12284, + "time_per_iteration": 2.6185178756713867 + }, + { + "auxiliary_loss_clip": 0.01079357, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.03068304, + "balance_loss_mlp": 1.01654458, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 2.237087823582231, + "language_loss": 0.69147396, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71255863, + "num_input_tokens_seen": 264946975, + "step": 12285, + "time_per_iteration": 2.530878782272339 + }, + { + "auxiliary_loss_clip": 0.01082656, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.03401089, + "balance_loss_mlp": 1.01804662, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 2.167570040204705, + "language_loss": 0.79827809, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81940669, + "num_input_tokens_seen": 264967665, + "step": 12286, + "time_per_iteration": 2.618103265762329 + }, + { + "auxiliary_loss_clip": 0.01050009, + "auxiliary_loss_mlp": 0.01026427, + "balance_loss_clip": 1.03066504, + "balance_loss_mlp": 1.01533425, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 3.71077480805884, + "language_loss": 0.65276444, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67352885, + "num_input_tokens_seen": 264985480, + "step": 12287, + "time_per_iteration": 2.6800968647003174 + }, + { + "auxiliary_loss_clip": 0.01072145, + "auxiliary_loss_mlp": 0.01023987, + "balance_loss_clip": 1.03051376, + "balance_loss_mlp": 1.01338959, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.8280280493128485, + "language_loss": 0.77020377, + "learning_rate": 6.739004479318903e-07, + "loss": 0.79116511, + "num_input_tokens_seen": 265004790, + "step": 12288, + "time_per_iteration": 2.606189250946045 + }, + { + "auxiliary_loss_clip": 0.01090145, + "auxiliary_loss_mlp": 0.00749621, + "balance_loss_clip": 1.03491187, + "balance_loss_mlp": 1.000332, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.9241256679122016, + "language_loss": 0.58052307, + "learning_rate": 6.736089316777684e-07, + "loss": 0.5989207, + "num_input_tokens_seen": 265028790, + "step": 12289, + "time_per_iteration": 4.256573438644409 + }, + { + "auxiliary_loss_clip": 0.01026652, + "auxiliary_loss_mlp": 0.00746661, + "balance_loss_clip": 1.00663614, + "balance_loss_mlp": 0.99991649, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6400858918392609, + "language_loss": 0.4929052, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51063836, + "num_input_tokens_seen": 265096660, + "step": 12290, + "time_per_iteration": 3.225156784057617 + }, + { + "auxiliary_loss_clip": 0.01081743, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.03329432, + "balance_loss_mlp": 1.01534474, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 2.881661689393892, + "language_loss": 0.67968172, + "learning_rate": 6.730260500712237e-07, + "loss": 0.70077348, + "num_input_tokens_seen": 265116375, + "step": 12291, + "time_per_iteration": 2.632310390472412 + }, + { + "auxiliary_loss_clip": 0.00988162, + "auxiliary_loss_mlp": 0.00998714, + "balance_loss_clip": 1.00833488, + "balance_loss_mlp": 0.99764693, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9823836725776934, + "language_loss": 0.6087029, + "learning_rate": 6.727346847409052e-07, + "loss": 0.62857169, + "num_input_tokens_seen": 265161230, + "step": 12292, + "time_per_iteration": 2.9210689067840576 + }, + { + "auxiliary_loss_clip": 0.01050475, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.03274441, + "balance_loss_mlp": 1.02320683, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 2.165963208679542, + "language_loss": 0.67093855, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69178307, + "num_input_tokens_seen": 265182515, + "step": 12293, + "time_per_iteration": 2.7394864559173584 + }, + { + "auxiliary_loss_clip": 0.01087382, + "auxiliary_loss_mlp": 0.01032795, + "balance_loss_clip": 1.03259802, + "balance_loss_mlp": 1.02103484, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.9690217508151004, + "language_loss": 0.83435857, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85556036, + "num_input_tokens_seen": 265198160, + "step": 12294, + "time_per_iteration": 2.5956900119781494 + }, + { + "auxiliary_loss_clip": 0.01065077, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.03366208, + "balance_loss_mlp": 1.0187453, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.8215485114014751, + "language_loss": 0.73023742, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75119042, + "num_input_tokens_seen": 265218480, + "step": 12295, + "time_per_iteration": 2.7288756370544434 + }, + { + "auxiliary_loss_clip": 0.01087518, + "auxiliary_loss_mlp": 0.01038352, + "balance_loss_clip": 1.03549552, + "balance_loss_mlp": 1.02712798, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.7004812390932391, + "language_loss": 0.78921831, + "learning_rate": 6.715697268304215e-07, + "loss": 0.81047702, + "num_input_tokens_seen": 265240165, + "step": 12296, + "time_per_iteration": 2.6080193519592285 + }, + { + "auxiliary_loss_clip": 0.01098773, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.03380525, + "balance_loss_mlp": 1.01717925, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.9669712411173064, + "language_loss": 0.66763401, + "learning_rate": 6.712786132607182e-07, + "loss": 0.6889137, + "num_input_tokens_seen": 265263295, + "step": 12297, + "time_per_iteration": 2.666551351547241 + }, + { + "auxiliary_loss_clip": 0.01074182, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.03296971, + "balance_loss_mlp": 1.02152407, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 1.6836379795018241, + "language_loss": 0.6831311, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70421052, + "num_input_tokens_seen": 265282740, + "step": 12298, + "time_per_iteration": 2.568026542663574 + }, + { + "auxiliary_loss_clip": 0.01069991, + "auxiliary_loss_mlp": 0.01028841, + "balance_loss_clip": 1.03179586, + "balance_loss_mlp": 1.01748037, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.9778132912031168, + "language_loss": 0.74949694, + "learning_rate": 6.706965372880946e-07, + "loss": 0.77048522, + "num_input_tokens_seen": 265300175, + "step": 12299, + "time_per_iteration": 2.5156168937683105 + }, + { + "auxiliary_loss_clip": 0.01006695, + "auxiliary_loss_mlp": 0.00999763, + "balance_loss_clip": 1.01170516, + "balance_loss_mlp": 0.99869633, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7242920324893799, + "language_loss": 0.60853517, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62859964, + "num_input_tokens_seen": 265363275, + "step": 12300, + "time_per_iteration": 3.1816978454589844 + }, + { + "auxiliary_loss_clip": 0.01069805, + "auxiliary_loss_mlp": 0.01029777, + "balance_loss_clip": 1.03382254, + "balance_loss_mlp": 1.01821923, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.554078531170245, + "language_loss": 0.80325735, + "learning_rate": 6.7011466294475e-07, + "loss": 0.8242532, + "num_input_tokens_seen": 265382935, + "step": 12301, + "time_per_iteration": 2.569653272628784 + }, + { + "auxiliary_loss_clip": 0.01096502, + "auxiliary_loss_mlp": 0.01025107, + "balance_loss_clip": 1.03294945, + "balance_loss_mlp": 1.01449704, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.8824828041907207, + "language_loss": 0.73100519, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75222135, + "num_input_tokens_seen": 265403245, + "step": 12302, + "time_per_iteration": 4.150934934616089 + }, + { + "auxiliary_loss_clip": 0.01098653, + "auxiliary_loss_mlp": 0.01039649, + "balance_loss_clip": 1.03338647, + "balance_loss_mlp": 1.02811551, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 2.0059249475348397, + "language_loss": 0.74068058, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76206356, + "num_input_tokens_seen": 265423105, + "step": 12303, + "time_per_iteration": 2.6342320442199707 + }, + { + "auxiliary_loss_clip": 0.01096052, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.03328419, + "balance_loss_mlp": 1.02021158, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.6826830681815148, + "language_loss": 0.54414356, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56541425, + "num_input_tokens_seen": 265443445, + "step": 12304, + "time_per_iteration": 2.558623790740967 + }, + { + "auxiliary_loss_clip": 0.01075097, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.03163958, + "balance_loss_mlp": 1.02234721, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 1.846979834847677, + "language_loss": 0.8416357, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86272925, + "num_input_tokens_seen": 265462085, + "step": 12305, + "time_per_iteration": 2.6078689098358154 + }, + { + "auxiliary_loss_clip": 0.01017155, + "auxiliary_loss_mlp": 0.0104213, + "balance_loss_clip": 1.01804829, + "balance_loss_mlp": 1.04058027, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8934766103540936, + "language_loss": 0.5767839, + "learning_rate": 6.68660859793615e-07, + "loss": 0.5973767, + "num_input_tokens_seen": 265521190, + "step": 12306, + "time_per_iteration": 3.1433331966400146 + }, + { + "auxiliary_loss_clip": 0.0108041, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.0352422, + "balance_loss_mlp": 1.01822495, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 2.0377321970363087, + "language_loss": 0.81546348, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83656847, + "num_input_tokens_seen": 265539705, + "step": 12307, + "time_per_iteration": 2.521695137023926 + }, + { + "auxiliary_loss_clip": 0.01084924, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.03663015, + "balance_loss_mlp": 1.02072001, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.6662201297234203, + "language_loss": 0.70022547, + "learning_rate": 6.680796918475893e-07, + "loss": 0.72138935, + "num_input_tokens_seen": 265555855, + "step": 12308, + "time_per_iteration": 2.5536229610443115 + }, + { + "auxiliary_loss_clip": 0.01065292, + "auxiliary_loss_mlp": 0.01025405, + "balance_loss_clip": 1.03133917, + "balance_loss_mlp": 1.01460505, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.9960536561034834, + "language_loss": 0.81536698, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83627403, + "num_input_tokens_seen": 265575455, + "step": 12309, + "time_per_iteration": 2.5773680210113525 + }, + { + "auxiliary_loss_clip": 0.01088212, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.0341866, + "balance_loss_mlp": 1.02126038, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.8403961554316142, + "language_loss": 0.72719395, + "learning_rate": 6.674987259277692e-07, + "loss": 0.74841118, + "num_input_tokens_seen": 265595250, + "step": 12310, + "time_per_iteration": 2.548022985458374 + }, + { + "auxiliary_loss_clip": 0.01062295, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.03326261, + "balance_loss_mlp": 1.02323532, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.5034007829928515, + "language_loss": 0.88495773, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90593851, + "num_input_tokens_seen": 265606945, + "step": 12311, + "time_per_iteration": 2.555555820465088 + }, + { + "auxiliary_loss_clip": 0.01034465, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.02810478, + "balance_loss_mlp": 1.01568866, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 2.2505306125218594, + "language_loss": 0.80405223, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82465899, + "num_input_tokens_seen": 265626115, + "step": 12312, + "time_per_iteration": 4.200400114059448 + }, + { + "auxiliary_loss_clip": 0.01046233, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.0300169, + "balance_loss_mlp": 1.02012599, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 1.9437618230315257, + "language_loss": 0.78260171, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80337793, + "num_input_tokens_seen": 265646520, + "step": 12313, + "time_per_iteration": 2.681675434112549 + }, + { + "auxiliary_loss_clip": 0.01046901, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.03009963, + "balance_loss_mlp": 1.0213151, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 2.9463078224398154, + "language_loss": 0.78185713, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80266005, + "num_input_tokens_seen": 265661875, + "step": 12314, + "time_per_iteration": 2.637178421020508 + }, + { + "auxiliary_loss_clip": 0.01014456, + "auxiliary_loss_mlp": 0.0100174, + "balance_loss_clip": 1.00460649, + "balance_loss_mlp": 1.00080419, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.838511495140907, + "language_loss": 0.55240464, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57256663, + "num_input_tokens_seen": 265721255, + "step": 12315, + "time_per_iteration": 3.114175319671631 + }, + { + "auxiliary_loss_clip": 0.01083368, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.03413033, + "balance_loss_mlp": 1.02213979, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.5193664001258123, + "language_loss": 0.79147291, + "learning_rate": 6.65757041206591e-07, + "loss": 0.8126421, + "num_input_tokens_seen": 265743970, + "step": 12316, + "time_per_iteration": 2.65071177482605 + }, + { + "auxiliary_loss_clip": 0.01085032, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.03154445, + "balance_loss_mlp": 1.0177381, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 2.861957526076284, + "language_loss": 0.74832952, + "learning_rate": 6.654669374367275e-07, + "loss": 0.76947057, + "num_input_tokens_seen": 265760890, + "step": 12317, + "time_per_iteration": 2.5395431518554688 + }, + { + "auxiliary_loss_clip": 0.01065735, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.03263509, + "balance_loss_mlp": 1.02056265, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.8008110032287317, + "language_loss": 0.81331193, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83428484, + "num_input_tokens_seen": 265779600, + "step": 12318, + "time_per_iteration": 2.663281202316284 + }, + { + "auxiliary_loss_clip": 0.01071167, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.03109884, + "balance_loss_mlp": 1.01725554, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 2.0949927562353263, + "language_loss": 0.76865929, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78965646, + "num_input_tokens_seen": 265797030, + "step": 12319, + "time_per_iteration": 4.065890550613403 + }, + { + "auxiliary_loss_clip": 0.01065067, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.03107369, + "balance_loss_mlp": 1.01982045, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 2.070609734991384, + "language_loss": 0.64007258, + "learning_rate": 6.64596929804897e-07, + "loss": 0.66102695, + "num_input_tokens_seen": 265815055, + "step": 12320, + "time_per_iteration": 2.623708724975586 + }, + { + "auxiliary_loss_clip": 0.01090316, + "auxiliary_loss_mlp": 0.01036372, + "balance_loss_clip": 1.03417981, + "balance_loss_mlp": 1.02496409, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.6655837474263655, + "language_loss": 0.82446533, + "learning_rate": 6.643070285235288e-07, + "loss": 0.84573221, + "num_input_tokens_seen": 265828480, + "step": 12321, + "time_per_iteration": 2.518004894256592 + }, + { + "auxiliary_loss_clip": 0.01081955, + "auxiliary_loss_mlp": 0.01046988, + "balance_loss_clip": 1.03372729, + "balance_loss_mlp": 1.03383338, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 1.66677072966393, + "language_loss": 0.72087103, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74216044, + "num_input_tokens_seen": 265845825, + "step": 12322, + "time_per_iteration": 2.5791821479797363 + }, + { + "auxiliary_loss_clip": 0.01086418, + "auxiliary_loss_mlp": 0.00749531, + "balance_loss_clip": 1.03248692, + "balance_loss_mlp": 1.0003252, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.9241651952492693, + "language_loss": 0.64157355, + "learning_rate": 6.637273779206183e-07, + "loss": 0.65993309, + "num_input_tokens_seen": 265866335, + "step": 12323, + "time_per_iteration": 2.565175771713257 + }, + { + "auxiliary_loss_clip": 0.01063295, + "auxiliary_loss_mlp": 0.01026127, + "balance_loss_clip": 1.03060567, + "balance_loss_mlp": 1.01389587, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.452787874587157, + "language_loss": 0.75903374, + "learning_rate": 6.634376286210559e-07, + "loss": 0.77992797, + "num_input_tokens_seen": 265888945, + "step": 12324, + "time_per_iteration": 2.683657169342041 + }, + { + "auxiliary_loss_clip": 0.01069078, + "auxiliary_loss_mlp": 0.01023363, + "balance_loss_clip": 1.03257668, + "balance_loss_mlp": 1.01202607, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.6725605644190864, + "language_loss": 0.74721456, + "learning_rate": 6.63147930004073e-07, + "loss": 0.768139, + "num_input_tokens_seen": 265908030, + "step": 12325, + "time_per_iteration": 2.589851140975952 + }, + { + "auxiliary_loss_clip": 0.01056661, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.02091169, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.938547524597092, + "language_loss": 0.68658471, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70748663, + "num_input_tokens_seen": 265927030, + "step": 12326, + "time_per_iteration": 2.6683356761932373 + }, + { + "auxiliary_loss_clip": 0.01059899, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.03260684, + "balance_loss_mlp": 1.01595092, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 1.6780875651408595, + "language_loss": 0.89811444, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91898507, + "num_input_tokens_seen": 265945490, + "step": 12327, + "time_per_iteration": 2.658701181411743 + }, + { + "auxiliary_loss_clip": 0.01099611, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.03476715, + "balance_loss_mlp": 1.02008986, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 4.279173816353223, + "language_loss": 0.85502696, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87634099, + "num_input_tokens_seen": 265963265, + "step": 12328, + "time_per_iteration": 2.4995760917663574 + }, + { + "auxiliary_loss_clip": 0.01086161, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.0328263, + "balance_loss_mlp": 1.01605725, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.628212420523941, + "language_loss": 0.66609377, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68724144, + "num_input_tokens_seen": 265982270, + "step": 12329, + "time_per_iteration": 3.981027603149414 + }, + { + "auxiliary_loss_clip": 0.01064679, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.03223193, + "balance_loss_mlp": 1.02962756, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.57494033582896, + "language_loss": 0.66774583, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68881816, + "num_input_tokens_seen": 266003835, + "step": 12330, + "time_per_iteration": 2.660994529724121 + }, + { + "auxiliary_loss_clip": 0.01067893, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.03634596, + "balance_loss_mlp": 1.01931953, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 2.097688124283377, + "language_loss": 0.85474443, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87575638, + "num_input_tokens_seen": 266021595, + "step": 12331, + "time_per_iteration": 2.6199395656585693 + }, + { + "auxiliary_loss_clip": 0.01032794, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.03350472, + "balance_loss_mlp": 1.02031159, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 2.1327718795153583, + "language_loss": 0.69542974, + "learning_rate": 6.611214597199364e-07, + "loss": 0.7160846, + "num_input_tokens_seen": 266039860, + "step": 12332, + "time_per_iteration": 2.7126829624176025 + }, + { + "auxiliary_loss_clip": 0.01101525, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.03589642, + "balance_loss_mlp": 1.02146101, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 1.891715494654786, + "language_loss": 0.6310631, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65241528, + "num_input_tokens_seen": 266058050, + "step": 12333, + "time_per_iteration": 2.4875526428222656 + }, + { + "auxiliary_loss_clip": 0.01075973, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.03523183, + "balance_loss_mlp": 1.02145219, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.579165641987706, + "language_loss": 0.70976502, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73085141, + "num_input_tokens_seen": 266078060, + "step": 12334, + "time_per_iteration": 2.747732400894165 + }, + { + "auxiliary_loss_clip": 0.01053074, + "auxiliary_loss_mlp": 0.01025649, + "balance_loss_clip": 1.03288043, + "balance_loss_mlp": 1.01450944, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.7819114403218173, + "language_loss": 0.82263398, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84342128, + "num_input_tokens_seen": 266097110, + "step": 12335, + "time_per_iteration": 2.574744462966919 + }, + { + "auxiliary_loss_clip": 0.01101081, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03505445, + "balance_loss_mlp": 1.01852512, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 3.018192006227065, + "language_loss": 0.74596077, + "learning_rate": 6.599645934079259e-07, + "loss": 0.76728117, + "num_input_tokens_seen": 266110870, + "step": 12336, + "time_per_iteration": 2.4674594402313232 + }, + { + "auxiliary_loss_clip": 0.01052404, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.0311898, + "balance_loss_mlp": 1.01764154, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.9710142881147057, + "language_loss": 0.73608315, + "learning_rate": 6.596755038382029e-07, + "loss": 0.7569049, + "num_input_tokens_seen": 266127845, + "step": 12337, + "time_per_iteration": 2.6150095462799072 + }, + { + "auxiliary_loss_clip": 0.01073147, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.03386259, + "balance_loss_mlp": 1.02354217, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 2.1979006712089864, + "language_loss": 0.7696628, + "learning_rate": 6.593864650937186e-07, + "loss": 0.79074168, + "num_input_tokens_seen": 266145400, + "step": 12338, + "time_per_iteration": 2.5685067176818848 + }, + { + "auxiliary_loss_clip": 0.01087205, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.03347719, + "balance_loss_mlp": 1.0157125, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7492941048500068, + "language_loss": 0.73193264, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75306237, + "num_input_tokens_seen": 266164430, + "step": 12339, + "time_per_iteration": 2.525269031524658 + }, + { + "auxiliary_loss_clip": 0.01080107, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.03481603, + "balance_loss_mlp": 1.01871729, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 1.7478569858570774, + "language_loss": 0.79620469, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81731004, + "num_input_tokens_seen": 266183855, + "step": 12340, + "time_per_iteration": 2.605868101119995 + }, + { + "auxiliary_loss_clip": 0.0105538, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.03089249, + "balance_loss_mlp": 1.02122343, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.5652544948302585, + "language_loss": 0.75751787, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77840179, + "num_input_tokens_seen": 266202085, + "step": 12341, + "time_per_iteration": 2.5698020458221436 + }, + { + "auxiliary_loss_clip": 0.01065993, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.0327239, + "balance_loss_mlp": 1.02349997, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.4757728921441045, + "language_loss": 0.79932636, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82035297, + "num_input_tokens_seen": 266223445, + "step": 12342, + "time_per_iteration": 4.120841979980469 + }, + { + "auxiliary_loss_clip": 0.01060631, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.03144765, + "balance_loss_mlp": 1.01849866, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.6155058281475692, + "language_loss": 0.7753523, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79625654, + "num_input_tokens_seen": 266246575, + "step": 12343, + "time_per_iteration": 3.017563581466675 + }, + { + "auxiliary_loss_clip": 0.01071241, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.0304091, + "balance_loss_mlp": 1.01912546, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.624975012140189, + "language_loss": 0.67974567, + "learning_rate": 6.576533005704843e-07, + "loss": 0.70076543, + "num_input_tokens_seen": 266266055, + "step": 12344, + "time_per_iteration": 2.6063625812530518 + }, + { + "auxiliary_loss_clip": 0.01057119, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.03473103, + "balance_loss_mlp": 1.02170992, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.672278814510758, + "language_loss": 0.81215048, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83306599, + "num_input_tokens_seen": 266282240, + "step": 12345, + "time_per_iteration": 2.6280386447906494 + }, + { + "auxiliary_loss_clip": 0.01056742, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.03086197, + "balance_loss_mlp": 1.02888608, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 2.138735596955611, + "language_loss": 0.70608377, + "learning_rate": 6.570759861612988e-07, + "loss": 0.72706199, + "num_input_tokens_seen": 266300980, + "step": 12346, + "time_per_iteration": 2.62117075920105 + }, + { + "auxiliary_loss_clip": 0.01090231, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.03555632, + "balance_loss_mlp": 1.01718068, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.6768992891426293, + "language_loss": 0.73233795, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75352532, + "num_input_tokens_seen": 266322215, + "step": 12347, + "time_per_iteration": 2.6626181602478027 + }, + { + "auxiliary_loss_clip": 0.01074995, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.03070176, + "balance_loss_mlp": 1.01929343, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.9030829718375952, + "language_loss": 0.81183052, + "learning_rate": 6.564988754473642e-07, + "loss": 0.8328889, + "num_input_tokens_seen": 266341600, + "step": 12348, + "time_per_iteration": 2.5538434982299805 + }, + { + "auxiliary_loss_clip": 0.01098229, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.03377366, + "balance_loss_mlp": 1.02110255, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 2.1416179598800875, + "language_loss": 0.72520876, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74651426, + "num_input_tokens_seen": 266362895, + "step": 12349, + "time_per_iteration": 2.6194005012512207 + }, + { + "auxiliary_loss_clip": 0.01078687, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.03109169, + "balance_loss_mlp": 1.02423453, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 2.167041814932482, + "language_loss": 0.78819776, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80935937, + "num_input_tokens_seen": 266384015, + "step": 12350, + "time_per_iteration": 2.6675868034362793 + }, + { + "auxiliary_loss_clip": 0.01056959, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.03444493, + "balance_loss_mlp": 1.01953173, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 1.5718695544617198, + "language_loss": 0.74713546, + "learning_rate": 6.556335914965343e-07, + "loss": 0.76801336, + "num_input_tokens_seen": 266405990, + "step": 12351, + "time_per_iteration": 2.7510488033294678 + }, + { + "auxiliary_loss_clip": 0.01039763, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.03271341, + "balance_loss_mlp": 1.01511884, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 2.2992283572958763, + "language_loss": 0.81441152, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83508462, + "num_input_tokens_seen": 266424260, + "step": 12352, + "time_per_iteration": 4.253842353820801 + }, + { + "auxiliary_loss_clip": 0.01091951, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.03717899, + "balance_loss_mlp": 1.02534652, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.9206234515316674, + "language_loss": 0.71479809, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73608392, + "num_input_tokens_seen": 266444580, + "step": 12353, + "time_per_iteration": 2.5722999572753906 + }, + { + "auxiliary_loss_clip": 0.01092519, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.0373795, + "balance_loss_mlp": 1.02086067, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 2.3536466849395508, + "language_loss": 0.71910858, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74035454, + "num_input_tokens_seen": 266465640, + "step": 12354, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01014233, + "auxiliary_loss_mlp": 0.01008173, + "balance_loss_clip": 1.00403774, + "balance_loss_mlp": 1.00720787, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6978829915465294, + "language_loss": 0.59477979, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61500382, + "num_input_tokens_seen": 266531950, + "step": 12355, + "time_per_iteration": 3.2076568603515625 + }, + { + "auxiliary_loss_clip": 0.01101026, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.0353508, + "balance_loss_mlp": 1.0182364, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.6737594462749399, + "language_loss": 0.67733049, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69864416, + "num_input_tokens_seen": 266550665, + "step": 12356, + "time_per_iteration": 2.5093090534210205 + }, + { + "auxiliary_loss_clip": 0.01087915, + "auxiliary_loss_mlp": 0.00749742, + "balance_loss_clip": 1.03090811, + "balance_loss_mlp": 1.00036645, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 1.9844142644566516, + "language_loss": 0.71863526, + "learning_rate": 6.539044003097301e-07, + "loss": 0.73701185, + "num_input_tokens_seen": 266572455, + "step": 12357, + "time_per_iteration": 2.7937405109405518 + }, + { + "auxiliary_loss_clip": 0.01075686, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.03430855, + "balance_loss_mlp": 1.0177598, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 2.047394338906031, + "language_loss": 0.65083003, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67187071, + "num_input_tokens_seen": 266590895, + "step": 12358, + "time_per_iteration": 2.6058859825134277 + }, + { + "auxiliary_loss_clip": 0.0106064, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.03434682, + "balance_loss_mlp": 1.01930583, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.9130092278176585, + "language_loss": 0.80595815, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82688522, + "num_input_tokens_seen": 266607660, + "step": 12359, + "time_per_iteration": 4.283310890197754 + }, + { + "auxiliary_loss_clip": 0.01087707, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.03237534, + "balance_loss_mlp": 1.01590252, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.9379196242450276, + "language_loss": 0.68219256, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70334363, + "num_input_tokens_seen": 266624260, + "step": 12360, + "time_per_iteration": 2.5131890773773193 + }, + { + "auxiliary_loss_clip": 0.01087838, + "auxiliary_loss_mlp": 0.00749655, + "balance_loss_clip": 1.03243351, + "balance_loss_mlp": 1.00033879, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6689557313405572, + "language_loss": 0.72787571, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74625069, + "num_input_tokens_seen": 266644210, + "step": 12361, + "time_per_iteration": 2.5753819942474365 + }, + { + "auxiliary_loss_clip": 0.01044936, + "auxiliary_loss_mlp": 0.01037401, + "balance_loss_clip": 1.02873397, + "balance_loss_mlp": 1.0242101, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.971328467952667, + "language_loss": 0.56046915, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58129251, + "num_input_tokens_seen": 266664230, + "step": 12362, + "time_per_iteration": 2.6329596042633057 + }, + { + "auxiliary_loss_clip": 0.01059883, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.03220439, + "balance_loss_mlp": 1.01750946, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.7185672088477881, + "language_loss": 0.77483964, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79573441, + "num_input_tokens_seen": 266683270, + "step": 12363, + "time_per_iteration": 2.604318141937256 + }, + { + "auxiliary_loss_clip": 0.01068851, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.03153765, + "balance_loss_mlp": 1.01903379, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.6343219608883008, + "language_loss": 0.77750003, + "learning_rate": 6.518893332627862e-07, + "loss": 0.79849195, + "num_input_tokens_seen": 266701235, + "step": 12364, + "time_per_iteration": 2.5875110626220703 + }, + { + "auxiliary_loss_clip": 0.01088186, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.03378129, + "balance_loss_mlp": 1.01898539, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.7362891311844706, + "language_loss": 0.78462589, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80580461, + "num_input_tokens_seen": 266721495, + "step": 12365, + "time_per_iteration": 2.5416691303253174 + }, + { + "auxiliary_loss_clip": 0.01077348, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.03319907, + "balance_loss_mlp": 1.022017, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.6070509034741636, + "language_loss": 0.76837593, + "learning_rate": 6.513140597415346e-07, + "loss": 0.78948784, + "num_input_tokens_seen": 266747400, + "step": 12366, + "time_per_iteration": 2.8424670696258545 + }, + { + "auxiliary_loss_clip": 0.01087808, + "auxiliary_loss_mlp": 0.01025877, + "balance_loss_clip": 1.03526998, + "balance_loss_mlp": 1.01586342, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.7854837729422826, + "language_loss": 0.71441758, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73555446, + "num_input_tokens_seen": 266767630, + "step": 12367, + "time_per_iteration": 2.535068988800049 + }, + { + "auxiliary_loss_clip": 0.01070513, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.03349209, + "balance_loss_mlp": 1.02383447, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5423938951155995, + "language_loss": 0.74485248, + "learning_rate": 6.507389907895038e-07, + "loss": 0.7659108, + "num_input_tokens_seen": 266788015, + "step": 12368, + "time_per_iteration": 2.722018003463745 + }, + { + "auxiliary_loss_clip": 0.01085622, + "auxiliary_loss_mlp": 0.01032679, + "balance_loss_clip": 1.03501475, + "balance_loss_mlp": 1.02235508, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 1.8551214116868748, + "language_loss": 0.6930564, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71423936, + "num_input_tokens_seen": 266809010, + "step": 12369, + "time_per_iteration": 4.093932390213013 + }, + { + "auxiliary_loss_clip": 0.01076407, + "auxiliary_loss_mlp": 0.00749493, + "balance_loss_clip": 1.03314352, + "balance_loss_mlp": 1.00033236, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.9813838621666129, + "language_loss": 0.75913036, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77738929, + "num_input_tokens_seen": 266825390, + "step": 12370, + "time_per_iteration": 2.562699317932129 + }, + { + "auxiliary_loss_clip": 0.01101433, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.03714466, + "balance_loss_mlp": 1.02116895, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.678435666124855, + "language_loss": 0.78596133, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80730236, + "num_input_tokens_seen": 266844675, + "step": 12371, + "time_per_iteration": 2.5175957679748535 + }, + { + "auxiliary_loss_clip": 0.01078626, + "auxiliary_loss_mlp": 0.01024487, + "balance_loss_clip": 1.03306532, + "balance_loss_mlp": 1.01292396, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.6812542835794206, + "language_loss": 0.69433177, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71536291, + "num_input_tokens_seen": 266865160, + "step": 12372, + "time_per_iteration": 2.632289171218872 + }, + { + "auxiliary_loss_clip": 0.01066177, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.03223205, + "balance_loss_mlp": 1.01938999, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 5.634574770391034, + "language_loss": 0.74917877, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77015138, + "num_input_tokens_seen": 266883285, + "step": 12373, + "time_per_iteration": 2.578343152999878 + }, + { + "auxiliary_loss_clip": 0.01042313, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.02921891, + "balance_loss_mlp": 1.01932967, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.725903238435797, + "language_loss": 0.77358538, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79433513, + "num_input_tokens_seen": 266900960, + "step": 12374, + "time_per_iteration": 2.669060707092285 + }, + { + "auxiliary_loss_clip": 0.01032968, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.03072333, + "balance_loss_mlp": 1.02026415, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.020241818858045, + "language_loss": 0.76208043, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78272998, + "num_input_tokens_seen": 266917710, + "step": 12375, + "time_per_iteration": 2.7269670963287354 + }, + { + "auxiliary_loss_clip": 0.01084786, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.0326333, + "balance_loss_mlp": 1.01827645, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 2.095429715216749, + "language_loss": 0.77427495, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79540569, + "num_input_tokens_seen": 266934220, + "step": 12376, + "time_per_iteration": 2.525522470474243 + }, + { + "auxiliary_loss_clip": 0.01053291, + "auxiliary_loss_mlp": 0.01033927, + "balance_loss_clip": 1.02844119, + "balance_loss_mlp": 1.02031946, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.7467171614671326, + "language_loss": 0.79239488, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81326699, + "num_input_tokens_seen": 266955210, + "step": 12377, + "time_per_iteration": 2.689422607421875 + }, + { + "auxiliary_loss_clip": 0.01088651, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.03526843, + "balance_loss_mlp": 1.01571441, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 2.4703620576722747, + "language_loss": 0.67268938, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69385266, + "num_input_tokens_seen": 266976555, + "step": 12378, + "time_per_iteration": 2.9138267040252686 + }, + { + "auxiliary_loss_clip": 0.01067809, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.03461599, + "balance_loss_mlp": 1.02223158, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.6827716539878794, + "language_loss": 0.72014093, + "learning_rate": 6.475797721245648e-07, + "loss": 0.74116957, + "num_input_tokens_seen": 266997640, + "step": 12379, + "time_per_iteration": 2.6851768493652344 + }, + { + "auxiliary_loss_clip": 0.01057385, + "auxiliary_loss_mlp": 0.00749809, + "balance_loss_clip": 1.02940667, + "balance_loss_mlp": 1.00033128, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 1.8706080562921332, + "language_loss": 0.65536594, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67343789, + "num_input_tokens_seen": 267016165, + "step": 12380, + "time_per_iteration": 2.6511528491973877 + }, + { + "auxiliary_loss_clip": 0.01089654, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.03455687, + "balance_loss_mlp": 1.01884937, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 1.7725630946849025, + "language_loss": 0.78365123, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80485809, + "num_input_tokens_seen": 267034075, + "step": 12381, + "time_per_iteration": 4.093512773513794 + }, + { + "auxiliary_loss_clip": 0.01057745, + "auxiliary_loss_mlp": 0.01039653, + "balance_loss_clip": 1.03362632, + "balance_loss_mlp": 1.02568209, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 1.9532390926901733, + "language_loss": 0.72612607, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74710006, + "num_input_tokens_seen": 267053645, + "step": 12382, + "time_per_iteration": 2.6347479820251465 + }, + { + "auxiliary_loss_clip": 0.00999519, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00891185, + "balance_loss_mlp": 1.00066686, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6546001251227481, + "language_loss": 0.5466187, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56663078, + "num_input_tokens_seen": 267121830, + "step": 12383, + "time_per_iteration": 3.342938184738159 + }, + { + "auxiliary_loss_clip": 0.01075503, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.03269243, + "balance_loss_mlp": 1.01747561, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 1.9654357300805745, + "language_loss": 0.75546336, + "learning_rate": 6.461458141259395e-07, + "loss": 0.77651072, + "num_input_tokens_seen": 267141145, + "step": 12384, + "time_per_iteration": 2.59161114692688 + }, + { + "auxiliary_loss_clip": 0.01087961, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.03401184, + "balance_loss_mlp": 1.01644254, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 5.409989426128292, + "language_loss": 0.80061662, + "learning_rate": 6.458591764975823e-07, + "loss": 0.82177627, + "num_input_tokens_seen": 267159280, + "step": 12385, + "time_per_iteration": 2.6146795749664307 + }, + { + "auxiliary_loss_clip": 0.01067006, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.03332329, + "balance_loss_mlp": 1.02028632, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.9984724624875103, + "language_loss": 0.81616038, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83716315, + "num_input_tokens_seen": 267179390, + "step": 12386, + "time_per_iteration": 2.6684796810150146 + }, + { + "auxiliary_loss_clip": 0.01081381, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.03322554, + "balance_loss_mlp": 1.01873517, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.6267149935740655, + "language_loss": 0.7077167, + "learning_rate": 6.452860552992037e-07, + "loss": 0.7288295, + "num_input_tokens_seen": 267198165, + "step": 12387, + "time_per_iteration": 2.5727434158325195 + }, + { + "auxiliary_loss_clip": 0.0106733, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.03356683, + "balance_loss_mlp": 1.01751125, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 1.9356790262985903, + "language_loss": 0.7042011, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72515786, + "num_input_tokens_seen": 267214520, + "step": 12388, + "time_per_iteration": 2.57204008102417 + }, + { + "auxiliary_loss_clip": 0.01086538, + "auxiliary_loss_mlp": 0.0103032, + "balance_loss_clip": 1.03208828, + "balance_loss_mlp": 1.01915622, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.6391603301748452, + "language_loss": 0.84945089, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87061948, + "num_input_tokens_seen": 267236555, + "step": 12389, + "time_per_iteration": 2.61411714553833 + }, + { + "auxiliary_loss_clip": 0.01055006, + "auxiliary_loss_mlp": 0.01036226, + "balance_loss_clip": 1.03094089, + "balance_loss_mlp": 1.02481723, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 1.8811062718473606, + "language_loss": 0.79479963, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81571198, + "num_input_tokens_seen": 267254800, + "step": 12390, + "time_per_iteration": 2.6669299602508545 + }, + { + "auxiliary_loss_clip": 0.01076847, + "auxiliary_loss_mlp": 0.01028075, + "balance_loss_clip": 1.03291655, + "balance_loss_mlp": 1.01585019, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 2.7578971505600136, + "language_loss": 0.85045272, + "learning_rate": 6.441404294400014e-07, + "loss": 0.87150192, + "num_input_tokens_seen": 267274610, + "step": 12391, + "time_per_iteration": 4.168188095092773 + }, + { + "auxiliary_loss_clip": 0.0109909, + "auxiliary_loss_mlp": 0.01025996, + "balance_loss_clip": 1.03425395, + "balance_loss_mlp": 1.01535642, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 1.8337500882434183, + "language_loss": 0.73457003, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75582087, + "num_input_tokens_seen": 267292600, + "step": 12392, + "time_per_iteration": 2.504136562347412 + }, + { + "auxiliary_loss_clip": 0.01085396, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.03395534, + "balance_loss_mlp": 1.02351499, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.9166796341979073, + "language_loss": 0.76932269, + "learning_rate": 6.435679249529487e-07, + "loss": 0.7905215, + "num_input_tokens_seen": 267311295, + "step": 12393, + "time_per_iteration": 2.5930800437927246 + }, + { + "auxiliary_loss_clip": 0.01089629, + "auxiliary_loss_mlp": 0.01033176, + "balance_loss_clip": 1.03500211, + "balance_loss_mlp": 1.02032578, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 3.482428859749638, + "language_loss": 0.727539, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74876702, + "num_input_tokens_seen": 267328390, + "step": 12394, + "time_per_iteration": 2.54248046875 + }, + { + "auxiliary_loss_clip": 0.01046639, + "auxiliary_loss_mlp": 0.00749425, + "balance_loss_clip": 1.0371834, + "balance_loss_mlp": 1.00028849, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 2.451370560601816, + "language_loss": 0.8175931, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83555371, + "num_input_tokens_seen": 267348185, + "step": 12395, + "time_per_iteration": 2.7635750770568848 + }, + { + "auxiliary_loss_clip": 0.01092343, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.03524232, + "balance_loss_mlp": 1.01915324, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 2.7868088909710838, + "language_loss": 0.71910429, + "learning_rate": 6.427095540197937e-07, + "loss": 0.74034059, + "num_input_tokens_seen": 267367010, + "step": 12396, + "time_per_iteration": 2.6763222217559814 + }, + { + "auxiliary_loss_clip": 0.01061474, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.03491044, + "balance_loss_mlp": 1.01838756, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 1.8121240008338722, + "language_loss": 0.68244958, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70336604, + "num_input_tokens_seen": 267386605, + "step": 12397, + "time_per_iteration": 2.7018752098083496 + }, + { + "auxiliary_loss_clip": 0.0109891, + "auxiliary_loss_mlp": 0.0103701, + "balance_loss_clip": 1.03431034, + "balance_loss_mlp": 1.02519679, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 1.732919800897053, + "language_loss": 0.76586086, + "learning_rate": 6.421375640558908e-07, + "loss": 0.78722006, + "num_input_tokens_seen": 267404135, + "step": 12398, + "time_per_iteration": 2.510922431945801 + }, + { + "auxiliary_loss_clip": 0.01085466, + "auxiliary_loss_mlp": 0.01025422, + "balance_loss_clip": 1.03339815, + "balance_loss_mlp": 1.01420403, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.6625760600321635, + "language_loss": 0.77638584, + "learning_rate": 6.418516463039363e-07, + "loss": 0.79749471, + "num_input_tokens_seen": 267423120, + "step": 12399, + "time_per_iteration": 4.055375099182129 + }, + { + "auxiliary_loss_clip": 0.0107013, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.03062499, + "balance_loss_mlp": 1.02338123, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 1.9433613307218136, + "language_loss": 0.74077058, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76181185, + "num_input_tokens_seen": 267441250, + "step": 12400, + "time_per_iteration": 2.574751377105713 + }, + { + "auxiliary_loss_clip": 0.01085567, + "auxiliary_loss_mlp": 0.0103044, + "balance_loss_clip": 1.03258049, + "balance_loss_mlp": 1.01974726, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.9949525534419275, + "language_loss": 0.81918687, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84034693, + "num_input_tokens_seen": 267462820, + "step": 12401, + "time_per_iteration": 2.6057064533233643 + }, + { + "auxiliary_loss_clip": 0.0106425, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.03249407, + "balance_loss_mlp": 1.02138948, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 2.2084251454880466, + "language_loss": 0.64967734, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67063856, + "num_input_tokens_seen": 267483065, + "step": 12402, + "time_per_iteration": 2.6234853267669678 + }, + { + "auxiliary_loss_clip": 0.01060697, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.02862358, + "balance_loss_mlp": 1.01928842, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 4.899929661191868, + "language_loss": 0.73425901, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75516558, + "num_input_tokens_seen": 267504825, + "step": 12403, + "time_per_iteration": 2.7959306240081787 + }, + { + "auxiliary_loss_clip": 0.00996866, + "auxiliary_loss_mlp": 0.01005718, + "balance_loss_clip": 1.00657892, + "balance_loss_mlp": 1.00487804, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8263883937364609, + "language_loss": 0.58775675, + "learning_rate": 6.404228302777621e-07, + "loss": 0.6077826, + "num_input_tokens_seen": 267559260, + "step": 12404, + "time_per_iteration": 2.9956161975860596 + }, + { + "auxiliary_loss_clip": 0.01098591, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.03395748, + "balance_loss_mlp": 1.02336991, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 2.0459498550095554, + "language_loss": 0.77548695, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79681778, + "num_input_tokens_seen": 267578720, + "step": 12405, + "time_per_iteration": 2.5852737426757812 + }, + { + "auxiliary_loss_clip": 0.01066894, + "auxiliary_loss_mlp": 0.01036736, + "balance_loss_clip": 1.03055596, + "balance_loss_mlp": 1.02496457, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.6408672939452744, + "language_loss": 0.69398737, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71502376, + "num_input_tokens_seen": 267598250, + "step": 12406, + "time_per_iteration": 2.685345411300659 + }, + { + "auxiliary_loss_clip": 0.01030749, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.03244519, + "balance_loss_mlp": 1.01988888, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.735665275910025, + "language_loss": 0.65162373, + "learning_rate": 6.39566159239002e-07, + "loss": 0.67225373, + "num_input_tokens_seen": 267615430, + "step": 12407, + "time_per_iteration": 2.7105813026428223 + }, + { + "auxiliary_loss_clip": 0.0105626, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.03293216, + "balance_loss_mlp": 1.02138543, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.7466514593607922, + "language_loss": 0.71888858, + "learning_rate": 6.392807053872212e-07, + "loss": 0.73979127, + "num_input_tokens_seen": 267635075, + "step": 12408, + "time_per_iteration": 2.6788971424102783 + }, + { + "auxiliary_loss_clip": 0.01092849, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.02221704, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 1.918828197450895, + "language_loss": 0.72833717, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74961257, + "num_input_tokens_seen": 267654105, + "step": 12409, + "time_per_iteration": 4.1004931926727295 + }, + { + "auxiliary_loss_clip": 0.01086422, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.03359401, + "balance_loss_mlp": 1.02025938, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.608224035269559, + "language_loss": 0.65831941, + "learning_rate": 6.38709952490319e-07, + "loss": 0.67949331, + "num_input_tokens_seen": 267673090, + "step": 12410, + "time_per_iteration": 2.58081316947937 + }, + { + "auxiliary_loss_clip": 0.01084027, + "auxiliary_loss_mlp": 0.00749514, + "balance_loss_clip": 1.03432965, + "balance_loss_mlp": 1.00028777, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.0676488479586177, + "language_loss": 0.8433609, + "learning_rate": 6.384246534668396e-07, + "loss": 0.8616963, + "num_input_tokens_seen": 267690605, + "step": 12411, + "time_per_iteration": 2.5889229774475098 + }, + { + "auxiliary_loss_clip": 0.01064092, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.03195453, + "balance_loss_mlp": 1.01735163, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.9483344960914275, + "language_loss": 0.78132564, + "learning_rate": 6.381394060744339e-07, + "loss": 0.80225617, + "num_input_tokens_seen": 267710540, + "step": 12412, + "time_per_iteration": 2.6538686752319336 + }, + { + "auxiliary_loss_clip": 0.01050892, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.02861857, + "balance_loss_mlp": 1.01864755, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.7008227777557958, + "language_loss": 0.62390345, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64471543, + "num_input_tokens_seen": 267730780, + "step": 12413, + "time_per_iteration": 2.728602647781372 + }, + { + "auxiliary_loss_clip": 0.01016231, + "auxiliary_loss_mlp": 0.00746742, + "balance_loss_clip": 1.00600052, + "balance_loss_mlp": 0.99995577, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7176225899302581, + "language_loss": 0.54894304, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56657279, + "num_input_tokens_seen": 267794240, + "step": 12414, + "time_per_iteration": 3.1725828647613525 + }, + { + "auxiliary_loss_clip": 0.0106901, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.03075933, + "balance_loss_mlp": 1.01883364, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.6876096685529105, + "language_loss": 0.54776913, + "learning_rate": 6.372839737918154e-07, + "loss": 0.56877124, + "num_input_tokens_seen": 267817190, + "step": 12415, + "time_per_iteration": 2.6745030879974365 + }, + { + "auxiliary_loss_clip": 0.01032053, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.0307436, + "balance_loss_mlp": 1.02050519, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.545005027604256, + "language_loss": 0.74743474, + "learning_rate": 6.369989330318506e-07, + "loss": 0.76809496, + "num_input_tokens_seen": 267836245, + "step": 12416, + "time_per_iteration": 2.7611653804779053 + }, + { + "auxiliary_loss_clip": 0.01045902, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02900553, + "balance_loss_mlp": 1.02405095, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.4330086368111832, + "language_loss": 0.69437253, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71519554, + "num_input_tokens_seen": 267858310, + "step": 12417, + "time_per_iteration": 2.890378952026367 + }, + { + "auxiliary_loss_clip": 0.01075023, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.03808689, + "balance_loss_mlp": 1.02067375, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 2.260731097612353, + "language_loss": 0.73643482, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75751382, + "num_input_tokens_seen": 267876345, + "step": 12418, + "time_per_iteration": 2.641002655029297 + }, + { + "auxiliary_loss_clip": 0.01087928, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.03377569, + "balance_loss_mlp": 1.01708484, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.7287595904476791, + "language_loss": 0.68932313, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71048874, + "num_input_tokens_seen": 267896740, + "step": 12419, + "time_per_iteration": 2.5692219734191895 + }, + { + "auxiliary_loss_clip": 0.01094904, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.03410769, + "balance_loss_mlp": 1.02129495, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.838255810250797, + "language_loss": 0.74775445, + "learning_rate": 6.358592869514216e-07, + "loss": 0.76902634, + "num_input_tokens_seen": 267914765, + "step": 12420, + "time_per_iteration": 2.5278992652893066 + }, + { + "auxiliary_loss_clip": 0.01092658, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.0361656, + "balance_loss_mlp": 1.01785755, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.7681512785933882, + "language_loss": 0.67770791, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69893199, + "num_input_tokens_seen": 267934085, + "step": 12421, + "time_per_iteration": 3.993696928024292 + }, + { + "auxiliary_loss_clip": 0.01078122, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.03413177, + "balance_loss_mlp": 1.01910985, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.6869904144342955, + "language_loss": 0.72605348, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74715066, + "num_input_tokens_seen": 267955170, + "step": 12422, + "time_per_iteration": 2.6152820587158203 + }, + { + "auxiliary_loss_clip": 0.0105788, + "auxiliary_loss_mlp": 0.01031221, + "balance_loss_clip": 1.03241611, + "balance_loss_mlp": 1.01916313, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 2.188853136701214, + "language_loss": 0.74645042, + "learning_rate": 6.350050955009796e-07, + "loss": 0.76734143, + "num_input_tokens_seen": 267974980, + "step": 12423, + "time_per_iteration": 2.7138545513153076 + }, + { + "auxiliary_loss_clip": 0.01085315, + "auxiliary_loss_mlp": 0.01024202, + "balance_loss_clip": 1.03238726, + "balance_loss_mlp": 1.01349759, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.7581843510832225, + "language_loss": 0.67853463, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69962978, + "num_input_tokens_seen": 267994985, + "step": 12424, + "time_per_iteration": 2.575059652328491 + }, + { + "auxiliary_loss_clip": 0.01093494, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.03634715, + "balance_loss_mlp": 1.02449274, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.9336822923455663, + "language_loss": 0.74337721, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76467592, + "num_input_tokens_seen": 268014985, + "step": 12425, + "time_per_iteration": 2.687904119491577 + }, + { + "auxiliary_loss_clip": 0.01057538, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.03137541, + "balance_loss_mlp": 1.0192194, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 2.7015336992326433, + "language_loss": 0.69498777, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71588522, + "num_input_tokens_seen": 268034395, + "step": 12426, + "time_per_iteration": 2.607184886932373 + }, + { + "auxiliary_loss_clip": 0.01059252, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.03280997, + "balance_loss_mlp": 1.02380526, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.4136031244852314, + "language_loss": 0.65488422, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67582238, + "num_input_tokens_seen": 268054485, + "step": 12427, + "time_per_iteration": 2.6428170204162598 + }, + { + "auxiliary_loss_clip": 0.01100167, + "auxiliary_loss_mlp": 0.01029149, + "balance_loss_clip": 1.03508973, + "balance_loss_mlp": 1.01664376, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.5297721910133157, + "language_loss": 0.74710125, + "learning_rate": 6.335824784423118e-07, + "loss": 0.76839435, + "num_input_tokens_seen": 268072250, + "step": 12428, + "time_per_iteration": 2.5424294471740723 + }, + { + "auxiliary_loss_clip": 0.01094545, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.03552556, + "balance_loss_mlp": 1.01847887, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 1.9976100598148547, + "language_loss": 0.58557183, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60683262, + "num_input_tokens_seen": 268089840, + "step": 12429, + "time_per_iteration": 2.5980899333953857 + }, + { + "auxiliary_loss_clip": 0.01089465, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.03476191, + "balance_loss_mlp": 1.01709008, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.7409698951155723, + "language_loss": 0.60812455, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62930834, + "num_input_tokens_seen": 268109360, + "step": 12430, + "time_per_iteration": 2.6322836875915527 + }, + { + "auxiliary_loss_clip": 0.01076393, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.0318656, + "balance_loss_mlp": 1.01936018, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.6414497385105855, + "language_loss": 0.75596547, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77703631, + "num_input_tokens_seen": 268131840, + "step": 12431, + "time_per_iteration": 4.123002767562866 + }, + { + "auxiliary_loss_clip": 0.01088094, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03248239, + "balance_loss_mlp": 1.01704288, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.8353976125770803, + "language_loss": 0.75731182, + "learning_rate": 6.32445317395021e-07, + "loss": 0.7784791, + "num_input_tokens_seen": 268148300, + "step": 12432, + "time_per_iteration": 2.5237326622009277 + }, + { + "auxiliary_loss_clip": 0.01077835, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.03399885, + "balance_loss_mlp": 1.02182436, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 3.8388430999707928, + "language_loss": 0.69821382, + "learning_rate": 6.321611567507787e-07, + "loss": 0.71934193, + "num_input_tokens_seen": 268166450, + "step": 12433, + "time_per_iteration": 2.603682518005371 + }, + { + "auxiliary_loss_clip": 0.01050942, + "auxiliary_loss_mlp": 0.01028657, + "balance_loss_clip": 1.03065157, + "balance_loss_mlp": 1.01626539, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 2.2946693321207663, + "language_loss": 0.66828775, + "learning_rate": 6.318770479751232e-07, + "loss": 0.68908376, + "num_input_tokens_seen": 268186165, + "step": 12434, + "time_per_iteration": 2.6941158771514893 + }, + { + "auxiliary_loss_clip": 0.0109403, + "auxiliary_loss_mlp": 0.01031052, + "balance_loss_clip": 1.03354013, + "balance_loss_mlp": 1.02129459, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.452382495547313, + "language_loss": 0.79660833, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81785905, + "num_input_tokens_seen": 268208145, + "step": 12435, + "time_per_iteration": 2.585176944732666 + }, + { + "auxiliary_loss_clip": 0.01070961, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.0342232, + "balance_loss_mlp": 1.01797462, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.8639048314266293, + "language_loss": 0.68404102, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70504534, + "num_input_tokens_seen": 268228345, + "step": 12436, + "time_per_iteration": 2.7529423236846924 + }, + { + "auxiliary_loss_clip": 0.01072129, + "auxiliary_loss_mlp": 0.01034473, + "balance_loss_clip": 1.03288865, + "balance_loss_mlp": 1.02257562, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 1.6659041539817119, + "language_loss": 0.70692062, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72798669, + "num_input_tokens_seen": 268250260, + "step": 12437, + "time_per_iteration": 2.7612361907958984 + }, + { + "auxiliary_loss_clip": 0.01058037, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.03039896, + "balance_loss_mlp": 1.01795256, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.8038194889722723, + "language_loss": 0.67346233, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69432306, + "num_input_tokens_seen": 268268440, + "step": 12438, + "time_per_iteration": 2.668775796890259 + }, + { + "auxiliary_loss_clip": 0.01074029, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.0315485, + "balance_loss_mlp": 1.02009845, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.6561780255094807, + "language_loss": 0.80705738, + "learning_rate": 6.304572825026344e-07, + "loss": 0.82811236, + "num_input_tokens_seen": 268285765, + "step": 12439, + "time_per_iteration": 4.20887565612793 + }, + { + "auxiliary_loss_clip": 0.01060563, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.03072977, + "balance_loss_mlp": 1.02296293, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 5.447448884046015, + "language_loss": 0.70833546, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72928119, + "num_input_tokens_seen": 268304015, + "step": 12440, + "time_per_iteration": 2.6741139888763428 + }, + { + "auxiliary_loss_clip": 0.01077082, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.03644276, + "balance_loss_mlp": 1.01694298, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.5352531909865408, + "language_loss": 0.7400738, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76112223, + "num_input_tokens_seen": 268323290, + "step": 12441, + "time_per_iteration": 2.781677007675171 + }, + { + "auxiliary_loss_clip": 0.01092022, + "auxiliary_loss_mlp": 0.00749648, + "balance_loss_clip": 1.0356015, + "balance_loss_mlp": 1.00029171, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.303182111613231, + "language_loss": 0.82617474, + "learning_rate": 6.296060463313698e-07, + "loss": 0.8445915, + "num_input_tokens_seen": 268339490, + "step": 12442, + "time_per_iteration": 2.6117289066314697 + }, + { + "auxiliary_loss_clip": 0.01042137, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.0331769, + "balance_loss_mlp": 1.02023625, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 2.8738558046424383, + "language_loss": 0.63020432, + "learning_rate": 6.293224048575565e-07, + "loss": 0.65095139, + "num_input_tokens_seen": 268359865, + "step": 12443, + "time_per_iteration": 2.8222532272338867 + }, + { + "auxiliary_loss_clip": 0.01054532, + "auxiliary_loss_mlp": 0.01024513, + "balance_loss_clip": 1.02958846, + "balance_loss_mlp": 1.0136646, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.148343051881551, + "language_loss": 0.7152971, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73608756, + "num_input_tokens_seen": 268377065, + "step": 12444, + "time_per_iteration": 2.687378406524658 + }, + { + "auxiliary_loss_clip": 0.01045496, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.03191221, + "balance_loss_mlp": 1.01626444, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.6387317390347147, + "language_loss": 0.69157732, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71231341, + "num_input_tokens_seen": 268396935, + "step": 12445, + "time_per_iteration": 2.7109005451202393 + }, + { + "auxiliary_loss_clip": 0.01085217, + "auxiliary_loss_mlp": 0.01025004, + "balance_loss_clip": 1.03271484, + "balance_loss_mlp": 1.01394176, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 2.0463908970271083, + "language_loss": 0.74194038, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76304263, + "num_input_tokens_seen": 268414460, + "step": 12446, + "time_per_iteration": 2.565547466278076 + }, + { + "auxiliary_loss_clip": 0.01081568, + "auxiliary_loss_mlp": 0.00749606, + "balance_loss_clip": 1.03367734, + "balance_loss_mlp": 1.00030303, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.4336717872411744, + "language_loss": 0.73242772, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75073946, + "num_input_tokens_seen": 268432225, + "step": 12447, + "time_per_iteration": 2.5844945907592773 + }, + { + "auxiliary_loss_clip": 0.01057582, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.03270721, + "balance_loss_mlp": 1.01950514, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.6606766217227293, + "language_loss": 0.72101724, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74189663, + "num_input_tokens_seen": 268449270, + "step": 12448, + "time_per_iteration": 4.145671606063843 + }, + { + "auxiliary_loss_clip": 0.01101585, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.03526807, + "balance_loss_mlp": 1.02558768, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.6976949600839544, + "language_loss": 0.73673403, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75812006, + "num_input_tokens_seen": 268467250, + "step": 12449, + "time_per_iteration": 2.587305784225464 + }, + { + "auxiliary_loss_clip": 0.01063295, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.03364611, + "balance_loss_mlp": 1.02358937, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 2.3699602668442323, + "language_loss": 0.60952944, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63051867, + "num_input_tokens_seen": 268487270, + "step": 12450, + "time_per_iteration": 2.6282291412353516 + }, + { + "auxiliary_loss_clip": 0.01093367, + "auxiliary_loss_mlp": 0.01026926, + "balance_loss_clip": 1.03191578, + "balance_loss_mlp": 1.01661396, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 2.0640323861072627, + "language_loss": 0.70368683, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72488976, + "num_input_tokens_seen": 268508020, + "step": 12451, + "time_per_iteration": 2.55873703956604 + }, + { + "auxiliary_loss_clip": 0.01092807, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.0341984, + "balance_loss_mlp": 1.01857722, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 2.64250777065033, + "language_loss": 0.80367339, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82490396, + "num_input_tokens_seen": 268527375, + "step": 12452, + "time_per_iteration": 2.5721096992492676 + }, + { + "auxiliary_loss_clip": 0.01108189, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.03920889, + "balance_loss_mlp": 1.02105236, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 3.03650671733578, + "language_loss": 0.71832395, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73973829, + "num_input_tokens_seen": 268544870, + "step": 12453, + "time_per_iteration": 2.495891571044922 + }, + { + "auxiliary_loss_clip": 0.01078849, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.03482842, + "balance_loss_mlp": 1.01851296, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 2.034698612431755, + "language_loss": 0.73818707, + "learning_rate": 6.262057814417517e-07, + "loss": 0.75927019, + "num_input_tokens_seen": 268564580, + "step": 12454, + "time_per_iteration": 2.5652389526367188 + }, + { + "auxiliary_loss_clip": 0.00992684, + "auxiliary_loss_mlp": 0.01007367, + "balance_loss_clip": 1.00474, + "balance_loss_mlp": 1.0058831, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7356148707365502, + "language_loss": 0.59364784, + "learning_rate": 6.259227643920322e-07, + "loss": 0.6136483, + "num_input_tokens_seen": 268629550, + "step": 12455, + "time_per_iteration": 3.283029556274414 + }, + { + "auxiliary_loss_clip": 0.0106014, + "auxiliary_loss_mlp": 0.01028289, + "balance_loss_clip": 1.03213811, + "balance_loss_mlp": 1.01678538, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 1.9321840264181138, + "language_loss": 0.79515707, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81604135, + "num_input_tokens_seen": 268646645, + "step": 12456, + "time_per_iteration": 2.6014435291290283 + }, + { + "auxiliary_loss_clip": 0.0101605, + "auxiliary_loss_mlp": 0.00997433, + "balance_loss_clip": 1.00532627, + "balance_loss_mlp": 0.99641401, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8371159868274691, + "language_loss": 0.6145789, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63471377, + "num_input_tokens_seen": 268702275, + "step": 12457, + "time_per_iteration": 3.0521843433380127 + }, + { + "auxiliary_loss_clip": 0.01085961, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.03742671, + "balance_loss_mlp": 1.02224576, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 1.942064510573082, + "language_loss": 0.67397922, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69517213, + "num_input_tokens_seen": 268716265, + "step": 12458, + "time_per_iteration": 2.5677175521850586 + }, + { + "auxiliary_loss_clip": 0.01044518, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.02966356, + "balance_loss_mlp": 1.01877642, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 1.8111961481966852, + "language_loss": 0.79800111, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81874877, + "num_input_tokens_seen": 268734330, + "step": 12459, + "time_per_iteration": 2.639040231704712 + }, + { + "auxiliary_loss_clip": 0.01059311, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.03284419, + "balance_loss_mlp": 1.02361846, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.4871398285963182, + "language_loss": 0.80630654, + "learning_rate": 6.245084609352043e-07, + "loss": 0.8272593, + "num_input_tokens_seen": 268753500, + "step": 12460, + "time_per_iteration": 2.638087034225464 + }, + { + "auxiliary_loss_clip": 0.01068925, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.0322535, + "balance_loss_mlp": 1.02154207, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.7028036374144158, + "language_loss": 0.86344862, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88447547, + "num_input_tokens_seen": 268772055, + "step": 12461, + "time_per_iteration": 4.149841785430908 + }, + { + "auxiliary_loss_clip": 0.01080392, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.0337404, + "balance_loss_mlp": 1.02273345, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 1.7745377634636779, + "language_loss": 0.69443262, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71557188, + "num_input_tokens_seen": 268792265, + "step": 12462, + "time_per_iteration": 2.6269946098327637 + }, + { + "auxiliary_loss_clip": 0.01099757, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.03432572, + "balance_loss_mlp": 1.01887131, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.2329358248807325, + "language_loss": 0.70519996, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72650886, + "num_input_tokens_seen": 268812735, + "step": 12463, + "time_per_iteration": 2.550398826599121 + }, + { + "auxiliary_loss_clip": 0.01062035, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.03250396, + "balance_loss_mlp": 1.01997018, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.6486742526701017, + "language_loss": 0.77519494, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79612041, + "num_input_tokens_seen": 268833090, + "step": 12464, + "time_per_iteration": 2.674283504486084 + }, + { + "auxiliary_loss_clip": 0.01073643, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.03079867, + "balance_loss_mlp": 1.01758313, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.6846842867097465, + "language_loss": 0.78118366, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80220646, + "num_input_tokens_seen": 268851880, + "step": 12465, + "time_per_iteration": 2.6181259155273438 + }, + { + "auxiliary_loss_clip": 0.01067927, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.03329861, + "balance_loss_mlp": 1.02470565, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.5475277988758025, + "language_loss": 0.73873293, + "learning_rate": 6.22813018144422e-07, + "loss": 0.75979376, + "num_input_tokens_seen": 268867910, + "step": 12466, + "time_per_iteration": 2.581101894378662 + }, + { + "auxiliary_loss_clip": 0.01087427, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.03345263, + "balance_loss_mlp": 1.02266955, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 1.9563267629935002, + "language_loss": 0.66179562, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68301129, + "num_input_tokens_seen": 268887260, + "step": 12467, + "time_per_iteration": 2.560455322265625 + }, + { + "auxiliary_loss_clip": 0.01048891, + "auxiliary_loss_mlp": 0.0074975, + "balance_loss_clip": 1.03157663, + "balance_loss_mlp": 1.00031543, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.724172497379243, + "language_loss": 0.76471967, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78270608, + "num_input_tokens_seen": 268902520, + "step": 12468, + "time_per_iteration": 2.636556625366211 + }, + { + "auxiliary_loss_clip": 0.01061963, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.03383112, + "balance_loss_mlp": 1.02119374, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.385955721319652, + "language_loss": 0.69255275, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71350265, + "num_input_tokens_seen": 268920970, + "step": 12469, + "time_per_iteration": 2.6007473468780518 + }, + { + "auxiliary_loss_clip": 0.01079337, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.03385568, + "balance_loss_mlp": 1.02010167, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 2.0022314618266024, + "language_loss": 0.69401562, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71513164, + "num_input_tokens_seen": 268936600, + "step": 12470, + "time_per_iteration": 2.569282054901123 + }, + { + "auxiliary_loss_clip": 0.01071451, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03281164, + "balance_loss_mlp": 1.01972008, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 2.130525874670091, + "language_loss": 0.75087225, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77191854, + "num_input_tokens_seen": 268956560, + "step": 12471, + "time_per_iteration": 4.078276634216309 + }, + { + "auxiliary_loss_clip": 0.01070699, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.02096009, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.071906143037522, + "language_loss": 0.77183014, + "learning_rate": 6.211194553838929e-07, + "loss": 0.79288012, + "num_input_tokens_seen": 268973945, + "step": 12472, + "time_per_iteration": 2.5658507347106934 + }, + { + "auxiliary_loss_clip": 0.01085621, + "auxiliary_loss_mlp": 0.00749493, + "balance_loss_clip": 1.03289354, + "balance_loss_mlp": 1.00030398, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.4922649134262658, + "language_loss": 0.84494531, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86329651, + "num_input_tokens_seen": 268993245, + "step": 12473, + "time_per_iteration": 2.5729541778564453 + }, + { + "auxiliary_loss_clip": 0.01065994, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.03113365, + "balance_loss_mlp": 1.01867437, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 1.9044926429778182, + "language_loss": 0.73619747, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75717258, + "num_input_tokens_seen": 269012125, + "step": 12474, + "time_per_iteration": 2.668693780899048 + }, + { + "auxiliary_loss_clip": 0.01075959, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.03321838, + "balance_loss_mlp": 1.01979041, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 1.6245136715756796, + "language_loss": 0.7452817, + "learning_rate": 6.202733797375492e-07, + "loss": 0.7663604, + "num_input_tokens_seen": 269030545, + "step": 12475, + "time_per_iteration": 2.6281867027282715 + }, + { + "auxiliary_loss_clip": 0.01095552, + "auxiliary_loss_mlp": 0.01038642, + "balance_loss_clip": 1.03500247, + "balance_loss_mlp": 1.025702, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 1.8512272132637424, + "language_loss": 0.80049294, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82183492, + "num_input_tokens_seen": 269048180, + "step": 12476, + "time_per_iteration": 2.50818133354187 + }, + { + "auxiliary_loss_clip": 0.01064417, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.03296363, + "balance_loss_mlp": 1.02420759, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.7967846768372784, + "language_loss": 0.77743983, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79843873, + "num_input_tokens_seen": 269068600, + "step": 12477, + "time_per_iteration": 2.6392276287078857 + }, + { + "auxiliary_loss_clip": 0.01006432, + "auxiliary_loss_mlp": 0.01004206, + "balance_loss_clip": 1.0060271, + "balance_loss_mlp": 1.00328183, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8038337594374092, + "language_loss": 0.54380941, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56391579, + "num_input_tokens_seen": 269119045, + "step": 12478, + "time_per_iteration": 4.5429368019104 + }, + { + "auxiliary_loss_clip": 0.01070107, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.03111172, + "balance_loss_mlp": 1.02226317, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.6885675729582046, + "language_loss": 0.79942524, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82046187, + "num_input_tokens_seen": 269136755, + "step": 12479, + "time_per_iteration": 2.5686869621276855 + }, + { + "auxiliary_loss_clip": 0.01091539, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.03418839, + "balance_loss_mlp": 1.02211154, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.1965620609548195, + "language_loss": 0.62881112, + "learning_rate": 6.188643001902369e-07, + "loss": 0.65007055, + "num_input_tokens_seen": 269156120, + "step": 12480, + "time_per_iteration": 2.546553373336792 + }, + { + "auxiliary_loss_clip": 0.01070132, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.03190792, + "balance_loss_mlp": 1.02232051, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 2.067904265018188, + "language_loss": 0.78057033, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80160469, + "num_input_tokens_seen": 269175650, + "step": 12481, + "time_per_iteration": 2.549933671951294 + }, + { + "auxiliary_loss_clip": 0.01055752, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.03193188, + "balance_loss_mlp": 1.02196157, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 2.3915873714013896, + "language_loss": 0.71274817, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73364878, + "num_input_tokens_seen": 269197080, + "step": 12482, + "time_per_iteration": 2.6376900672912598 + }, + { + "auxiliary_loss_clip": 0.01101646, + "auxiliary_loss_mlp": 0.01038851, + "balance_loss_clip": 1.03583062, + "balance_loss_mlp": 1.02727616, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.5968176330393504, + "language_loss": 0.7036078, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72501284, + "num_input_tokens_seen": 269218600, + "step": 12483, + "time_per_iteration": 2.5577521324157715 + }, + { + "auxiliary_loss_clip": 0.01099349, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.03409815, + "balance_loss_mlp": 1.01626396, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 2.070396908685744, + "language_loss": 0.7446748, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76593506, + "num_input_tokens_seen": 269239245, + "step": 12484, + "time_per_iteration": 2.469417095184326 + }, + { + "auxiliary_loss_clip": 0.01077618, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.03368235, + "balance_loss_mlp": 1.01546824, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.2720776988160325, + "language_loss": 0.84630835, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86735541, + "num_input_tokens_seen": 269258520, + "step": 12485, + "time_per_iteration": 2.628248929977417 + }, + { + "auxiliary_loss_clip": 0.01062161, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.0319221, + "balance_loss_mlp": 1.01735997, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.6725243763671533, + "language_loss": 0.781663, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80257386, + "num_input_tokens_seen": 269278320, + "step": 12486, + "time_per_iteration": 2.6533195972442627 + }, + { + "auxiliary_loss_clip": 0.01086531, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.03332579, + "balance_loss_mlp": 1.01825225, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.0333596749375213, + "language_loss": 0.72641671, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74758625, + "num_input_tokens_seen": 269298025, + "step": 12487, + "time_per_iteration": 2.5616297721862793 + }, + { + "auxiliary_loss_clip": 0.01073488, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.03257155, + "balance_loss_mlp": 1.01613736, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 2.161391234982564, + "language_loss": 0.67204148, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69305146, + "num_input_tokens_seen": 269316770, + "step": 12488, + "time_per_iteration": 4.083112716674805 + }, + { + "auxiliary_loss_clip": 0.01032506, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.02985215, + "balance_loss_mlp": 1.02019203, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 9.335654901883537, + "language_loss": 0.7709707, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79162091, + "num_input_tokens_seen": 269334755, + "step": 12489, + "time_per_iteration": 2.699153184890747 + }, + { + "auxiliary_loss_clip": 0.01086019, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.03407466, + "balance_loss_mlp": 1.01790559, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 1.6473901987503186, + "language_loss": 0.75041664, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77156293, + "num_input_tokens_seen": 269353810, + "step": 12490, + "time_per_iteration": 2.623476266860962 + }, + { + "auxiliary_loss_clip": 0.0110032, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.03555894, + "balance_loss_mlp": 1.01851249, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.9222856463644176, + "language_loss": 0.7800929, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80139577, + "num_input_tokens_seen": 269372910, + "step": 12491, + "time_per_iteration": 2.572486639022827 + }, + { + "auxiliary_loss_clip": 0.01086646, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.03323913, + "balance_loss_mlp": 1.01902497, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 1.6155389779918425, + "language_loss": 0.76668382, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78784132, + "num_input_tokens_seen": 269391545, + "step": 12492, + "time_per_iteration": 2.572056770324707 + }, + { + "auxiliary_loss_clip": 0.01067868, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.03227377, + "balance_loss_mlp": 1.01777542, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 2.1828394661380885, + "language_loss": 0.7159251, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73688936, + "num_input_tokens_seen": 269408530, + "step": 12493, + "time_per_iteration": 2.6028616428375244 + }, + { + "auxiliary_loss_clip": 0.01089594, + "auxiliary_loss_mlp": 0.00749392, + "balance_loss_clip": 1.03525138, + "balance_loss_mlp": 1.00022542, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.6343327384007769, + "language_loss": 0.80507839, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82346827, + "num_input_tokens_seen": 269425930, + "step": 12494, + "time_per_iteration": 2.5687994956970215 + }, + { + "auxiliary_loss_clip": 0.01100576, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.03503668, + "balance_loss_mlp": 1.01671696, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 1.9623440316256573, + "language_loss": 0.78683829, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80812788, + "num_input_tokens_seen": 269443945, + "step": 12495, + "time_per_iteration": 2.5294368267059326 + }, + { + "auxiliary_loss_clip": 0.01099229, + "auxiliary_loss_mlp": 0.00749348, + "balance_loss_clip": 1.03461611, + "balance_loss_mlp": 1.00020766, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 3.02427690853954, + "language_loss": 0.71201599, + "learning_rate": 6.143640508441898e-07, + "loss": 0.73050171, + "num_input_tokens_seen": 269463625, + "step": 12496, + "time_per_iteration": 2.5212132930755615 + }, + { + "auxiliary_loss_clip": 0.01053369, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.03166676, + "balance_loss_mlp": 1.01885855, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 5.55452312438839, + "language_loss": 0.78216767, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80300093, + "num_input_tokens_seen": 269483415, + "step": 12497, + "time_per_iteration": 2.817375898361206 + }, + { + "auxiliary_loss_clip": 0.01089942, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.03359485, + "balance_loss_mlp": 1.02386963, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.4808167992416217, + "language_loss": 0.76849294, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78974849, + "num_input_tokens_seen": 269504635, + "step": 12498, + "time_per_iteration": 2.568441152572632 + }, + { + "auxiliary_loss_clip": 0.01076808, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.03465414, + "balance_loss_mlp": 1.02156043, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.7517495817811806, + "language_loss": 0.73965043, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76073998, + "num_input_tokens_seen": 269523955, + "step": 12499, + "time_per_iteration": 2.5621490478515625 + }, + { + "auxiliary_loss_clip": 0.01084783, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.03042793, + "balance_loss_mlp": 1.01696658, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 1.9570558348055767, + "language_loss": 0.79247266, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81359625, + "num_input_tokens_seen": 269544410, + "step": 12500, + "time_per_iteration": 2.5798025131225586 + }, + { + "auxiliary_loss_clip": 0.01098704, + "auxiliary_loss_mlp": 0.01036902, + "balance_loss_clip": 1.03731608, + "balance_loss_mlp": 1.02321124, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 2.0385095893135166, + "language_loss": 0.73801327, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75936937, + "num_input_tokens_seen": 269563315, + "step": 12501, + "time_per_iteration": 4.155551910400391 + }, + { + "auxiliary_loss_clip": 0.01075746, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.03135395, + "balance_loss_mlp": 1.01524794, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.6183646726139336, + "language_loss": 0.78479123, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80581635, + "num_input_tokens_seen": 269583950, + "step": 12502, + "time_per_iteration": 2.5921552181243896 + }, + { + "auxiliary_loss_clip": 0.01079415, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.03426278, + "balance_loss_mlp": 1.02055514, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.893214223808283, + "language_loss": 0.70452636, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72564113, + "num_input_tokens_seen": 269600120, + "step": 12503, + "time_per_iteration": 2.576050043106079 + }, + { + "auxiliary_loss_clip": 0.01023888, + "auxiliary_loss_mlp": 0.00999871, + "balance_loss_clip": 1.00369585, + "balance_loss_mlp": 0.99881601, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9840502849321339, + "language_loss": 0.64049906, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66073662, + "num_input_tokens_seen": 269659815, + "step": 12504, + "time_per_iteration": 2.9823899269104004 + }, + { + "auxiliary_loss_clip": 0.01052974, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.0280695, + "balance_loss_mlp": 1.02027309, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.4140413333105206, + "language_loss": 0.689183, + "learning_rate": 6.118385689264896e-07, + "loss": 0.71003044, + "num_input_tokens_seen": 269684565, + "step": 12505, + "time_per_iteration": 2.7384049892425537 + }, + { + "auxiliary_loss_clip": 0.01015091, + "auxiliary_loss_mlp": 0.00746724, + "balance_loss_clip": 1.00536764, + "balance_loss_mlp": 0.99989873, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6434955605783763, + "language_loss": 0.55117762, + "learning_rate": 6.11558222878809e-07, + "loss": 0.5687958, + "num_input_tokens_seen": 269752325, + "step": 12506, + "time_per_iteration": 3.1942954063415527 + }, + { + "auxiliary_loss_clip": 0.01085886, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.03429627, + "balance_loss_mlp": 1.02131271, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 2.5539377207089258, + "language_loss": 0.78426659, + "learning_rate": 6.112779294809796e-07, + "loss": 0.8054598, + "num_input_tokens_seen": 269770630, + "step": 12507, + "time_per_iteration": 2.5577621459960938 + }, + { + "auxiliary_loss_clip": 0.01074725, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.03655696, + "balance_loss_mlp": 1.02121353, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.6835644909648873, + "language_loss": 0.71066725, + "learning_rate": 6.10997688743631e-07, + "loss": 0.7317363, + "num_input_tokens_seen": 269787280, + "step": 12508, + "time_per_iteration": 2.734269618988037 + }, + { + "auxiliary_loss_clip": 0.01080996, + "auxiliary_loss_mlp": 0.01026896, + "balance_loss_clip": 1.03178179, + "balance_loss_mlp": 1.01549995, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.7201049077345942, + "language_loss": 0.72194642, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74302536, + "num_input_tokens_seen": 269805205, + "step": 12509, + "time_per_iteration": 2.5747132301330566 + }, + { + "auxiliary_loss_clip": 0.01105755, + "auxiliary_loss_mlp": 0.01037556, + "balance_loss_clip": 1.03622174, + "balance_loss_mlp": 1.02475858, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.6706622499470285, + "language_loss": 0.61951876, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64095187, + "num_input_tokens_seen": 269824820, + "step": 12510, + "time_per_iteration": 2.523216724395752 + }, + { + "auxiliary_loss_clip": 0.01086078, + "auxiliary_loss_mlp": 0.01026248, + "balance_loss_clip": 1.03449476, + "balance_loss_mlp": 1.01497054, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.6705242097338096, + "language_loss": 0.81480813, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83593142, + "num_input_tokens_seen": 269842825, + "step": 12511, + "time_per_iteration": 4.086632251739502 + }, + { + "auxiliary_loss_clip": 0.01075896, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.0338372, + "balance_loss_mlp": 1.02237761, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 2.0144445656952255, + "language_loss": 0.75741279, + "learning_rate": 6.098772526115412e-07, + "loss": 0.77852023, + "num_input_tokens_seen": 269859000, + "step": 12512, + "time_per_iteration": 2.530029296875 + }, + { + "auxiliary_loss_clip": 0.01083938, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.03244257, + "balance_loss_mlp": 1.01488829, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.7718743839798563, + "language_loss": 0.82401049, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84510696, + "num_input_tokens_seen": 269878895, + "step": 12513, + "time_per_iteration": 2.6278035640716553 + }, + { + "auxiliary_loss_clip": 0.010887, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.03516698, + "balance_loss_mlp": 1.02087772, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 1.7254551556123368, + "language_loss": 0.74523842, + "learning_rate": 6.093173507845771e-07, + "loss": 0.76645511, + "num_input_tokens_seen": 269897280, + "step": 12514, + "time_per_iteration": 2.5235254764556885 + }, + { + "auxiliary_loss_clip": 0.01079115, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.03367758, + "balance_loss_mlp": 1.01925194, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.732828798415603, + "language_loss": 0.68966013, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71074307, + "num_input_tokens_seen": 269914640, + "step": 12515, + "time_per_iteration": 2.5494465827941895 + }, + { + "auxiliary_loss_clip": 0.01089082, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.03384745, + "balance_loss_mlp": 1.02055621, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 1.6751060332687564, + "language_loss": 0.70224011, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72344673, + "num_input_tokens_seen": 269934960, + "step": 12516, + "time_per_iteration": 2.644462823867798 + }, + { + "auxiliary_loss_clip": 0.01050728, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.03273511, + "balance_loss_mlp": 1.02106035, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.4590913086455541, + "language_loss": 0.89531249, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91614479, + "num_input_tokens_seen": 269956655, + "step": 12517, + "time_per_iteration": 2.691438913345337 + }, + { + "auxiliary_loss_clip": 0.01078746, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.03469408, + "balance_loss_mlp": 1.02373254, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.9779883313496578, + "language_loss": 0.74215788, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76329279, + "num_input_tokens_seen": 269976835, + "step": 12518, + "time_per_iteration": 4.101513624191284 + }, + { + "auxiliary_loss_clip": 0.00986617, + "auxiliary_loss_mlp": 0.01001264, + "balance_loss_clip": 1.0101223, + "balance_loss_mlp": 1.00020301, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7098167681916746, + "language_loss": 0.55683172, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57671058, + "num_input_tokens_seen": 270040630, + "step": 12519, + "time_per_iteration": 3.3207602500915527 + }, + { + "auxiliary_loss_clip": 0.01082347, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.03420377, + "balance_loss_mlp": 1.01967013, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.4314029008235387, + "language_loss": 0.77822042, + "learning_rate": 6.07638911279029e-07, + "loss": 0.79934037, + "num_input_tokens_seen": 270059695, + "step": 12520, + "time_per_iteration": 2.5857648849487305 + }, + { + "auxiliary_loss_clip": 0.01077897, + "auxiliary_loss_mlp": 0.01037201, + "balance_loss_clip": 1.03091764, + "balance_loss_mlp": 1.0263356, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 2.027376027970737, + "language_loss": 0.74655342, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76770437, + "num_input_tokens_seen": 270078420, + "step": 12521, + "time_per_iteration": 2.618328332901001 + }, + { + "auxiliary_loss_clip": 0.01075213, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.03525376, + "balance_loss_mlp": 1.02181411, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.8269502830038937, + "language_loss": 0.66992718, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69101894, + "num_input_tokens_seen": 270097040, + "step": 12522, + "time_per_iteration": 2.6372368335723877 + }, + { + "auxiliary_loss_clip": 0.01090484, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.03493643, + "balance_loss_mlp": 1.02541423, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 1.7093431646969004, + "language_loss": 0.78318846, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80445516, + "num_input_tokens_seen": 270116365, + "step": 12523, + "time_per_iteration": 2.604562520980835 + }, + { + "auxiliary_loss_clip": 0.01097763, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.01641965, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 1.8980715863132183, + "language_loss": 0.80685389, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82810473, + "num_input_tokens_seen": 270135395, + "step": 12524, + "time_per_iteration": 2.5448551177978516 + }, + { + "auxiliary_loss_clip": 0.01088929, + "auxiliary_loss_mlp": 0.0074921, + "balance_loss_clip": 1.03520775, + "balance_loss_mlp": 1.00026107, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.5689221321977762, + "language_loss": 0.74139607, + "learning_rate": 6.062416635517326e-07, + "loss": 0.75977743, + "num_input_tokens_seen": 270156425, + "step": 12525, + "time_per_iteration": 2.553126335144043 + }, + { + "auxiliary_loss_clip": 0.01065999, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.03299165, + "balance_loss_mlp": 1.02215302, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.9403304000447505, + "language_loss": 0.71945572, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74044478, + "num_input_tokens_seen": 270176905, + "step": 12526, + "time_per_iteration": 2.6193106174468994 + }, + { + "auxiliary_loss_clip": 0.01077498, + "auxiliary_loss_mlp": 0.01026328, + "balance_loss_clip": 1.03377056, + "balance_loss_mlp": 1.01591456, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 1.6770031620615755, + "language_loss": 0.72177058, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74280882, + "num_input_tokens_seen": 270196640, + "step": 12527, + "time_per_iteration": 2.681211233139038 + }, + { + "auxiliary_loss_clip": 0.01064443, + "auxiliary_loss_mlp": 0.0102309, + "balance_loss_clip": 1.03369904, + "balance_loss_mlp": 1.01301038, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 2.6401082818264148, + "language_loss": 0.81207204, + "learning_rate": 6.054039490480539e-07, + "loss": 0.83294737, + "num_input_tokens_seen": 270213905, + "step": 12528, + "time_per_iteration": 4.049485445022583 + }, + { + "auxiliary_loss_clip": 0.01045281, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.0356307, + "balance_loss_mlp": 1.02410913, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 1.9292517112339334, + "language_loss": 0.85036552, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87117982, + "num_input_tokens_seen": 270231995, + "step": 12529, + "time_per_iteration": 2.7358791828155518 + }, + { + "auxiliary_loss_clip": 0.01071672, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.03625321, + "balance_loss_mlp": 1.02040172, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 2.068351235692649, + "language_loss": 0.74035299, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76138842, + "num_input_tokens_seen": 270251480, + "step": 12530, + "time_per_iteration": 2.68222713470459 + }, + { + "auxiliary_loss_clip": 0.00982525, + "auxiliary_loss_mlp": 0.00998592, + "balance_loss_clip": 1.01700449, + "balance_loss_mlp": 0.99754256, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8276294940853184, + "language_loss": 0.63613701, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65594816, + "num_input_tokens_seen": 270306480, + "step": 12531, + "time_per_iteration": 3.0705325603485107 + }, + { + "auxiliary_loss_clip": 0.01079939, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.03549182, + "balance_loss_mlp": 1.01511741, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 1.9013725558632073, + "language_loss": 0.70041507, + "learning_rate": 6.042877367909633e-07, + "loss": 0.72148353, + "num_input_tokens_seen": 270324595, + "step": 12532, + "time_per_iteration": 2.5744926929473877 + }, + { + "auxiliary_loss_clip": 0.01067695, + "auxiliary_loss_mlp": 0.01026088, + "balance_loss_clip": 1.03323412, + "balance_loss_mlp": 1.01615191, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.590254642613683, + "language_loss": 0.77694374, + "learning_rate": 6.040088160141132e-07, + "loss": 0.7978816, + "num_input_tokens_seen": 270344375, + "step": 12533, + "time_per_iteration": 2.5887365341186523 + }, + { + "auxiliary_loss_clip": 0.0101592, + "auxiliary_loss_mlp": 0.01006314, + "balance_loss_clip": 1.00591612, + "balance_loss_mlp": 1.00526452, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7867651693110957, + "language_loss": 0.57364434, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59386665, + "num_input_tokens_seen": 270405235, + "step": 12534, + "time_per_iteration": 3.111481189727783 + }, + { + "auxiliary_loss_clip": 0.01075094, + "auxiliary_loss_mlp": 0.0102645, + "balance_loss_clip": 1.03163016, + "balance_loss_mlp": 1.01547122, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.6442184155341961, + "language_loss": 0.71398801, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73500347, + "num_input_tokens_seen": 270425820, + "step": 12535, + "time_per_iteration": 2.606757164001465 + }, + { + "auxiliary_loss_clip": 0.01072258, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.02952564, + "balance_loss_mlp": 1.01699841, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 2.2835354353398634, + "language_loss": 0.80655491, + "learning_rate": 6.031723713426135e-07, + "loss": 0.82756901, + "num_input_tokens_seen": 270447120, + "step": 12536, + "time_per_iteration": 2.6512835025787354 + }, + { + "auxiliary_loss_clip": 0.01062482, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.03062832, + "balance_loss_mlp": 1.01895881, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 1.9961249329946023, + "language_loss": 0.74593925, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76685679, + "num_input_tokens_seen": 270468680, + "step": 12537, + "time_per_iteration": 2.6208949089050293 + }, + { + "auxiliary_loss_clip": 0.010989, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.03374577, + "balance_loss_mlp": 1.02002001, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.6156286716503632, + "language_loss": 0.74223584, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76353508, + "num_input_tokens_seen": 270486310, + "step": 12538, + "time_per_iteration": 2.5030460357666016 + }, + { + "auxiliary_loss_clip": 0.01061876, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.03365517, + "balance_loss_mlp": 1.02088428, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.6180278106133692, + "language_loss": 0.67391789, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69486177, + "num_input_tokens_seen": 270507210, + "step": 12539, + "time_per_iteration": 2.61068058013916 + }, + { + "auxiliary_loss_clip": 0.01099307, + "auxiliary_loss_mlp": 0.01026423, + "balance_loss_clip": 1.03577018, + "balance_loss_mlp": 1.01509166, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.6584074740002794, + "language_loss": 0.74580312, + "learning_rate": 6.020578533797229e-07, + "loss": 0.7670604, + "num_input_tokens_seen": 270525250, + "step": 12540, + "time_per_iteration": 2.548736333847046 + }, + { + "auxiliary_loss_clip": 0.01100499, + "auxiliary_loss_mlp": 0.01032683, + "balance_loss_clip": 1.03399992, + "balance_loss_mlp": 1.02094126, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.227122447469515, + "language_loss": 0.72737485, + "learning_rate": 6.017793563878566e-07, + "loss": 0.7487067, + "num_input_tokens_seen": 270539295, + "step": 12541, + "time_per_iteration": 3.9425618648529053 + }, + { + "auxiliary_loss_clip": 0.01097668, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.03412151, + "balance_loss_mlp": 1.01850772, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.55004217859591, + "language_loss": 0.72186935, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74314302, + "num_input_tokens_seen": 270562815, + "step": 12542, + "time_per_iteration": 2.6896860599517822 + }, + { + "auxiliary_loss_clip": 0.01069942, + "auxiliary_loss_mlp": 0.01024248, + "balance_loss_clip": 1.03056824, + "balance_loss_mlp": 1.01330447, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 2.1658905780023097, + "language_loss": 0.84596014, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86690211, + "num_input_tokens_seen": 270579055, + "step": 12543, + "time_per_iteration": 2.6452689170837402 + }, + { + "auxiliary_loss_clip": 0.01068928, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.03796124, + "balance_loss_mlp": 1.02081573, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.9601593806025144, + "language_loss": 0.73801464, + "learning_rate": 6.009441835784927e-07, + "loss": 0.75902069, + "num_input_tokens_seen": 270599080, + "step": 12544, + "time_per_iteration": 2.6633830070495605 + }, + { + "auxiliary_loss_clip": 0.01086768, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03500748, + "balance_loss_mlp": 1.0195272, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 2.3866617322529944, + "language_loss": 0.67595816, + "learning_rate": 6.006658987326383e-07, + "loss": 0.69712949, + "num_input_tokens_seen": 270618715, + "step": 12545, + "time_per_iteration": 2.5676321983337402 + }, + { + "auxiliary_loss_clip": 0.01072349, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.03135228, + "balance_loss_mlp": 1.02101636, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.765478450367514, + "language_loss": 0.68705487, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70810592, + "num_input_tokens_seen": 270635695, + "step": 12546, + "time_per_iteration": 2.574928045272827 + }, + { + "auxiliary_loss_clip": 0.01085931, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.03378963, + "balance_loss_mlp": 1.01926136, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.2829252162836062, + "language_loss": 0.73398525, + "learning_rate": 6.00109488240147e-07, + "loss": 0.7551558, + "num_input_tokens_seen": 270654325, + "step": 12547, + "time_per_iteration": 2.5501039028167725 + }, + { + "auxiliary_loss_clip": 0.01099188, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.0348165, + "balance_loss_mlp": 1.01729679, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 1.984795758915025, + "language_loss": 0.67855716, + "learning_rate": 5.998313626146099e-07, + "loss": 0.69984585, + "num_input_tokens_seen": 270674260, + "step": 12548, + "time_per_iteration": 2.5034828186035156 + }, + { + "auxiliary_loss_clip": 0.01078172, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.03235126, + "balance_loss_mlp": 1.01931632, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 2.0434901301171973, + "language_loss": 0.87311405, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89420533, + "num_input_tokens_seen": 270692200, + "step": 12549, + "time_per_iteration": 2.598562002182007 + }, + { + "auxiliary_loss_clip": 0.01050436, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.03179097, + "balance_loss_mlp": 1.02265906, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.7398995398616748, + "language_loss": 0.77221644, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79305434, + "num_input_tokens_seen": 270709675, + "step": 12550, + "time_per_iteration": 2.7038702964782715 + }, + { + "auxiliary_loss_clip": 0.01099035, + "auxiliary_loss_mlp": 0.01022663, + "balance_loss_clip": 1.03334808, + "balance_loss_mlp": 1.01196361, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.435964328124782, + "language_loss": 0.69528532, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71650219, + "num_input_tokens_seen": 270733055, + "step": 12551, + "time_per_iteration": 4.110002279281616 + }, + { + "auxiliary_loss_clip": 0.01074942, + "auxiliary_loss_mlp": 0.01026193, + "balance_loss_clip": 1.03421557, + "balance_loss_mlp": 1.01466548, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 2.2735476909283476, + "language_loss": 0.86253214, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88354355, + "num_input_tokens_seen": 270749275, + "step": 12552, + "time_per_iteration": 2.56125545501709 + }, + { + "auxiliary_loss_clip": 0.01088856, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.03357244, + "balance_loss_mlp": 1.01901078, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 1.5918886049231094, + "language_loss": 0.78159058, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80278152, + "num_input_tokens_seen": 270768230, + "step": 12553, + "time_per_iteration": 2.5952517986297607 + }, + { + "auxiliary_loss_clip": 0.01088311, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.03546703, + "balance_loss_mlp": 1.02008688, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.2228401969167564, + "language_loss": 0.62725008, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64844668, + "num_input_tokens_seen": 270786285, + "step": 12554, + "time_per_iteration": 2.615689277648926 + }, + { + "auxiliary_loss_clip": 0.01075087, + "auxiliary_loss_mlp": 0.01031957, + "balance_loss_clip": 1.03252089, + "balance_loss_mlp": 1.02131748, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.5994333149011637, + "language_loss": 0.73437512, + "learning_rate": 5.978859704731864e-07, + "loss": 0.7554456, + "num_input_tokens_seen": 270805505, + "step": 12555, + "time_per_iteration": 2.707551956176758 + }, + { + "auxiliary_loss_clip": 0.01076328, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.03629947, + "balance_loss_mlp": 1.01884067, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 2.088345423406189, + "language_loss": 0.79015172, + "learning_rate": 5.976082698990645e-07, + "loss": 0.8112179, + "num_input_tokens_seen": 270824610, + "step": 12556, + "time_per_iteration": 2.596280097961426 + }, + { + "auxiliary_loss_clip": 0.01014632, + "auxiliary_loss_mlp": 0.01007963, + "balance_loss_clip": 1.00441003, + "balance_loss_mlp": 1.00708723, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.7372877734928354, + "language_loss": 0.50422525, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52445114, + "num_input_tokens_seen": 270886155, + "step": 12557, + "time_per_iteration": 3.078089952468872 + }, + { + "auxiliary_loss_clip": 0.01089358, + "auxiliary_loss_mlp": 0.01027645, + "balance_loss_clip": 1.03657401, + "balance_loss_mlp": 1.01623106, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 2.4955965014633827, + "language_loss": 0.71554083, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73671085, + "num_input_tokens_seen": 270905325, + "step": 12558, + "time_per_iteration": 4.137371301651001 + }, + { + "auxiliary_loss_clip": 0.01071681, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.03221893, + "balance_loss_mlp": 1.0207715, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.789338804684765, + "language_loss": 0.79939032, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82043695, + "num_input_tokens_seen": 270927535, + "step": 12559, + "time_per_iteration": 2.669074535369873 + }, + { + "auxiliary_loss_clip": 0.01051322, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.0333848, + "balance_loss_mlp": 1.01670265, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 2.0404011109988045, + "language_loss": 0.78234839, + "learning_rate": 5.96497999496199e-07, + "loss": 0.80314744, + "num_input_tokens_seen": 270946920, + "step": 12560, + "time_per_iteration": 2.660214900970459 + }, + { + "auxiliary_loss_clip": 0.01044639, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.02978325, + "balance_loss_mlp": 1.02558815, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 2.308573767806388, + "language_loss": 0.70878112, + "learning_rate": 5.96220564921515e-07, + "loss": 0.72960293, + "num_input_tokens_seen": 270965705, + "step": 12561, + "time_per_iteration": 2.722083568572998 + }, + { + "auxiliary_loss_clip": 0.01068179, + "auxiliary_loss_mlp": 0.00749719, + "balance_loss_clip": 1.02915692, + "balance_loss_mlp": 1.00037646, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.7832041750692096, + "language_loss": 0.75766927, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77584827, + "num_input_tokens_seen": 270986550, + "step": 12562, + "time_per_iteration": 2.6360390186309814 + }, + { + "auxiliary_loss_clip": 0.010743, + "auxiliary_loss_mlp": 0.01027535, + "balance_loss_clip": 1.03317726, + "balance_loss_mlp": 1.01589394, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 1.995495768052928, + "language_loss": 0.75749916, + "learning_rate": 5.956658554770371e-07, + "loss": 0.77851748, + "num_input_tokens_seen": 271006250, + "step": 12563, + "time_per_iteration": 2.676558494567871 + }, + { + "auxiliary_loss_clip": 0.01067698, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.0332644, + "balance_loss_mlp": 1.02089107, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.7540921743429467, + "language_loss": 0.67553568, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69656652, + "num_input_tokens_seen": 271025575, + "step": 12564, + "time_per_iteration": 2.745638132095337 + }, + { + "auxiliary_loss_clip": 0.01069446, + "auxiliary_loss_mlp": 0.01037339, + "balance_loss_clip": 1.03280866, + "balance_loss_mlp": 1.0243746, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 2.74799908000977, + "language_loss": 0.68610001, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70716786, + "num_input_tokens_seen": 271045805, + "step": 12565, + "time_per_iteration": 2.606844663619995 + }, + { + "auxiliary_loss_clip": 0.01079024, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.0319277, + "balance_loss_mlp": 1.01701486, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.627449765555636, + "language_loss": 0.74913758, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77022105, + "num_input_tokens_seen": 271066065, + "step": 12566, + "time_per_iteration": 2.686422109603882 + }, + { + "auxiliary_loss_clip": 0.01092209, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03456903, + "balance_loss_mlp": 1.01992655, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 1.9917851751331743, + "language_loss": 0.74055004, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76179552, + "num_input_tokens_seen": 271085870, + "step": 12567, + "time_per_iteration": 2.548600196838379 + }, + { + "auxiliary_loss_clip": 0.01099014, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.03385818, + "balance_loss_mlp": 1.01636887, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.9410185386173138, + "language_loss": 0.62990236, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65116435, + "num_input_tokens_seen": 271104260, + "step": 12568, + "time_per_iteration": 2.5406124591827393 + }, + { + "auxiliary_loss_clip": 0.01009164, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.03053093, + "balance_loss_mlp": 1.0222671, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 2.265560500369489, + "language_loss": 0.67125285, + "learning_rate": 5.940030055397789e-07, + "loss": 0.69168758, + "num_input_tokens_seen": 271125745, + "step": 12569, + "time_per_iteration": 4.460738897323608 + }, + { + "auxiliary_loss_clip": 0.01092928, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.0364933, + "balance_loss_mlp": 1.02634144, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.9970935819201394, + "language_loss": 0.67316341, + "learning_rate": 5.93726050426697e-07, + "loss": 0.6944834, + "num_input_tokens_seen": 271147145, + "step": 12570, + "time_per_iteration": 2.9084632396698 + }, + { + "auxiliary_loss_clip": 0.01101148, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.03534865, + "balance_loss_mlp": 1.01880074, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 2.0777938169805803, + "language_loss": 0.71604061, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73735988, + "num_input_tokens_seen": 271170865, + "step": 12571, + "time_per_iteration": 2.8723974227905273 + }, + { + "auxiliary_loss_clip": 0.01050576, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.02974844, + "balance_loss_mlp": 1.019629, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.7324059276939576, + "language_loss": 0.73622119, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75705564, + "num_input_tokens_seen": 271191450, + "step": 12572, + "time_per_iteration": 2.722207546234131 + }, + { + "auxiliary_loss_clip": 0.01080681, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.0358206, + "balance_loss_mlp": 1.02097499, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.3527139007980375, + "language_loss": 0.76713622, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78826678, + "num_input_tokens_seen": 271207335, + "step": 12573, + "time_per_iteration": 2.589132785797119 + }, + { + "auxiliary_loss_clip": 0.01075505, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.03564918, + "balance_loss_mlp": 1.02018392, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.5225785799668412, + "language_loss": 0.69395578, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71503019, + "num_input_tokens_seen": 271226895, + "step": 12574, + "time_per_iteration": 2.7426021099090576 + }, + { + "auxiliary_loss_clip": 0.01056932, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.02809846, + "balance_loss_mlp": 1.02312434, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.1160340449513177, + "language_loss": 0.71649414, + "learning_rate": 5.923420749619974e-07, + "loss": 0.73741311, + "num_input_tokens_seen": 271244375, + "step": 12575, + "time_per_iteration": 3.001431703567505 + }, + { + "auxiliary_loss_clip": 0.0109573, + "auxiliary_loss_mlp": 0.00749511, + "balance_loss_clip": 1.03207135, + "balance_loss_mlp": 1.00028968, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.215274512062847, + "language_loss": 0.71834314, + "learning_rate": 5.92065439962673e-07, + "loss": 0.73679554, + "num_input_tokens_seen": 271259530, + "step": 12576, + "time_per_iteration": 2.5063862800598145 + }, + { + "auxiliary_loss_clip": 0.01062713, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.03277564, + "balance_loss_mlp": 1.01801658, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 2.407306411463662, + "language_loss": 0.67078668, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69170672, + "num_input_tokens_seen": 271276835, + "step": 12577, + "time_per_iteration": 2.680633783340454 + }, + { + "auxiliary_loss_clip": 0.01077119, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.03202856, + "balance_loss_mlp": 1.02336669, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.6314846161831922, + "language_loss": 0.77907908, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80019569, + "num_input_tokens_seen": 271296275, + "step": 12578, + "time_per_iteration": 2.5984063148498535 + }, + { + "auxiliary_loss_clip": 0.01087575, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.03234053, + "balance_loss_mlp": 1.01768994, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.876188005091209, + "language_loss": 0.75734711, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77851725, + "num_input_tokens_seen": 271315685, + "step": 12579, + "time_per_iteration": 2.5646495819091797 + }, + { + "auxiliary_loss_clip": 0.01057848, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.03269398, + "balance_loss_mlp": 1.01894498, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 2.58607656827911, + "language_loss": 0.62831879, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64921772, + "num_input_tokens_seen": 271336790, + "step": 12580, + "time_per_iteration": 2.896768808364868 + }, + { + "auxiliary_loss_clip": 0.01052152, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.03120089, + "balance_loss_mlp": 1.01710653, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 4.018318470685749, + "language_loss": 0.74944484, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77024949, + "num_input_tokens_seen": 271355470, + "step": 12581, + "time_per_iteration": 4.271665096282959 + }, + { + "auxiliary_loss_clip": 0.01058201, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.03234708, + "balance_loss_mlp": 1.01739788, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 4.329994498348627, + "language_loss": 0.62735313, + "learning_rate": 5.904067515031412e-07, + "loss": 0.64821976, + "num_input_tokens_seen": 271375810, + "step": 12582, + "time_per_iteration": 2.7309021949768066 + }, + { + "auxiliary_loss_clip": 0.01024341, + "auxiliary_loss_mlp": 0.01001978, + "balance_loss_clip": 1.0043273, + "balance_loss_mlp": 1.00100064, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9846278012387689, + "language_loss": 0.60602975, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62629294, + "num_input_tokens_seen": 271424775, + "step": 12583, + "time_per_iteration": 2.844020128250122 + }, + { + "auxiliary_loss_clip": 0.0107386, + "auxiliary_loss_mlp": 0.01033145, + "balance_loss_clip": 1.03391981, + "balance_loss_mlp": 1.02159309, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.2101144229138194, + "language_loss": 0.78882277, + "learning_rate": 5.898542828535125e-07, + "loss": 0.80989283, + "num_input_tokens_seen": 271440500, + "step": 12584, + "time_per_iteration": 2.5836503505706787 + }, + { + "auxiliary_loss_clip": 0.01062724, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.03007817, + "balance_loss_mlp": 1.02563894, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 1.8888364778641213, + "language_loss": 0.78041816, + "learning_rate": 5.895781287327612e-07, + "loss": 0.80142981, + "num_input_tokens_seen": 271458180, + "step": 12585, + "time_per_iteration": 2.6735918521881104 + }, + { + "auxiliary_loss_clip": 0.01104315, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.03693914, + "balance_loss_mlp": 1.02330267, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.6164160689968028, + "language_loss": 0.83073324, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85213292, + "num_input_tokens_seen": 271475730, + "step": 12586, + "time_per_iteration": 2.629814386367798 + }, + { + "auxiliary_loss_clip": 0.01103761, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.03661942, + "balance_loss_mlp": 1.01849747, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 1.8693270800525468, + "language_loss": 0.83187145, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85320497, + "num_input_tokens_seen": 271495030, + "step": 12587, + "time_per_iteration": 2.5812835693359375 + }, + { + "auxiliary_loss_clip": 0.01062924, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.03089404, + "balance_loss_mlp": 1.01615322, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 2.165864402859042, + "language_loss": 0.71018064, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73108327, + "num_input_tokens_seen": 271515355, + "step": 12588, + "time_per_iteration": 2.680142402648926 + }, + { + "auxiliary_loss_clip": 0.01100496, + "auxiliary_loss_mlp": 0.00749591, + "balance_loss_clip": 1.03422594, + "balance_loss_mlp": 1.00030208, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.6267176816865765, + "language_loss": 0.68637264, + "learning_rate": 5.884740471878327e-07, + "loss": 0.7048735, + "num_input_tokens_seen": 271535090, + "step": 12589, + "time_per_iteration": 2.561563730239868 + }, + { + "auxiliary_loss_clip": 0.01087212, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.03284693, + "balance_loss_mlp": 1.01539731, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.8666191279214086, + "language_loss": 0.92209703, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94323921, + "num_input_tokens_seen": 271551075, + "step": 12590, + "time_per_iteration": 2.535510778427124 + }, + { + "auxiliary_loss_clip": 0.0106802, + "auxiliary_loss_mlp": 0.01028996, + "balance_loss_clip": 1.03025138, + "balance_loss_mlp": 1.01762938, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 2.3723665149357362, + "language_loss": 0.65446723, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67543733, + "num_input_tokens_seen": 271571035, + "step": 12591, + "time_per_iteration": 4.233403205871582 + }, + { + "auxiliary_loss_clip": 0.01087487, + "auxiliary_loss_mlp": 0.01026437, + "balance_loss_clip": 1.03501344, + "balance_loss_mlp": 1.01589346, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.6841716663100716, + "language_loss": 0.73561889, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75675815, + "num_input_tokens_seen": 271592950, + "step": 12592, + "time_per_iteration": 2.574373960494995 + }, + { + "auxiliary_loss_clip": 0.01088228, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.03312612, + "balance_loss_mlp": 1.0247004, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.1410824701794264, + "language_loss": 0.71299648, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73424101, + "num_input_tokens_seen": 271608835, + "step": 12593, + "time_per_iteration": 2.5195224285125732 + }, + { + "auxiliary_loss_clip": 0.01101217, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.03474498, + "balance_loss_mlp": 1.01844764, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 1.9207553486010636, + "language_loss": 0.65752059, + "learning_rate": 5.870951496521903e-07, + "loss": 0.67883152, + "num_input_tokens_seen": 271627730, + "step": 12594, + "time_per_iteration": 2.473008155822754 + }, + { + "auxiliary_loss_clip": 0.01067897, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.03292477, + "balance_loss_mlp": 1.02010524, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 2.259096642512084, + "language_loss": 0.81115568, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83214754, + "num_input_tokens_seen": 271646415, + "step": 12595, + "time_per_iteration": 2.5814626216888428 + }, + { + "auxiliary_loss_clip": 0.01066693, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.03424013, + "balance_loss_mlp": 1.01967144, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 2.629825124357732, + "language_loss": 0.71707612, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73805082, + "num_input_tokens_seen": 271666240, + "step": 12596, + "time_per_iteration": 2.5654728412628174 + }, + { + "auxiliary_loss_clip": 0.01004484, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.03294027, + "balance_loss_mlp": 1.02154899, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.517663144693805, + "language_loss": 0.8053956, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82576132, + "num_input_tokens_seen": 271686370, + "step": 12597, + "time_per_iteration": 3.036101818084717 + }, + { + "auxiliary_loss_clip": 0.0107249, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.03512621, + "balance_loss_mlp": 1.01880121, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 2.6463224120234434, + "language_loss": 0.83208418, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85311592, + "num_input_tokens_seen": 271705050, + "step": 12598, + "time_per_iteration": 4.937488555908203 + }, + { + "auxiliary_loss_clip": 0.01073071, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.03395653, + "balance_loss_mlp": 1.01435661, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.7724244935213993, + "language_loss": 0.62610286, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64707798, + "num_input_tokens_seen": 271724915, + "step": 12599, + "time_per_iteration": 2.6711862087249756 + }, + { + "auxiliary_loss_clip": 0.01073308, + "auxiliary_loss_mlp": 0.00749682, + "balance_loss_clip": 1.03335047, + "balance_loss_mlp": 1.00032687, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.3862302515474823, + "language_loss": 0.63096195, + "learning_rate": 5.854422407815161e-07, + "loss": 0.64919192, + "num_input_tokens_seen": 271742410, + "step": 12600, + "time_per_iteration": 2.7245378494262695 + }, + { + "auxiliary_loss_clip": 0.01070646, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.03258097, + "balance_loss_mlp": 1.0183897, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.9122133543339916, + "language_loss": 0.66012853, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68114442, + "num_input_tokens_seen": 271761425, + "step": 12601, + "time_per_iteration": 2.698826313018799 + }, + { + "auxiliary_loss_clip": 0.01068883, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.03281999, + "balance_loss_mlp": 1.01675308, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.5995190711014888, + "language_loss": 0.67334628, + "learning_rate": 5.848917001679335e-07, + "loss": 0.6943084, + "num_input_tokens_seen": 271780875, + "step": 12602, + "time_per_iteration": 2.597014904022217 + }, + { + "auxiliary_loss_clip": 0.01092341, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.03584969, + "balance_loss_mlp": 1.02199423, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.8267475476367345, + "language_loss": 0.66869617, + "learning_rate": 5.846165103474967e-07, + "loss": 0.68996096, + "num_input_tokens_seen": 271799490, + "step": 12603, + "time_per_iteration": 2.5809731483459473 + }, + { + "auxiliary_loss_clip": 0.01073161, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.03142464, + "balance_loss_mlp": 1.02114618, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 2.3901544469914655, + "language_loss": 0.61946094, + "learning_rate": 5.843413741985439e-07, + "loss": 0.64050591, + "num_input_tokens_seen": 271817040, + "step": 12604, + "time_per_iteration": 2.630866289138794 + }, + { + "auxiliary_loss_clip": 0.01100945, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.03720069, + "balance_loss_mlp": 1.0262115, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.7326969972747253, + "language_loss": 0.79787427, + "learning_rate": 5.840662917315076e-07, + "loss": 0.81926131, + "num_input_tokens_seen": 271835480, + "step": 12605, + "time_per_iteration": 2.5446653366088867 + }, + { + "auxiliary_loss_clip": 0.01103388, + "auxiliary_loss_mlp": 0.0102784, + "balance_loss_clip": 1.0352056, + "balance_loss_mlp": 1.0160147, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 9.23018225576452, + "language_loss": 0.7882148, + "learning_rate": 5.837912629568198e-07, + "loss": 0.8095271, + "num_input_tokens_seen": 271849835, + "step": 12606, + "time_per_iteration": 2.5136666297912598 + }, + { + "auxiliary_loss_clip": 0.01081132, + "auxiliary_loss_mlp": 0.01027086, + "balance_loss_clip": 1.03341806, + "balance_loss_mlp": 1.01728129, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.394883471821741, + "language_loss": 0.73093969, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75202191, + "num_input_tokens_seen": 271869560, + "step": 12607, + "time_per_iteration": 2.5791234970092773 + }, + { + "auxiliary_loss_clip": 0.01074341, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.03473806, + "balance_loss_mlp": 1.01843286, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 4.210218772723734, + "language_loss": 0.74580264, + "learning_rate": 5.83241366526202e-07, + "loss": 0.76684737, + "num_input_tokens_seen": 271887950, + "step": 12608, + "time_per_iteration": 2.6206750869750977 + }, + { + "auxiliary_loss_clip": 0.0106392, + "auxiliary_loss_mlp": 0.00749373, + "balance_loss_clip": 1.03236055, + "balance_loss_mlp": 1.00024343, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.7788793240868759, + "language_loss": 0.71239924, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73053211, + "num_input_tokens_seen": 271907700, + "step": 12609, + "time_per_iteration": 4.255563974380493 + }, + { + "auxiliary_loss_clip": 0.01100441, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.03387117, + "balance_loss_mlp": 1.01954198, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 1.6728969469055588, + "language_loss": 0.81597364, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83729541, + "num_input_tokens_seen": 271926840, + "step": 12610, + "time_per_iteration": 2.51523494720459 + }, + { + "auxiliary_loss_clip": 0.01083519, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.03569245, + "balance_loss_mlp": 1.02025843, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.6738188751273593, + "language_loss": 0.70652771, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72768223, + "num_input_tokens_seen": 271946465, + "step": 12611, + "time_per_iteration": 2.5950634479522705 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.03643036, + "balance_loss_mlp": 1.01902819, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 2.7831181293152825, + "language_loss": 0.70594233, + "learning_rate": 5.821422184318893e-07, + "loss": 0.72725999, + "num_input_tokens_seen": 271967295, + "step": 12612, + "time_per_iteration": 2.5753836631774902 + }, + { + "auxiliary_loss_clip": 0.01036278, + "auxiliary_loss_mlp": 0.0103456, + "balance_loss_clip": 1.03059185, + "balance_loss_mlp": 1.0233897, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.4990204190267862, + "language_loss": 0.59679264, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61750096, + "num_input_tokens_seen": 271987960, + "step": 12613, + "time_per_iteration": 2.749195098876953 + }, + { + "auxiliary_loss_clip": 0.01071994, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.03184462, + "balance_loss_mlp": 1.02681077, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.513039860955844, + "language_loss": 0.59900033, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62011707, + "num_input_tokens_seen": 272011780, + "step": 12614, + "time_per_iteration": 2.7243614196777344 + }, + { + "auxiliary_loss_clip": 0.01063685, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.03107095, + "balance_loss_mlp": 1.01779723, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 2.7896263170647875, + "language_loss": 0.73155951, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75249255, + "num_input_tokens_seen": 272030825, + "step": 12615, + "time_per_iteration": 2.6039750576019287 + }, + { + "auxiliary_loss_clip": 0.01008827, + "auxiliary_loss_mlp": 0.00999248, + "balance_loss_clip": 1.01041758, + "balance_loss_mlp": 0.99811596, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.835311917348228, + "language_loss": 0.67716992, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69725072, + "num_input_tokens_seen": 272095825, + "step": 12616, + "time_per_iteration": 3.202035665512085 + }, + { + "auxiliary_loss_clip": 0.0106701, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.03445268, + "balance_loss_mlp": 1.02076614, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.7991461669611164, + "language_loss": 0.85065556, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87165177, + "num_input_tokens_seen": 272113950, + "step": 12617, + "time_per_iteration": 2.5947842597961426 + }, + { + "auxiliary_loss_clip": 0.0106803, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.03556657, + "balance_loss_mlp": 1.02177572, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 7.053915669941014, + "language_loss": 0.75099462, + "learning_rate": 5.804951094578757e-07, + "loss": 0.77199918, + "num_input_tokens_seen": 272130315, + "step": 12618, + "time_per_iteration": 2.6336050033569336 + }, + { + "auxiliary_loss_clip": 0.01078372, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.03398204, + "balance_loss_mlp": 1.01872778, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 1.9618575911412872, + "language_loss": 0.76972103, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79081011, + "num_input_tokens_seen": 272149080, + "step": 12619, + "time_per_iteration": 2.57582950592041 + }, + { + "auxiliary_loss_clip": 0.01057321, + "auxiliary_loss_mlp": 0.01032294, + "balance_loss_clip": 1.03115201, + "balance_loss_mlp": 1.02065313, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.8114303559903033, + "language_loss": 0.82539475, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84629095, + "num_input_tokens_seen": 272168285, + "step": 12620, + "time_per_iteration": 2.715946674346924 + }, + { + "auxiliary_loss_clip": 0.01074441, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.03302467, + "balance_loss_mlp": 1.02087593, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.273288937306679, + "language_loss": 0.82289469, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84397674, + "num_input_tokens_seen": 272184585, + "step": 12621, + "time_per_iteration": 4.115125417709351 + }, + { + "auxiliary_loss_clip": 0.01078568, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.03444743, + "balance_loss_mlp": 1.01847816, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 2.074988158978937, + "language_loss": 0.73737711, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75845963, + "num_input_tokens_seen": 272200205, + "step": 12622, + "time_per_iteration": 2.5718886852264404 + }, + { + "auxiliary_loss_clip": 0.01014372, + "auxiliary_loss_mlp": 0.00999649, + "balance_loss_clip": 1.00401616, + "balance_loss_mlp": 0.99861187, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8701047192236024, + "language_loss": 0.60908747, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62922764, + "num_input_tokens_seen": 272259670, + "step": 12623, + "time_per_iteration": 3.117655038833618 + }, + { + "auxiliary_loss_clip": 0.01097512, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.03516507, + "balance_loss_mlp": 1.02259493, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 2.1190716068023936, + "language_loss": 0.67113352, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69244111, + "num_input_tokens_seen": 272277925, + "step": 12624, + "time_per_iteration": 2.521286725997925 + }, + { + "auxiliary_loss_clip": 0.01097826, + "auxiliary_loss_mlp": 0.01026992, + "balance_loss_clip": 1.03482509, + "balance_loss_mlp": 1.0159117, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.750969201535104, + "language_loss": 0.75923312, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78048134, + "num_input_tokens_seen": 272296010, + "step": 12625, + "time_per_iteration": 2.5788142681121826 + }, + { + "auxiliary_loss_clip": 0.01069223, + "auxiliary_loss_mlp": 0.01046618, + "balance_loss_clip": 1.03187466, + "balance_loss_mlp": 1.03335595, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.8615008401464175, + "language_loss": 0.62959945, + "learning_rate": 5.783019789020977e-07, + "loss": 0.65075791, + "num_input_tokens_seen": 272318330, + "step": 12626, + "time_per_iteration": 2.671886444091797 + }, + { + "auxiliary_loss_clip": 0.01064404, + "auxiliary_loss_mlp": 0.00749544, + "balance_loss_clip": 1.03684449, + "balance_loss_mlp": 1.00028682, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.053160387424179, + "language_loss": 0.74239969, + "learning_rate": 5.780280800727084e-07, + "loss": 0.76053917, + "num_input_tokens_seen": 272335265, + "step": 12627, + "time_per_iteration": 2.6793384552001953 + }, + { + "auxiliary_loss_clip": 0.01090023, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.0345757, + "balance_loss_mlp": 1.01632953, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.0980736067482626, + "language_loss": 0.68491191, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70608908, + "num_input_tokens_seen": 272354795, + "step": 12628, + "time_per_iteration": 2.5710525512695312 + }, + { + "auxiliary_loss_clip": 0.01093667, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.0378933, + "balance_loss_mlp": 1.02230716, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 2.861373611035606, + "language_loss": 0.63606352, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65734732, + "num_input_tokens_seen": 272372875, + "step": 12629, + "time_per_iteration": 2.675250291824341 + }, + { + "auxiliary_loss_clip": 0.01071544, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.03002477, + "balance_loss_mlp": 1.01697278, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.4412860002232826, + "language_loss": 0.77781618, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79880869, + "num_input_tokens_seen": 272394715, + "step": 12630, + "time_per_iteration": 2.7129452228546143 + }, + { + "auxiliary_loss_clip": 0.01024322, + "auxiliary_loss_mlp": 0.01000521, + "balance_loss_clip": 1.00429416, + "balance_loss_mlp": 0.99952525, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8213376088970952, + "language_loss": 0.61524916, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63549757, + "num_input_tokens_seen": 272458775, + "step": 12631, + "time_per_iteration": 4.602959394454956 + }, + { + "auxiliary_loss_clip": 0.01070536, + "auxiliary_loss_mlp": 0.00749708, + "balance_loss_clip": 1.03340864, + "balance_loss_mlp": 1.00033379, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 2.0066476640445665, + "language_loss": 0.7390033, + "learning_rate": 5.766593949531767e-07, + "loss": 0.75720584, + "num_input_tokens_seen": 272479355, + "step": 12632, + "time_per_iteration": 2.6205084323883057 + }, + { + "auxiliary_loss_clip": 0.01077803, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.03407741, + "balance_loss_mlp": 1.02077484, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.920357122102839, + "language_loss": 0.75098681, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77208358, + "num_input_tokens_seen": 272493555, + "step": 12633, + "time_per_iteration": 2.551039695739746 + }, + { + "auxiliary_loss_clip": 0.01075961, + "auxiliary_loss_mlp": 0.01025297, + "balance_loss_clip": 1.03440249, + "balance_loss_mlp": 1.01488996, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 2.3023586479050224, + "language_loss": 0.7326228, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75363541, + "num_input_tokens_seen": 272508925, + "step": 12634, + "time_per_iteration": 2.5319647789001465 + }, + { + "auxiliary_loss_clip": 0.01102159, + "auxiliary_loss_mlp": 0.01035436, + "balance_loss_clip": 1.03650653, + "balance_loss_mlp": 1.02387881, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.7003586825118537, + "language_loss": 0.64539355, + "learning_rate": 5.758388314770408e-07, + "loss": 0.6667695, + "num_input_tokens_seen": 272528805, + "step": 12635, + "time_per_iteration": 2.5444977283477783 + }, + { + "auxiliary_loss_clip": 0.01036783, + "auxiliary_loss_mlp": 0.01037195, + "balance_loss_clip": 1.02989411, + "balance_loss_mlp": 1.02283597, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.8803536213470657, + "language_loss": 0.69059461, + "learning_rate": 5.7556541831317e-07, + "loss": 0.71133441, + "num_input_tokens_seen": 272546655, + "step": 12636, + "time_per_iteration": 2.636467456817627 + }, + { + "auxiliary_loss_clip": 0.01081336, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.03484344, + "balance_loss_mlp": 1.02577376, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.971262139804817, + "language_loss": 0.80919081, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83036876, + "num_input_tokens_seen": 272564010, + "step": 12637, + "time_per_iteration": 2.617504835128784 + }, + { + "auxiliary_loss_clip": 0.0108666, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.03260124, + "balance_loss_mlp": 1.01965582, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 2.2325791295794724, + "language_loss": 0.66134155, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68252003, + "num_input_tokens_seen": 272585840, + "step": 12638, + "time_per_iteration": 4.1550133228302 + }, + { + "auxiliary_loss_clip": 0.01102306, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.0360384, + "balance_loss_mlp": 1.02352524, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.1860370800525843, + "language_loss": 0.65338987, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67477214, + "num_input_tokens_seen": 272602300, + "step": 12639, + "time_per_iteration": 2.514873504638672 + }, + { + "auxiliary_loss_clip": 0.01086451, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.03248549, + "balance_loss_mlp": 1.0176475, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.0933323542468982, + "language_loss": 0.70194626, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72310257, + "num_input_tokens_seen": 272619595, + "step": 12640, + "time_per_iteration": 2.5541279315948486 + }, + { + "auxiliary_loss_clip": 0.01082369, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.03703988, + "balance_loss_mlp": 1.01823413, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.7027148624363266, + "language_loss": 0.67071354, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69183856, + "num_input_tokens_seen": 272638825, + "step": 12641, + "time_per_iteration": 2.5837290287017822 + }, + { + "auxiliary_loss_clip": 0.01087677, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.0326128, + "balance_loss_mlp": 1.02011156, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.2135467199334484, + "language_loss": 0.66517389, + "learning_rate": 5.73926074001422e-07, + "loss": 0.68637174, + "num_input_tokens_seen": 272657240, + "step": 12642, + "time_per_iteration": 2.594597101211548 + }, + { + "auxiliary_loss_clip": 0.0108062, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.03612173, + "balance_loss_mlp": 1.02079511, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.8109199734702905, + "language_loss": 0.75324005, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77436662, + "num_input_tokens_seen": 272677520, + "step": 12643, + "time_per_iteration": 2.648016929626465 + }, + { + "auxiliary_loss_clip": 0.01064475, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.0344677, + "balance_loss_mlp": 1.02024698, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 2.8392966069672894, + "language_loss": 0.78919142, + "learning_rate": 5.733800584019508e-07, + "loss": 0.810166, + "num_input_tokens_seen": 272696770, + "step": 12644, + "time_per_iteration": 2.6340999603271484 + }, + { + "auxiliary_loss_clip": 0.01070392, + "auxiliary_loss_mlp": 0.01026247, + "balance_loss_clip": 1.03178358, + "balance_loss_mlp": 1.01482069, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.5668171319071416, + "language_loss": 0.80622649, + "learning_rate": 5.731071317433957e-07, + "loss": 0.8271929, + "num_input_tokens_seen": 272718340, + "step": 12645, + "time_per_iteration": 2.6643102169036865 + }, + { + "auxiliary_loss_clip": 0.01075506, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.03567743, + "balance_loss_mlp": 1.01979446, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.741260037842477, + "language_loss": 0.73063028, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75169814, + "num_input_tokens_seen": 272739575, + "step": 12646, + "time_per_iteration": 2.6602704524993896 + }, + { + "auxiliary_loss_clip": 0.01087024, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.03311348, + "balance_loss_mlp": 1.02387416, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 1.877624119421159, + "language_loss": 0.67163146, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69284678, + "num_input_tokens_seen": 272758710, + "step": 12647, + "time_per_iteration": 2.545185089111328 + }, + { + "auxiliary_loss_clip": 0.01014391, + "auxiliary_loss_mlp": 0.00999248, + "balance_loss_clip": 1.00446069, + "balance_loss_mlp": 0.99792463, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6860834289249597, + "language_loss": 0.48986953, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51000595, + "num_input_tokens_seen": 272814855, + "step": 12648, + "time_per_iteration": 4.64260458946228 + }, + { + "auxiliary_loss_clip": 0.01085828, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.03346467, + "balance_loss_mlp": 1.02269697, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 1.6080288531861149, + "language_loss": 0.76834297, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78953791, + "num_input_tokens_seen": 272834400, + "step": 12649, + "time_per_iteration": 2.5404036045074463 + }, + { + "auxiliary_loss_clip": 0.01056141, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.03247142, + "balance_loss_mlp": 1.01983345, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.5526222211369243, + "language_loss": 0.68540955, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70628417, + "num_input_tokens_seen": 272854760, + "step": 12650, + "time_per_iteration": 2.6502091884613037 + }, + { + "auxiliary_loss_clip": 0.01014833, + "auxiliary_loss_mlp": 0.01001132, + "balance_loss_clip": 1.00479817, + "balance_loss_mlp": 1.00023162, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.934053785424363, + "language_loss": 0.62702352, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64718318, + "num_input_tokens_seen": 272919030, + "step": 12651, + "time_per_iteration": 3.108750820159912 + }, + { + "auxiliary_loss_clip": 0.01066456, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.03351152, + "balance_loss_mlp": 1.02184677, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.498206955730408, + "language_loss": 0.71331006, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73429859, + "num_input_tokens_seen": 272938925, + "step": 12652, + "time_per_iteration": 2.8012423515319824 + }, + { + "auxiliary_loss_clip": 0.01048192, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.03302848, + "balance_loss_mlp": 1.0237937, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 1.809207161012943, + "language_loss": 0.80448037, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82531524, + "num_input_tokens_seen": 272954945, + "step": 12653, + "time_per_iteration": 2.690100908279419 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.03632975, + "balance_loss_mlp": 1.01784587, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.4988234496371524, + "language_loss": 0.79963624, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82097077, + "num_input_tokens_seen": 272972855, + "step": 12654, + "time_per_iteration": 2.4841082096099854 + }, + { + "auxiliary_loss_clip": 0.01062175, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.03165436, + "balance_loss_mlp": 1.02670336, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.0889004000699174, + "language_loss": 0.79557228, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81658089, + "num_input_tokens_seen": 272989895, + "step": 12655, + "time_per_iteration": 2.5732569694519043 + }, + { + "auxiliary_loss_clip": 0.01082284, + "auxiliary_loss_mlp": 0.01023076, + "balance_loss_clip": 1.03346145, + "balance_loss_mlp": 1.01367068, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.5915555935051238, + "language_loss": 0.68283117, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70388478, + "num_input_tokens_seen": 273011695, + "step": 12656, + "time_per_iteration": 2.6661057472229004 + }, + { + "auxiliary_loss_clip": 0.01088117, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.03021598, + "balance_loss_mlp": 1.02127171, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 2.390076746035515, + "language_loss": 0.73382008, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75503248, + "num_input_tokens_seen": 273028815, + "step": 12657, + "time_per_iteration": 2.5535147190093994 + }, + { + "auxiliary_loss_clip": 0.01006399, + "auxiliary_loss_mlp": 0.01003948, + "balance_loss_clip": 1.00611949, + "balance_loss_mlp": 1.00304222, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8584222117060464, + "language_loss": 0.64909995, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66920346, + "num_input_tokens_seen": 273084080, + "step": 12658, + "time_per_iteration": 3.0871942043304443 + }, + { + "auxiliary_loss_clip": 0.01082449, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.03373396, + "balance_loss_mlp": 1.02156639, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.6131168209210691, + "language_loss": 0.7943359, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81548619, + "num_input_tokens_seen": 273102295, + "step": 12659, + "time_per_iteration": 2.554321527481079 + }, + { + "auxiliary_loss_clip": 0.01086437, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.01663184, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.4512956064244842, + "language_loss": 0.68903339, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71017432, + "num_input_tokens_seen": 273123400, + "step": 12660, + "time_per_iteration": 2.5974180698394775 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.03446209, + "balance_loss_mlp": 1.01869071, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 2.024521811403265, + "language_loss": 0.70430863, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72560191, + "num_input_tokens_seen": 273145150, + "step": 12661, + "time_per_iteration": 4.007274389266968 + }, + { + "auxiliary_loss_clip": 0.01086377, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.03237844, + "balance_loss_mlp": 1.01769519, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.9324640069750874, + "language_loss": 0.83539629, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85654402, + "num_input_tokens_seen": 273165180, + "step": 12662, + "time_per_iteration": 2.5874390602111816 + }, + { + "auxiliary_loss_clip": 0.01075418, + "auxiliary_loss_mlp": 0.01037292, + "balance_loss_clip": 1.03310037, + "balance_loss_mlp": 1.02664042, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.69831836890706, + "language_loss": 0.6893664, + "learning_rate": 5.682037143624505e-07, + "loss": 0.71049351, + "num_input_tokens_seen": 273184005, + "step": 12663, + "time_per_iteration": 2.563978433609009 + }, + { + "auxiliary_loss_clip": 0.01086413, + "auxiliary_loss_mlp": 0.01024668, + "balance_loss_clip": 1.03439808, + "balance_loss_mlp": 1.01412964, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.622575065966134, + "language_loss": 0.7019195, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72303033, + "num_input_tokens_seen": 273203565, + "step": 12664, + "time_per_iteration": 2.586395263671875 + }, + { + "auxiliary_loss_clip": 0.01093704, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.03599536, + "balance_loss_mlp": 1.02335143, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 1.9748494531612353, + "language_loss": 0.7914871, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81278288, + "num_input_tokens_seen": 273221645, + "step": 12665, + "time_per_iteration": 2.625035285949707 + }, + { + "auxiliary_loss_clip": 0.01099022, + "auxiliary_loss_mlp": 0.00749241, + "balance_loss_clip": 1.03679585, + "balance_loss_mlp": 1.00027323, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.8548627918777434, + "language_loss": 0.88238823, + "learning_rate": 5.673881867632959e-07, + "loss": 0.9008708, + "num_input_tokens_seen": 273242040, + "step": 12666, + "time_per_iteration": 2.537827253341675 + }, + { + "auxiliary_loss_clip": 0.01044304, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.03230631, + "balance_loss_mlp": 1.02472734, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 2.524104482985983, + "language_loss": 0.83455276, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85536528, + "num_input_tokens_seen": 273257365, + "step": 12667, + "time_per_iteration": 2.6263062953948975 + }, + { + "auxiliary_loss_clip": 0.01075782, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.03398895, + "balance_loss_mlp": 1.02106869, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.7994760668938548, + "language_loss": 0.78330517, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80437708, + "num_input_tokens_seen": 273274710, + "step": 12668, + "time_per_iteration": 2.569765090942383 + }, + { + "auxiliary_loss_clip": 0.01060816, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.03054404, + "balance_loss_mlp": 1.01595891, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.7491327384916102, + "language_loss": 0.64010715, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66098738, + "num_input_tokens_seen": 273292870, + "step": 12669, + "time_per_iteration": 2.6062510013580322 + }, + { + "auxiliary_loss_clip": 0.01069915, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.03115928, + "balance_loss_mlp": 1.02017975, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 1.9675715718845295, + "language_loss": 0.6634742, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68450409, + "num_input_tokens_seen": 273312375, + "step": 12670, + "time_per_iteration": 2.6020095348358154 + }, + { + "auxiliary_loss_clip": 0.01089612, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.03402805, + "balance_loss_mlp": 1.02318704, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.707441644584625, + "language_loss": 0.72876632, + "learning_rate": 5.660300607310493e-07, + "loss": 0.7500056, + "num_input_tokens_seen": 273332590, + "step": 12671, + "time_per_iteration": 4.071162700653076 + }, + { + "auxiliary_loss_clip": 0.01056394, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.02864861, + "balance_loss_mlp": 1.02145243, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.7090478907556466, + "language_loss": 0.73584783, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75674301, + "num_input_tokens_seen": 273352885, + "step": 12672, + "time_per_iteration": 2.6237869262695312 + }, + { + "auxiliary_loss_clip": 0.0099246, + "auxiliary_loss_mlp": 0.01000814, + "balance_loss_clip": 1.00391531, + "balance_loss_mlp": 0.99985451, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7576068696114319, + "language_loss": 0.56657791, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58651066, + "num_input_tokens_seen": 273411730, + "step": 12673, + "time_per_iteration": 3.1209957599639893 + }, + { + "auxiliary_loss_clip": 0.01089566, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.03549814, + "balance_loss_mlp": 1.01868773, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 7.037597439605295, + "language_loss": 0.74478632, + "learning_rate": 5.652158375447102e-07, + "loss": 0.76598841, + "num_input_tokens_seen": 273430020, + "step": 12674, + "time_per_iteration": 2.6049892902374268 + }, + { + "auxiliary_loss_clip": 0.01063116, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.03033566, + "balance_loss_mlp": 1.02230227, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 2.0575813826752007, + "language_loss": 0.72314095, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74411589, + "num_input_tokens_seen": 273448690, + "step": 12675, + "time_per_iteration": 2.5720653533935547 + }, + { + "auxiliary_loss_clip": 0.01082545, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.03239167, + "balance_loss_mlp": 1.0204277, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.233547775580766, + "language_loss": 0.72794974, + "learning_rate": 5.646732941057936e-07, + "loss": 0.74908423, + "num_input_tokens_seen": 273465190, + "step": 12676, + "time_per_iteration": 2.5422818660736084 + }, + { + "auxiliary_loss_clip": 0.01067183, + "auxiliary_loss_mlp": 0.00749751, + "balance_loss_clip": 1.03461695, + "balance_loss_mlp": 1.00041378, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.412881742805277, + "language_loss": 0.53536534, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55353469, + "num_input_tokens_seen": 273478620, + "step": 12677, + "time_per_iteration": 2.6405484676361084 + }, + { + "auxiliary_loss_clip": 0.01037482, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.03092742, + "balance_loss_mlp": 1.02250898, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 2.22416328076979, + "language_loss": 0.79137981, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81209427, + "num_input_tokens_seen": 273497635, + "step": 12678, + "time_per_iteration": 4.121824026107788 + }, + { + "auxiliary_loss_clip": 0.01060306, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.03084576, + "balance_loss_mlp": 1.02664924, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 1.8603930408187705, + "language_loss": 0.7753613, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79635632, + "num_input_tokens_seen": 273513955, + "step": 12679, + "time_per_iteration": 2.596921443939209 + }, + { + "auxiliary_loss_clip": 0.01087741, + "auxiliary_loss_mlp": 0.01024514, + "balance_loss_clip": 1.0345422, + "balance_loss_mlp": 1.0136658, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.50285232248381, + "language_loss": 0.79892874, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82005125, + "num_input_tokens_seen": 273533970, + "step": 12680, + "time_per_iteration": 2.6054744720458984 + }, + { + "auxiliary_loss_clip": 0.01077999, + "auxiliary_loss_mlp": 0.01024997, + "balance_loss_clip": 1.03362513, + "balance_loss_mlp": 1.01256323, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.7489997231656726, + "language_loss": 0.62492847, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64595842, + "num_input_tokens_seen": 273553090, + "step": 12681, + "time_per_iteration": 2.589212656021118 + }, + { + "auxiliary_loss_clip": 0.01064146, + "auxiliary_loss_mlp": 0.01026971, + "balance_loss_clip": 1.03432178, + "balance_loss_mlp": 1.01648617, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 1.7951443171399084, + "language_loss": 0.76335001, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78426117, + "num_input_tokens_seen": 273572460, + "step": 12682, + "time_per_iteration": 2.7016122341156006 + }, + { + "auxiliary_loss_clip": 0.01074516, + "auxiliary_loss_mlp": 0.0102786, + "balance_loss_clip": 1.03227103, + "balance_loss_mlp": 1.01730466, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.8870089474576521, + "language_loss": 0.68618202, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70720577, + "num_input_tokens_seen": 273592815, + "step": 12683, + "time_per_iteration": 2.699751138687134 + }, + { + "auxiliary_loss_clip": 0.01054775, + "auxiliary_loss_mlp": 0.00749785, + "balance_loss_clip": 1.02939677, + "balance_loss_mlp": 1.00033736, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 1.973098871136129, + "language_loss": 0.83130926, + "learning_rate": 5.625052982818472e-07, + "loss": 0.84935486, + "num_input_tokens_seen": 273611790, + "step": 12684, + "time_per_iteration": 2.690751314163208 + }, + { + "auxiliary_loss_clip": 0.01077943, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.03377903, + "balance_loss_mlp": 1.02499533, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 4.890442134113349, + "language_loss": 0.82698423, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84813434, + "num_input_tokens_seen": 273628340, + "step": 12685, + "time_per_iteration": 2.6635398864746094 + }, + { + "auxiliary_loss_clip": 0.01068513, + "auxiliary_loss_mlp": 0.00749403, + "balance_loss_clip": 1.0336175, + "balance_loss_mlp": 1.00036168, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 3.2604065014334176, + "language_loss": 0.77262521, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79080433, + "num_input_tokens_seen": 273646585, + "step": 12686, + "time_per_iteration": 2.7540671825408936 + }, + { + "auxiliary_loss_clip": 0.01036346, + "auxiliary_loss_mlp": 0.01049271, + "balance_loss_clip": 1.02941632, + "balance_loss_mlp": 1.03412521, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.651109620128179, + "language_loss": 0.72038043, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74123657, + "num_input_tokens_seen": 273665410, + "step": 12687, + "time_per_iteration": 2.745286226272583 + }, + { + "auxiliary_loss_clip": 0.01070265, + "auxiliary_loss_mlp": 0.0104174, + "balance_loss_clip": 1.03090751, + "balance_loss_mlp": 1.02810287, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 2.613672287841946, + "language_loss": 0.64705032, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66817033, + "num_input_tokens_seen": 273683035, + "step": 12688, + "time_per_iteration": 4.168217182159424 + }, + { + "auxiliary_loss_clip": 0.01087986, + "auxiliary_loss_mlp": 0.01026529, + "balance_loss_clip": 1.03472781, + "balance_loss_mlp": 1.01600885, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 2.21165400324701, + "language_loss": 0.70467877, + "learning_rate": 5.611520721310515e-07, + "loss": 0.72582388, + "num_input_tokens_seen": 273700130, + "step": 12689, + "time_per_iteration": 2.5884618759155273 + }, + { + "auxiliary_loss_clip": 0.0106738, + "auxiliary_loss_mlp": 0.01038522, + "balance_loss_clip": 1.0335722, + "balance_loss_mlp": 1.02683973, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.7577248221292472, + "language_loss": 0.69671345, + "learning_rate": 5.608815905436238e-07, + "loss": 0.71777248, + "num_input_tokens_seen": 273720310, + "step": 12690, + "time_per_iteration": 2.6723151206970215 + }, + { + "auxiliary_loss_clip": 0.01070767, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.03207588, + "balance_loss_mlp": 1.02325749, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.663400381105236, + "language_loss": 0.69291413, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71397924, + "num_input_tokens_seen": 273744475, + "step": 12691, + "time_per_iteration": 2.893247127532959 + }, + { + "auxiliary_loss_clip": 0.01082766, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.03278542, + "balance_loss_mlp": 1.0214895, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 2.182938081091753, + "language_loss": 0.81726265, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83840489, + "num_input_tokens_seen": 273764635, + "step": 12692, + "time_per_iteration": 2.5979557037353516 + }, + { + "auxiliary_loss_clip": 0.0106731, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.03688049, + "balance_loss_mlp": 1.01957202, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.0395407987506338, + "language_loss": 0.76857483, + "learning_rate": 5.600704732514438e-07, + "loss": 0.78955185, + "num_input_tokens_seen": 273780115, + "step": 12693, + "time_per_iteration": 2.6354517936706543 + }, + { + "auxiliary_loss_clip": 0.01061767, + "auxiliary_loss_mlp": 0.01029329, + "balance_loss_clip": 1.0342803, + "balance_loss_mlp": 1.01768839, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 3.9198682860975507, + "language_loss": 0.72780973, + "learning_rate": 5.598002100115933e-07, + "loss": 0.74872071, + "num_input_tokens_seen": 273796605, + "step": 12694, + "time_per_iteration": 2.7376816272735596 + }, + { + "auxiliary_loss_clip": 0.01083777, + "auxiliary_loss_mlp": 0.01026148, + "balance_loss_clip": 1.03160691, + "balance_loss_mlp": 1.01512074, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 2.901547134300021, + "language_loss": 0.70512867, + "learning_rate": 5.595300013842625e-07, + "loss": 0.72622794, + "num_input_tokens_seen": 273816515, + "step": 12695, + "time_per_iteration": 2.581399917602539 + }, + { + "auxiliary_loss_clip": 0.01099442, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.03487325, + "balance_loss_mlp": 1.01853192, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.8347743184108452, + "language_loss": 0.72180176, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74309099, + "num_input_tokens_seen": 273837060, + "step": 12696, + "time_per_iteration": 2.5538601875305176 + }, + { + "auxiliary_loss_clip": 0.01038412, + "auxiliary_loss_mlp": 0.01038904, + "balance_loss_clip": 1.03146803, + "balance_loss_mlp": 1.02658987, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.4249581876055433, + "language_loss": 0.71976614, + "learning_rate": 5.589897480081453e-07, + "loss": 0.74053931, + "num_input_tokens_seen": 273853365, + "step": 12697, + "time_per_iteration": 2.7391514778137207 + }, + { + "auxiliary_loss_clip": 0.01065169, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.03467894, + "balance_loss_mlp": 1.01744938, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 1.8544637013538137, + "language_loss": 0.66839361, + "learning_rate": 5.587197032798461e-07, + "loss": 0.68932652, + "num_input_tokens_seen": 273870750, + "step": 12698, + "time_per_iteration": 2.6962389945983887 + }, + { + "auxiliary_loss_clip": 0.01083099, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.03089535, + "balance_loss_mlp": 1.01694608, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.6988989148606126, + "language_loss": 0.72277391, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74388754, + "num_input_tokens_seen": 273890890, + "step": 12699, + "time_per_iteration": 2.5951478481292725 + }, + { + "auxiliary_loss_clip": 0.01074024, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.03131318, + "balance_loss_mlp": 1.0198729, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.7067747406391485, + "language_loss": 0.73133147, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75237703, + "num_input_tokens_seen": 273914015, + "step": 12700, + "time_per_iteration": 2.784522533416748 + }, + { + "auxiliary_loss_clip": 0.0109761, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.03287566, + "balance_loss_mlp": 1.01591873, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 2.015991590672614, + "language_loss": 0.69252312, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71376765, + "num_input_tokens_seen": 273927415, + "step": 12701, + "time_per_iteration": 4.008802175521851 + }, + { + "auxiliary_loss_clip": 0.01070712, + "auxiliary_loss_mlp": 0.01029681, + "balance_loss_clip": 1.03467488, + "balance_loss_mlp": 1.01906574, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 1.7556186563375207, + "language_loss": 0.64536083, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66636473, + "num_input_tokens_seen": 273946690, + "step": 12702, + "time_per_iteration": 2.541956901550293 + }, + { + "auxiliary_loss_clip": 0.01067082, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.03356659, + "balance_loss_mlp": 1.02005124, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.8999977147712293, + "language_loss": 0.65727866, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67826003, + "num_input_tokens_seen": 273966870, + "step": 12703, + "time_per_iteration": 2.6553151607513428 + }, + { + "auxiliary_loss_clip": 0.01073898, + "auxiliary_loss_mlp": 0.01026391, + "balance_loss_clip": 1.03490126, + "balance_loss_mlp": 1.01488686, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.8215188447386481, + "language_loss": 0.83734834, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85835123, + "num_input_tokens_seen": 273986360, + "step": 12704, + "time_per_iteration": 2.5730233192443848 + }, + { + "auxiliary_loss_clip": 0.01076232, + "auxiliary_loss_mlp": 0.01031059, + "balance_loss_clip": 1.0355562, + "balance_loss_mlp": 1.01997864, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.4279137419720347, + "language_loss": 0.68298185, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70405471, + "num_input_tokens_seen": 274009745, + "step": 12705, + "time_per_iteration": 2.60082745552063 + }, + { + "auxiliary_loss_clip": 0.01070062, + "auxiliary_loss_mlp": 0.01026909, + "balance_loss_clip": 1.03213704, + "balance_loss_mlp": 1.01567984, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.747220527219695, + "language_loss": 0.74159604, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76256579, + "num_input_tokens_seen": 274028775, + "step": 12706, + "time_per_iteration": 2.664933919906616 + }, + { + "auxiliary_loss_clip": 0.01079682, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.03226733, + "balance_loss_mlp": 1.01991892, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.9841196970192352, + "language_loss": 0.78365171, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80476409, + "num_input_tokens_seen": 274047520, + "step": 12707, + "time_per_iteration": 2.5111875534057617 + }, + { + "auxiliary_loss_clip": 0.01070467, + "auxiliary_loss_mlp": 0.01028672, + "balance_loss_clip": 1.03257096, + "balance_loss_mlp": 1.01757336, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.7548787182351648, + "language_loss": 0.79971117, + "learning_rate": 5.560222636275751e-07, + "loss": 0.82070261, + "num_input_tokens_seen": 274065350, + "step": 12708, + "time_per_iteration": 2.6308176517486572 + }, + { + "auxiliary_loss_clip": 0.01015934, + "auxiliary_loss_mlp": 0.01001019, + "balance_loss_clip": 1.00803423, + "balance_loss_mlp": 0.99977893, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8169505085748288, + "language_loss": 0.56406713, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58423662, + "num_input_tokens_seen": 274122315, + "step": 12709, + "time_per_iteration": 3.1570146083831787 + }, + { + "auxiliary_loss_clip": 0.01085098, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.03296435, + "balance_loss_mlp": 1.02248025, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.9287073644910464, + "language_loss": 0.63131773, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65251964, + "num_input_tokens_seen": 274140555, + "step": 12710, + "time_per_iteration": 2.58027982711792 + }, + { + "auxiliary_loss_clip": 0.01049482, + "auxiliary_loss_mlp": 0.0074961, + "balance_loss_clip": 1.03172755, + "balance_loss_mlp": 1.00034297, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.266818675901632, + "language_loss": 0.64789569, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66588664, + "num_input_tokens_seen": 274161125, + "step": 12711, + "time_per_iteration": 4.238481283187866 + }, + { + "auxiliary_loss_clip": 0.01076334, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.03223848, + "balance_loss_mlp": 1.02112055, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.5599863283757847, + "language_loss": 0.72894627, + "learning_rate": 5.549448203559293e-07, + "loss": 0.75003189, + "num_input_tokens_seen": 274180835, + "step": 12712, + "time_per_iteration": 2.6062822341918945 + }, + { + "auxiliary_loss_clip": 0.01065305, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.0339787, + "balance_loss_mlp": 1.01928163, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 3.8468960958797815, + "language_loss": 0.80523884, + "learning_rate": 5.546755965040804e-07, + "loss": 0.8261869, + "num_input_tokens_seen": 274201190, + "step": 12713, + "time_per_iteration": 2.628911256790161 + }, + { + "auxiliary_loss_clip": 0.01088865, + "auxiliary_loss_mlp": 0.00749577, + "balance_loss_clip": 1.03320122, + "balance_loss_mlp": 1.00029075, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.5937607140349717, + "language_loss": 0.83473247, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85311687, + "num_input_tokens_seen": 274217595, + "step": 12714, + "time_per_iteration": 2.6135220527648926 + }, + { + "auxiliary_loss_clip": 0.01091692, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.03474295, + "balance_loss_mlp": 1.02350163, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.4980603079378427, + "language_loss": 0.73063153, + "learning_rate": 5.541373132311287e-07, + "loss": 0.7518962, + "num_input_tokens_seen": 274237885, + "step": 12715, + "time_per_iteration": 2.619277000427246 + }, + { + "auxiliary_loss_clip": 0.0105634, + "auxiliary_loss_mlp": 0.01026649, + "balance_loss_clip": 1.03072715, + "balance_loss_mlp": 1.01550877, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 2.11895250996577, + "language_loss": 0.63173962, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65256947, + "num_input_tokens_seen": 274258820, + "step": 12716, + "time_per_iteration": 2.6810507774353027 + }, + { + "auxiliary_loss_clip": 0.01101427, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.03430057, + "balance_loss_mlp": 1.02139497, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 2.7654750116773106, + "language_loss": 0.79751623, + "learning_rate": 5.535992492672068e-07, + "loss": 0.81886685, + "num_input_tokens_seen": 274278835, + "step": 12717, + "time_per_iteration": 2.5356881618499756 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.03570163, + "balance_loss_mlp": 1.02037323, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.902038850935318, + "language_loss": 0.66640323, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68771541, + "num_input_tokens_seen": 274297110, + "step": 12718, + "time_per_iteration": 4.083320617675781 + }, + { + "auxiliary_loss_clip": 0.01052363, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.0315392, + "balance_loss_mlp": 1.02297413, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 1.8886470263858903, + "language_loss": 0.77558672, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79644382, + "num_input_tokens_seen": 274315610, + "step": 12719, + "time_per_iteration": 2.618492364883423 + }, + { + "auxiliary_loss_clip": 0.01099867, + "auxiliary_loss_mlp": 0.0102997, + "balance_loss_clip": 1.0345062, + "balance_loss_mlp": 1.01844275, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 2.107610204193682, + "language_loss": 0.70016897, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72146732, + "num_input_tokens_seen": 274333975, + "step": 12720, + "time_per_iteration": 2.538076400756836 + }, + { + "auxiliary_loss_clip": 0.01065348, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.03317535, + "balance_loss_mlp": 1.02292538, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.9096297621313523, + "language_loss": 0.73902011, + "learning_rate": 5.52523779592875e-07, + "loss": 0.76000983, + "num_input_tokens_seen": 274353695, + "step": 12721, + "time_per_iteration": 2.63812518119812 + }, + { + "auxiliary_loss_clip": 0.01057665, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.03199816, + "balance_loss_mlp": 1.01752532, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.9064155469012483, + "language_loss": 0.73583114, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75669503, + "num_input_tokens_seen": 274371120, + "step": 12722, + "time_per_iteration": 2.6401684284210205 + }, + { + "auxiliary_loss_clip": 0.01086796, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.03323007, + "balance_loss_mlp": 1.0223968, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 2.3758598393984127, + "language_loss": 0.73793209, + "learning_rate": 5.519863740455912e-07, + "loss": 0.75913739, + "num_input_tokens_seen": 274389665, + "step": 12723, + "time_per_iteration": 2.5753960609436035 + }, + { + "auxiliary_loss_clip": 0.01097995, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.03154325, + "balance_loss_mlp": 1.01768446, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 2.1010619591040296, + "language_loss": 0.72641814, + "learning_rate": 5.517177536300881e-07, + "loss": 0.74769545, + "num_input_tokens_seen": 274408750, + "step": 12724, + "time_per_iteration": 2.539820432662964 + }, + { + "auxiliary_loss_clip": 0.01084064, + "auxiliary_loss_mlp": 0.010233, + "balance_loss_clip": 1.032794, + "balance_loss_mlp": 1.01254153, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.8721604704812502, + "language_loss": 0.83909005, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86016369, + "num_input_tokens_seen": 274424600, + "step": 12725, + "time_per_iteration": 2.5259809494018555 + }, + { + "auxiliary_loss_clip": 0.01061196, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.03457046, + "balance_loss_mlp": 1.02132154, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.7780181189231736, + "language_loss": 0.77466118, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79560864, + "num_input_tokens_seen": 274443075, + "step": 12726, + "time_per_iteration": 2.7281360626220703 + }, + { + "auxiliary_loss_clip": 0.01086839, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.03275037, + "balance_loss_mlp": 1.02160347, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.8133789737824673, + "language_loss": 0.70531344, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72650892, + "num_input_tokens_seen": 274463240, + "step": 12727, + "time_per_iteration": 2.620077133178711 + }, + { + "auxiliary_loss_clip": 0.01092797, + "auxiliary_loss_mlp": 0.01027568, + "balance_loss_clip": 1.03137827, + "balance_loss_mlp": 1.01729858, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.7263857176365234, + "language_loss": 0.79812503, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81932873, + "num_input_tokens_seen": 274482750, + "step": 12728, + "time_per_iteration": 4.1009228229522705 + }, + { + "auxiliary_loss_clip": 0.01099676, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.03433251, + "balance_loss_mlp": 1.01633406, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 1.7772538053202942, + "language_loss": 0.55432618, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57560474, + "num_input_tokens_seen": 274503545, + "step": 12729, + "time_per_iteration": 2.6438255310058594 + }, + { + "auxiliary_loss_clip": 0.01076227, + "auxiliary_loss_mlp": 0.00749377, + "balance_loss_clip": 1.03254235, + "balance_loss_mlp": 1.00034845, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 1.7046886395627632, + "language_loss": 0.77666956, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79492557, + "num_input_tokens_seen": 274523825, + "step": 12730, + "time_per_iteration": 2.623826026916504 + }, + { + "auxiliary_loss_clip": 0.01093017, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.03723037, + "balance_loss_mlp": 1.02480376, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 7.50189139206386, + "language_loss": 0.69461662, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71590596, + "num_input_tokens_seen": 274541625, + "step": 12731, + "time_per_iteration": 2.608276844024658 + }, + { + "auxiliary_loss_clip": 0.01100668, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.03459144, + "balance_loss_mlp": 1.01801813, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.1001656295472513, + "language_loss": 0.70299107, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72429252, + "num_input_tokens_seen": 274557580, + "step": 12732, + "time_per_iteration": 2.5734925270080566 + }, + { + "auxiliary_loss_clip": 0.01079464, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.03352594, + "balance_loss_mlp": 1.01742494, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.585294087418201, + "language_loss": 0.78330231, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80438757, + "num_input_tokens_seen": 274578135, + "step": 12733, + "time_per_iteration": 2.672873020172119 + }, + { + "auxiliary_loss_clip": 0.01084625, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.03244281, + "balance_loss_mlp": 1.01839375, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7941491300325432, + "language_loss": 0.77416712, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79530871, + "num_input_tokens_seen": 274595655, + "step": 12734, + "time_per_iteration": 2.5790388584136963 + }, + { + "auxiliary_loss_clip": 0.01072816, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.03339362, + "balance_loss_mlp": 1.01746249, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.674889973135037, + "language_loss": 0.7299847, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75101388, + "num_input_tokens_seen": 274616305, + "step": 12735, + "time_per_iteration": 2.6266067028045654 + }, + { + "auxiliary_loss_clip": 0.01069089, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.03334212, + "balance_loss_mlp": 1.02167654, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.5227927806492032, + "language_loss": 0.72559845, + "learning_rate": 5.484985952378145e-07, + "loss": 0.74662018, + "num_input_tokens_seen": 274638110, + "step": 12736, + "time_per_iteration": 2.6429998874664307 + }, + { + "auxiliary_loss_clip": 0.01089644, + "auxiliary_loss_mlp": 0.00749647, + "balance_loss_clip": 1.03498065, + "balance_loss_mlp": 1.00031972, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 3.9934535040056836, + "language_loss": 0.77569091, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79408383, + "num_input_tokens_seen": 274656565, + "step": 12737, + "time_per_iteration": 2.5452520847320557 + }, + { + "auxiliary_loss_clip": 0.01071815, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.03090632, + "balance_loss_mlp": 1.0205729, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 7.684958495814054, + "language_loss": 0.76895332, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78999317, + "num_input_tokens_seen": 274674215, + "step": 12738, + "time_per_iteration": 2.584130048751831 + }, + { + "auxiliary_loss_clip": 0.01078444, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.03381526, + "balance_loss_mlp": 1.01721764, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.9688878775576795, + "language_loss": 0.62901568, + "learning_rate": 5.476950433777603e-07, + "loss": 0.65009052, + "num_input_tokens_seen": 274693445, + "step": 12739, + "time_per_iteration": 2.6183807849884033 + }, + { + "auxiliary_loss_clip": 0.01099903, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.03457856, + "balance_loss_mlp": 1.02282357, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 1.8377706620552712, + "language_loss": 0.78619885, + "learning_rate": 5.474273028873004e-07, + "loss": 0.80754471, + "num_input_tokens_seen": 274712815, + "step": 12740, + "time_per_iteration": 2.47562313079834 + }, + { + "auxiliary_loss_clip": 0.01087562, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.03242099, + "balance_loss_mlp": 1.01930809, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.635027092714748, + "language_loss": 0.65581596, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67700183, + "num_input_tokens_seen": 274732690, + "step": 12741, + "time_per_iteration": 4.087583780288696 + }, + { + "auxiliary_loss_clip": 0.01069963, + "auxiliary_loss_mlp": 0.01026472, + "balance_loss_clip": 1.03165555, + "balance_loss_mlp": 1.01487303, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.861904014373764, + "language_loss": 0.75550824, + "learning_rate": 5.468919871616386e-07, + "loss": 0.77647257, + "num_input_tokens_seen": 274752460, + "step": 12742, + "time_per_iteration": 2.558533191680908 + }, + { + "auxiliary_loss_clip": 0.01075325, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.03606343, + "balance_loss_mlp": 1.0209204, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.9286807647681075, + "language_loss": 0.76800418, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78907037, + "num_input_tokens_seen": 274773070, + "step": 12743, + "time_per_iteration": 2.6483359336853027 + }, + { + "auxiliary_loss_clip": 0.01071735, + "auxiliary_loss_mlp": 0.01029975, + "balance_loss_clip": 1.0314424, + "balance_loss_mlp": 1.01903176, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 2.0483852324189593, + "language_loss": 0.74780059, + "learning_rate": 5.463568918439805e-07, + "loss": 0.76881766, + "num_input_tokens_seen": 274790220, + "step": 12744, + "time_per_iteration": 2.923393726348877 + }, + { + "auxiliary_loss_clip": 0.01085956, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.03251076, + "balance_loss_mlp": 1.01558995, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.340480079822541, + "language_loss": 0.71205604, + "learning_rate": 5.460894268635181e-07, + "loss": 0.7331897, + "num_input_tokens_seen": 274805095, + "step": 12745, + "time_per_iteration": 2.624206304550171 + }, + { + "auxiliary_loss_clip": 0.01080886, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.03146219, + "balance_loss_mlp": 1.02197278, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 3.651215890728907, + "language_loss": 0.76989388, + "learning_rate": 5.458220170154896e-07, + "loss": 0.79105347, + "num_input_tokens_seen": 274821800, + "step": 12746, + "time_per_iteration": 2.6095874309539795 + }, + { + "auxiliary_loss_clip": 0.00994548, + "auxiliary_loss_mlp": 0.01010488, + "balance_loss_clip": 1.00483561, + "balance_loss_mlp": 1.00944507, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6683089255158632, + "language_loss": 0.56822997, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58828038, + "num_input_tokens_seen": 274886970, + "step": 12747, + "time_per_iteration": 3.2614197731018066 + }, + { + "auxiliary_loss_clip": 0.01095095, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.03311586, + "balance_loss_mlp": 1.0205667, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.4848536300177302, + "language_loss": 0.72115684, + "learning_rate": 5.452873627572956e-07, + "loss": 0.7424103, + "num_input_tokens_seen": 274907240, + "step": 12748, + "time_per_iteration": 2.5973024368286133 + }, + { + "auxiliary_loss_clip": 0.01060974, + "auxiliary_loss_mlp": 0.01030158, + "balance_loss_clip": 1.03178561, + "balance_loss_mlp": 1.01854753, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 3.167958651860978, + "language_loss": 0.69119489, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71210623, + "num_input_tokens_seen": 274924650, + "step": 12749, + "time_per_iteration": 2.6551601886749268 + }, + { + "auxiliary_loss_clip": 0.01086794, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.03229904, + "balance_loss_mlp": 1.0217104, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.60089931375788, + "language_loss": 0.7320295, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75322998, + "num_input_tokens_seen": 274944550, + "step": 12750, + "time_per_iteration": 4.088428974151611 + }, + { + "auxiliary_loss_clip": 0.01083473, + "auxiliary_loss_mlp": 0.01026193, + "balance_loss_clip": 1.03175473, + "balance_loss_mlp": 1.015589, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 2.413904982397523, + "language_loss": 0.75984699, + "learning_rate": 5.444857951167026e-07, + "loss": 0.78094363, + "num_input_tokens_seen": 274961330, + "step": 12751, + "time_per_iteration": 2.571985960006714 + }, + { + "auxiliary_loss_clip": 0.01062624, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.03310573, + "balance_loss_mlp": 1.02105868, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 3.191936321315048, + "language_loss": 0.61259961, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63355148, + "num_input_tokens_seen": 274981655, + "step": 12752, + "time_per_iteration": 2.700002908706665 + }, + { + "auxiliary_loss_clip": 0.0108915, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.03414321, + "balance_loss_mlp": 1.01761222, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 2.306711170293614, + "language_loss": 0.69070262, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71188998, + "num_input_tokens_seen": 274999970, + "step": 12753, + "time_per_iteration": 2.556312322616577 + }, + { + "auxiliary_loss_clip": 0.01090417, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.03531301, + "balance_loss_mlp": 1.02633202, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.303857817945198, + "language_loss": 0.62275094, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64402878, + "num_input_tokens_seen": 275015805, + "step": 12754, + "time_per_iteration": 2.5351428985595703 + }, + { + "auxiliary_loss_clip": 0.01098128, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.03535378, + "balance_loss_mlp": 1.01806462, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 3.9888130234844943, + "language_loss": 0.80121779, + "learning_rate": 5.434178110152401e-07, + "loss": 0.82248867, + "num_input_tokens_seen": 275031810, + "step": 12755, + "time_per_iteration": 2.6033968925476074 + }, + { + "auxiliary_loss_clip": 0.01098253, + "auxiliary_loss_mlp": 0.01027793, + "balance_loss_clip": 1.03496504, + "balance_loss_mlp": 1.01705825, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 4.8258114367236065, + "language_loss": 0.70196199, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72322249, + "num_input_tokens_seen": 275049325, + "step": 12756, + "time_per_iteration": 2.552110433578491 + }, + { + "auxiliary_loss_clip": 0.01086583, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.03306937, + "balance_loss_mlp": 1.0229764, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.5419514029828727, + "language_loss": 0.6984731, + "learning_rate": 5.428841503264706e-07, + "loss": 0.71967113, + "num_input_tokens_seen": 275070865, + "step": 12757, + "time_per_iteration": 2.592784881591797 + }, + { + "auxiliary_loss_clip": 0.01078341, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.03426695, + "balance_loss_mlp": 1.0219934, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 1.8089101086191874, + "language_loss": 0.75609779, + "learning_rate": 5.426174028579955e-07, + "loss": 0.77721953, + "num_input_tokens_seen": 275088015, + "step": 12758, + "time_per_iteration": 4.234838485717773 + }, + { + "auxiliary_loss_clip": 0.01084578, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.0322454, + "balance_loss_mlp": 1.02374327, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 2.030395698622215, + "language_loss": 0.76094133, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78213453, + "num_input_tokens_seen": 275106975, + "step": 12759, + "time_per_iteration": 2.578117847442627 + }, + { + "auxiliary_loss_clip": 0.01073564, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.02990985, + "balance_loss_mlp": 1.01515484, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 2.100614084049689, + "language_loss": 0.6850276, + "learning_rate": 5.420840737234425e-07, + "loss": 0.7060194, + "num_input_tokens_seen": 275129560, + "step": 12760, + "time_per_iteration": 2.720785140991211 + }, + { + "auxiliary_loss_clip": 0.01074468, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.03179073, + "balance_loss_mlp": 1.02081478, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.8573719862757896, + "language_loss": 0.7927078, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81377864, + "num_input_tokens_seen": 275151180, + "step": 12761, + "time_per_iteration": 2.5708792209625244 + }, + { + "auxiliary_loss_clip": 0.0107023, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.03147805, + "balance_loss_mlp": 1.01914191, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.8438556210575439, + "language_loss": 0.65963173, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68063301, + "num_input_tokens_seen": 275170605, + "step": 12762, + "time_per_iteration": 2.5836617946624756 + }, + { + "auxiliary_loss_clip": 0.01088831, + "auxiliary_loss_mlp": 0.01027531, + "balance_loss_clip": 1.03426814, + "balance_loss_mlp": 1.0159024, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.8900090667365097, + "language_loss": 0.74324173, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76440537, + "num_input_tokens_seen": 275188750, + "step": 12763, + "time_per_iteration": 2.539918899536133 + }, + { + "auxiliary_loss_clip": 0.0107678, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.03582442, + "balance_loss_mlp": 1.01854312, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.5788619289183434, + "language_loss": 0.70426798, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72533226, + "num_input_tokens_seen": 275211365, + "step": 12764, + "time_per_iteration": 2.736588478088379 + }, + { + "auxiliary_loss_clip": 0.01087783, + "auxiliary_loss_mlp": 0.01025443, + "balance_loss_clip": 1.034482, + "balance_loss_mlp": 1.01475632, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.6876506520257137, + "language_loss": 0.69182837, + "learning_rate": 5.40751718539491e-07, + "loss": 0.7129606, + "num_input_tokens_seen": 275231670, + "step": 12765, + "time_per_iteration": 2.595700979232788 + }, + { + "auxiliary_loss_clip": 0.01066809, + "auxiliary_loss_mlp": 0.0102597, + "balance_loss_clip": 1.03081536, + "balance_loss_mlp": 1.01627791, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 2.1670946427936686, + "language_loss": 0.61247087, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63339877, + "num_input_tokens_seen": 275249425, + "step": 12766, + "time_per_iteration": 2.5259251594543457 + }, + { + "auxiliary_loss_clip": 0.00998824, + "auxiliary_loss_mlp": 0.01002815, + "balance_loss_clip": 1.01324189, + "balance_loss_mlp": 1.00189114, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7322231275781692, + "language_loss": 0.60790169, + "learning_rate": 5.402191637390803e-07, + "loss": 0.627918, + "num_input_tokens_seen": 275312485, + "step": 12767, + "time_per_iteration": 3.298955202102661 + }, + { + "auxiliary_loss_clip": 0.01074527, + "auxiliary_loss_mlp": 0.01021031, + "balance_loss_clip": 1.03337967, + "balance_loss_mlp": 1.01114893, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.7825561199182167, + "language_loss": 0.69533646, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71629208, + "num_input_tokens_seen": 275331680, + "step": 12768, + "time_per_iteration": 2.572767734527588 + }, + { + "auxiliary_loss_clip": 0.0109221, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.03544116, + "balance_loss_mlp": 1.02303576, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.6294039111791228, + "language_loss": 0.70773751, + "learning_rate": 5.3968683035881e-07, + "loss": 0.72900701, + "num_input_tokens_seen": 275351615, + "step": 12769, + "time_per_iteration": 4.114778757095337 + }, + { + "auxiliary_loss_clip": 0.0108727, + "auxiliary_loss_mlp": 0.01026215, + "balance_loss_clip": 1.03284788, + "balance_loss_mlp": 1.01480031, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 2.2157652853422753, + "language_loss": 0.80387396, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82500887, + "num_input_tokens_seen": 275368815, + "step": 12770, + "time_per_iteration": 2.560291051864624 + }, + { + "auxiliary_loss_clip": 0.01057451, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.03078425, + "balance_loss_mlp": 1.02109349, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.520427019965189, + "language_loss": 0.78409141, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80498898, + "num_input_tokens_seen": 275389345, + "step": 12771, + "time_per_iteration": 2.7203614711761475 + }, + { + "auxiliary_loss_clip": 0.01095886, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.03259158, + "balance_loss_mlp": 1.02240968, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.3291285476309844, + "language_loss": 0.68255484, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70384252, + "num_input_tokens_seen": 275411240, + "step": 12772, + "time_per_iteration": 2.541203260421753 + }, + { + "auxiliary_loss_clip": 0.01080529, + "auxiliary_loss_mlp": 0.01022854, + "balance_loss_clip": 1.03239071, + "balance_loss_mlp": 1.01242352, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.786332888246295, + "language_loss": 0.73551041, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75654423, + "num_input_tokens_seen": 275432010, + "step": 12773, + "time_per_iteration": 2.606616735458374 + }, + { + "auxiliary_loss_clip": 0.01054374, + "auxiliary_loss_mlp": 0.01027932, + "balance_loss_clip": 1.02973127, + "balance_loss_mlp": 1.01780474, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.8250547073907784, + "language_loss": 0.80963206, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83045506, + "num_input_tokens_seen": 275453710, + "step": 12774, + "time_per_iteration": 2.6951258182525635 + }, + { + "auxiliary_loss_clip": 0.0108893, + "auxiliary_loss_mlp": 0.00749241, + "balance_loss_clip": 1.03565001, + "balance_loss_mlp": 1.00026286, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.5500883612383085, + "language_loss": 0.70038801, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71876973, + "num_input_tokens_seen": 275472915, + "step": 12775, + "time_per_iteration": 2.644320011138916 + }, + { + "auxiliary_loss_clip": 0.00990226, + "auxiliary_loss_mlp": 0.00999703, + "balance_loss_clip": 1.00998068, + "balance_loss_mlp": 0.99863052, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.7061237009193791, + "language_loss": 0.56836689, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58826613, + "num_input_tokens_seen": 275534785, + "step": 12776, + "time_per_iteration": 3.286019802093506 + }, + { + "auxiliary_loss_clip": 0.010808, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.03137255, + "balance_loss_mlp": 1.02174544, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 2.03710205767881, + "language_loss": 0.73960119, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76073277, + "num_input_tokens_seen": 275553205, + "step": 12777, + "time_per_iteration": 2.5909972190856934 + }, + { + "auxiliary_loss_clip": 0.01069138, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.03523278, + "balance_loss_mlp": 1.02069557, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.0924705105533725, + "language_loss": 0.70357752, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72458339, + "num_input_tokens_seen": 275571490, + "step": 12778, + "time_per_iteration": 2.6439101696014404 + }, + { + "auxiliary_loss_clip": 0.01084805, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.03484309, + "balance_loss_mlp": 1.0168761, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.826439132031133, + "language_loss": 0.70019817, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72131801, + "num_input_tokens_seen": 275589665, + "step": 12779, + "time_per_iteration": 2.579322338104248 + }, + { + "auxiliary_loss_clip": 0.01074075, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.03338623, + "balance_loss_mlp": 1.01750803, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.663178218973773, + "language_loss": 0.58763969, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60866827, + "num_input_tokens_seen": 275615605, + "step": 12780, + "time_per_iteration": 3.0169200897216797 + }, + { + "auxiliary_loss_clip": 0.01090232, + "auxiliary_loss_mlp": 0.01039856, + "balance_loss_clip": 1.03325081, + "balance_loss_mlp": 1.02702904, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 2.0249288795933484, + "language_loss": 0.68008375, + "learning_rate": 5.364974844194759e-07, + "loss": 0.7013846, + "num_input_tokens_seen": 275634965, + "step": 12781, + "time_per_iteration": 4.06234884262085 + }, + { + "auxiliary_loss_clip": 0.01046108, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.03083992, + "balance_loss_mlp": 1.01771379, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.5557189542577285, + "language_loss": 0.79420936, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81495535, + "num_input_tokens_seen": 275655785, + "step": 12782, + "time_per_iteration": 2.671288251876831 + }, + { + "auxiliary_loss_clip": 0.01068099, + "auxiliary_loss_mlp": 0.0102623, + "balance_loss_clip": 1.03232968, + "balance_loss_mlp": 1.01516724, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 2.281413731585887, + "language_loss": 0.66584027, + "learning_rate": 5.35966703239153e-07, + "loss": 0.68678355, + "num_input_tokens_seen": 275676160, + "step": 12783, + "time_per_iteration": 2.6413638591766357 + }, + { + "auxiliary_loss_clip": 0.01076192, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.03397417, + "balance_loss_mlp": 1.02281308, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 1.5976123083527207, + "language_loss": 0.69070143, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71180791, + "num_input_tokens_seen": 275695660, + "step": 12784, + "time_per_iteration": 2.558753490447998 + }, + { + "auxiliary_loss_clip": 0.01055932, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.03439474, + "balance_loss_mlp": 1.01578426, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 1.811630652256923, + "language_loss": 0.8068518, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82766545, + "num_input_tokens_seen": 275714025, + "step": 12785, + "time_per_iteration": 2.6305902004241943 + }, + { + "auxiliary_loss_clip": 0.01080469, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.03185034, + "balance_loss_mlp": 1.01995611, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.6324426365422653, + "language_loss": 0.76960158, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79073787, + "num_input_tokens_seen": 275737300, + "step": 12786, + "time_per_iteration": 2.7986111640930176 + }, + { + "auxiliary_loss_clip": 0.01096853, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.03333378, + "balance_loss_mlp": 1.01712048, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 3.648583888912778, + "language_loss": 0.58490688, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60615087, + "num_input_tokens_seen": 275757895, + "step": 12787, + "time_per_iteration": 2.649808883666992 + }, + { + "auxiliary_loss_clip": 0.01067306, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.02824759, + "balance_loss_mlp": 1.01834869, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.6593214793444935, + "language_loss": 0.75835741, + "learning_rate": 5.346407219994292e-07, + "loss": 0.77932274, + "num_input_tokens_seen": 275776745, + "step": 12788, + "time_per_iteration": 2.697695255279541 + }, + { + "auxiliary_loss_clip": 0.01055023, + "auxiliary_loss_mlp": 0.00749371, + "balance_loss_clip": 1.03192258, + "balance_loss_mlp": 1.00029731, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.7001715218697724, + "language_loss": 0.67031515, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68835908, + "num_input_tokens_seen": 275797205, + "step": 12789, + "time_per_iteration": 2.763232469558716 + }, + { + "auxiliary_loss_clip": 0.01075553, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.03219211, + "balance_loss_mlp": 1.02033973, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.7617929136726072, + "language_loss": 0.68411303, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70519215, + "num_input_tokens_seen": 275817935, + "step": 12790, + "time_per_iteration": 2.737260341644287 + }, + { + "auxiliary_loss_clip": 0.01073084, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.03345799, + "balance_loss_mlp": 1.01911426, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.5234678558260335, + "language_loss": 0.68449807, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70553023, + "num_input_tokens_seen": 275837145, + "step": 12791, + "time_per_iteration": 4.053051233291626 + }, + { + "auxiliary_loss_clip": 0.01085198, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.03410923, + "balance_loss_mlp": 1.01831031, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.708214542790275, + "language_loss": 0.79589206, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81702793, + "num_input_tokens_seen": 275855705, + "step": 12792, + "time_per_iteration": 2.5014004707336426 + }, + { + "auxiliary_loss_clip": 0.01071267, + "auxiliary_loss_mlp": 0.00749432, + "balance_loss_clip": 1.03643155, + "balance_loss_mlp": 1.00028169, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.720120179554382, + "language_loss": 0.72522545, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74343246, + "num_input_tokens_seen": 275873930, + "step": 12793, + "time_per_iteration": 2.603825330734253 + }, + { + "auxiliary_loss_clip": 0.01060356, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.03472435, + "balance_loss_mlp": 1.02235579, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.547017036840459, + "language_loss": 0.63390106, + "learning_rate": 5.330513783189803e-07, + "loss": 0.6548444, + "num_input_tokens_seen": 275895895, + "step": 12794, + "time_per_iteration": 2.755051612854004 + }, + { + "auxiliary_loss_clip": 0.01069709, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.03195214, + "balance_loss_mlp": 1.02612305, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.5530218414769557, + "language_loss": 0.76309502, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78416854, + "num_input_tokens_seen": 275917825, + "step": 12795, + "time_per_iteration": 2.6193125247955322 + }, + { + "auxiliary_loss_clip": 0.0106204, + "auxiliary_loss_mlp": 0.01025899, + "balance_loss_clip": 1.03068709, + "balance_loss_mlp": 1.01449633, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.6750562270378617, + "language_loss": 0.71681881, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73769814, + "num_input_tokens_seen": 275937890, + "step": 12796, + "time_per_iteration": 2.6795527935028076 + }, + { + "auxiliary_loss_clip": 0.01097169, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.03310883, + "balance_loss_mlp": 1.01689959, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 2.2779758330269724, + "language_loss": 0.64935976, + "learning_rate": 5.32257457305499e-07, + "loss": 0.67060971, + "num_input_tokens_seen": 275954495, + "step": 12797, + "time_per_iteration": 2.4636423587799072 + }, + { + "auxiliary_loss_clip": 0.01057645, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.03057837, + "balance_loss_mlp": 1.02025509, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 1.955218890817367, + "language_loss": 0.91318762, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93408638, + "num_input_tokens_seen": 275972395, + "step": 12798, + "time_per_iteration": 4.236025333404541 + }, + { + "auxiliary_loss_clip": 0.01047613, + "auxiliary_loss_mlp": 0.01025735, + "balance_loss_clip": 1.0317657, + "balance_loss_mlp": 1.01518464, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 2.4049530840144424, + "language_loss": 0.82366121, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84439462, + "num_input_tokens_seen": 275989020, + "step": 12799, + "time_per_iteration": 2.610018730163574 + }, + { + "auxiliary_loss_clip": 0.01043255, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.03256595, + "balance_loss_mlp": 1.01336539, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.1238103920186595, + "language_loss": 0.78126812, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80194902, + "num_input_tokens_seen": 276006525, + "step": 12800, + "time_per_iteration": 2.732374429702759 + }, + { + "auxiliary_loss_clip": 0.01082517, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.03397775, + "balance_loss_mlp": 1.01635814, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.5942756973411774, + "language_loss": 0.83850133, + "learning_rate": 5.31199675198198e-07, + "loss": 0.85961211, + "num_input_tokens_seen": 276027130, + "step": 12801, + "time_per_iteration": 2.633955717086792 + }, + { + "auxiliary_loss_clip": 0.0107429, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.03177369, + "balance_loss_mlp": 1.01688051, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.8319314172417034, + "language_loss": 0.72200316, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74302715, + "num_input_tokens_seen": 276045715, + "step": 12802, + "time_per_iteration": 2.6740353107452393 + }, + { + "auxiliary_loss_clip": 0.01066862, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.03038931, + "balance_loss_mlp": 1.01950908, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.9354715619382714, + "language_loss": 0.75804532, + "learning_rate": 5.306711182867747e-07, + "loss": 0.77901781, + "num_input_tokens_seen": 276065375, + "step": 12803, + "time_per_iteration": 2.7304205894470215 + }, + { + "auxiliary_loss_clip": 0.01007283, + "auxiliary_loss_mlp": 0.01006139, + "balance_loss_clip": 1.00660682, + "balance_loss_mlp": 1.00522709, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7640962299624444, + "language_loss": 0.5586071, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57874125, + "num_input_tokens_seen": 276131405, + "step": 12804, + "time_per_iteration": 3.1661136150360107 + }, + { + "auxiliary_loss_clip": 0.01003656, + "auxiliary_loss_mlp": 0.01005462, + "balance_loss_clip": 1.00381982, + "balance_loss_mlp": 1.00448406, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7439723770396873, + "language_loss": 0.54003453, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56012571, + "num_input_tokens_seen": 276200755, + "step": 12805, + "time_per_iteration": 3.2282190322875977 + }, + { + "auxiliary_loss_clip": 0.01067352, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.03548467, + "balance_loss_mlp": 1.01846409, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 1.7853819122136552, + "language_loss": 0.72910714, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75007534, + "num_input_tokens_seen": 276217880, + "step": 12806, + "time_per_iteration": 2.729832172393799 + }, + { + "auxiliary_loss_clip": 0.01072267, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.03122556, + "balance_loss_mlp": 1.02493179, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 1.8964859057914791, + "language_loss": 0.74989814, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77098203, + "num_input_tokens_seen": 276234810, + "step": 12807, + "time_per_iteration": 2.646892786026001 + }, + { + "auxiliary_loss_clip": 0.01091978, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.03522587, + "balance_loss_mlp": 1.0180521, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.09954391543601, + "language_loss": 0.80195582, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82317221, + "num_input_tokens_seen": 276252850, + "step": 12808, + "time_per_iteration": 2.598400592803955 + }, + { + "auxiliary_loss_clip": 0.01091387, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.03480136, + "balance_loss_mlp": 1.02349663, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 2.4781869303827486, + "language_loss": 0.78715837, + "learning_rate": 5.290867850833718e-07, + "loss": 0.80842334, + "num_input_tokens_seen": 276272525, + "step": 12809, + "time_per_iteration": 4.30185079574585 + }, + { + "auxiliary_loss_clip": 0.01060775, + "auxiliary_loss_mlp": 0.01023943, + "balance_loss_clip": 1.03072488, + "balance_loss_mlp": 1.0140841, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.4419172440322607, + "language_loss": 0.70401978, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72486693, + "num_input_tokens_seen": 276294210, + "step": 12810, + "time_per_iteration": 2.675654649734497 + }, + { + "auxiliary_loss_clip": 0.01069884, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.03041017, + "balance_loss_mlp": 1.02331102, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.494220640542803, + "language_loss": 0.78478801, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80585933, + "num_input_tokens_seen": 276310290, + "step": 12811, + "time_per_iteration": 2.621640682220459 + }, + { + "auxiliary_loss_clip": 0.00993092, + "auxiliary_loss_mlp": 0.01004086, + "balance_loss_clip": 1.00529122, + "balance_loss_mlp": 1.00286996, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8257974468408367, + "language_loss": 0.56669617, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58666795, + "num_input_tokens_seen": 276371715, + "step": 12812, + "time_per_iteration": 3.187873363494873 + }, + { + "auxiliary_loss_clip": 0.01059072, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.03224075, + "balance_loss_mlp": 1.01968956, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.775680913502759, + "language_loss": 0.72021484, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74111629, + "num_input_tokens_seen": 276389895, + "step": 12813, + "time_per_iteration": 2.6716928482055664 + }, + { + "auxiliary_loss_clip": 0.010877, + "auxiliary_loss_mlp": 0.0102428, + "balance_loss_clip": 1.03337693, + "balance_loss_mlp": 1.01284194, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.9582740110655867, + "language_loss": 0.6651175, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68623734, + "num_input_tokens_seen": 276408990, + "step": 12814, + "time_per_iteration": 2.6249730587005615 + }, + { + "auxiliary_loss_clip": 0.01072646, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.03110743, + "balance_loss_mlp": 1.01930833, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.8134544281219136, + "language_loss": 0.65595222, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67697835, + "num_input_tokens_seen": 276428190, + "step": 12815, + "time_per_iteration": 2.6740097999572754 + }, + { + "auxiliary_loss_clip": 0.0108742, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.03361583, + "balance_loss_mlp": 1.01952291, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 4.096639644344564, + "language_loss": 0.65089273, + "learning_rate": 5.272409343590322e-07, + "loss": 0.67207408, + "num_input_tokens_seen": 276446855, + "step": 12816, + "time_per_iteration": 2.5747575759887695 + }, + { + "auxiliary_loss_clip": 0.01087982, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.03439689, + "balance_loss_mlp": 1.02105856, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.2232675239941426, + "language_loss": 0.71853954, + "learning_rate": 5.26977464707133e-07, + "loss": 0.7397393, + "num_input_tokens_seen": 276462000, + "step": 12817, + "time_per_iteration": 2.5476837158203125 + }, + { + "auxiliary_loss_clip": 0.010548, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_clip": 1.03239429, + "balance_loss_mlp": 1.01574314, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 2.142312484217863, + "language_loss": 0.60922492, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63003969, + "num_input_tokens_seen": 276481190, + "step": 12818, + "time_per_iteration": 2.6557765007019043 + }, + { + "auxiliary_loss_clip": 0.01084828, + "auxiliary_loss_mlp": 0.01025606, + "balance_loss_clip": 1.03350377, + "balance_loss_mlp": 1.01568151, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.900944283314109, + "language_loss": 0.67520607, + "learning_rate": 5.264506929848093e-07, + "loss": 0.6963104, + "num_input_tokens_seen": 276499520, + "step": 12819, + "time_per_iteration": 2.5649118423461914 + }, + { + "auxiliary_loss_clip": 0.01100549, + "auxiliary_loss_mlp": 0.01026381, + "balance_loss_clip": 1.03559434, + "balance_loss_mlp": 1.01556873, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.8897521846936791, + "language_loss": 0.57523257, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59650183, + "num_input_tokens_seen": 276519110, + "step": 12820, + "time_per_iteration": 2.546783208847046 + }, + { + "auxiliary_loss_clip": 0.01068659, + "auxiliary_loss_mlp": 0.01025679, + "balance_loss_clip": 1.03308511, + "balance_loss_mlp": 1.01486647, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.7466440710043547, + "language_loss": 0.80981362, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83075696, + "num_input_tokens_seen": 276538805, + "step": 12821, + "time_per_iteration": 2.646023988723755 + }, + { + "auxiliary_loss_clip": 0.01098646, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.03419995, + "balance_loss_mlp": 1.02033567, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.5680218339022995, + "language_loss": 0.68861681, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70992112, + "num_input_tokens_seen": 276554770, + "step": 12822, + "time_per_iteration": 4.033381700515747 + }, + { + "auxiliary_loss_clip": 0.01073716, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.03170824, + "balance_loss_mlp": 1.02176809, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.720911329433315, + "language_loss": 0.72167397, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74273604, + "num_input_tokens_seen": 276574535, + "step": 12823, + "time_per_iteration": 2.6202924251556396 + }, + { + "auxiliary_loss_clip": 0.01092492, + "auxiliary_loss_mlp": 0.01037416, + "balance_loss_clip": 1.03477764, + "balance_loss_mlp": 1.02458942, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.785777388234441, + "language_loss": 0.76533675, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78663582, + "num_input_tokens_seen": 276592925, + "step": 12824, + "time_per_iteration": 2.4987053871154785 + }, + { + "auxiliary_loss_clip": 0.01073083, + "auxiliary_loss_mlp": 0.01026841, + "balance_loss_clip": 1.03306532, + "balance_loss_mlp": 1.01542044, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 1.894320557899728, + "language_loss": 0.72217721, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74317646, + "num_input_tokens_seen": 276610540, + "step": 12825, + "time_per_iteration": 2.5395419597625732 + }, + { + "auxiliary_loss_clip": 0.01095551, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.03462446, + "balance_loss_mlp": 1.01947618, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.5533426259596659, + "language_loss": 0.73791975, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75916499, + "num_input_tokens_seen": 276629200, + "step": 12826, + "time_per_iteration": 2.4741222858428955 + }, + { + "auxiliary_loss_clip": 0.01098401, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.03248763, + "balance_loss_mlp": 1.01780176, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.5623816741815295, + "language_loss": 0.81380606, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83508623, + "num_input_tokens_seen": 276648655, + "step": 12827, + "time_per_iteration": 2.521815776824951 + }, + { + "auxiliary_loss_clip": 0.0102345, + "auxiliary_loss_mlp": 0.00997369, + "balance_loss_clip": 1.00358057, + "balance_loss_mlp": 0.99640906, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8569690700505114, + "language_loss": 0.55131888, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57152706, + "num_input_tokens_seen": 276716500, + "step": 12828, + "time_per_iteration": 3.200343132019043 + }, + { + "auxiliary_loss_clip": 0.01043949, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.02833247, + "balance_loss_mlp": 1.01575077, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 2.052903171953723, + "language_loss": 0.69913203, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71983731, + "num_input_tokens_seen": 276733535, + "step": 12829, + "time_per_iteration": 2.650399684906006 + }, + { + "auxiliary_loss_clip": 0.01070626, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.03628504, + "balance_loss_mlp": 1.02058005, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 2.9141812563701333, + "language_loss": 0.79284215, + "learning_rate": 5.235574458679579e-07, + "loss": 0.81387335, + "num_input_tokens_seen": 276749575, + "step": 12830, + "time_per_iteration": 4.083462476730347 + }, + { + "auxiliary_loss_clip": 0.01088771, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.03350663, + "balance_loss_mlp": 1.01826191, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 5.996211502428029, + "language_loss": 0.77775449, + "learning_rate": 5.232947591245269e-07, + "loss": 0.79894555, + "num_input_tokens_seen": 276769460, + "step": 12831, + "time_per_iteration": 2.5962331295013428 + }, + { + "auxiliary_loss_clip": 0.01066222, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.03059924, + "balance_loss_mlp": 1.02049994, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.4986817526112335, + "language_loss": 0.6104058, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63139892, + "num_input_tokens_seen": 276790820, + "step": 12832, + "time_per_iteration": 2.633854866027832 + }, + { + "auxiliary_loss_clip": 0.01071783, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.032148, + "balance_loss_mlp": 1.02279747, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.673234068466566, + "language_loss": 0.79481566, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81587285, + "num_input_tokens_seen": 276811345, + "step": 12833, + "time_per_iteration": 2.6219842433929443 + }, + { + "auxiliary_loss_clip": 0.00987771, + "auxiliary_loss_mlp": 0.01005963, + "balance_loss_clip": 1.01000059, + "balance_loss_mlp": 1.00512302, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8454436986708863, + "language_loss": 0.55334103, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57327843, + "num_input_tokens_seen": 276870950, + "step": 12834, + "time_per_iteration": 3.238873243331909 + }, + { + "auxiliary_loss_clip": 0.01047678, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.03043997, + "balance_loss_mlp": 1.01637673, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.1824381348881725, + "language_loss": 0.73267901, + "learning_rate": 5.222445722184903e-07, + "loss": 0.75344342, + "num_input_tokens_seen": 276890760, + "step": 12835, + "time_per_iteration": 2.697988271713257 + }, + { + "auxiliary_loss_clip": 0.0106318, + "auxiliary_loss_mlp": 0.00749297, + "balance_loss_clip": 1.03043926, + "balance_loss_mlp": 1.00025177, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 2.336607247238061, + "language_loss": 0.70477867, + "learning_rate": 5.219821655586814e-07, + "loss": 0.72290343, + "num_input_tokens_seen": 276909625, + "step": 12836, + "time_per_iteration": 2.6188995838165283 + }, + { + "auxiliary_loss_clip": 0.01075047, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.03306699, + "balance_loss_mlp": 1.01920557, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.6368299214557958, + "language_loss": 0.59418654, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61523807, + "num_input_tokens_seen": 276930760, + "step": 12837, + "time_per_iteration": 2.8044562339782715 + }, + { + "auxiliary_loss_clip": 0.01019365, + "auxiliary_loss_mlp": 0.01003811, + "balance_loss_clip": 1.01488829, + "balance_loss_mlp": 1.00291657, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.872994633143738, + "language_loss": 0.55820835, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57844007, + "num_input_tokens_seen": 276989580, + "step": 12838, + "time_per_iteration": 4.544138669967651 + }, + { + "auxiliary_loss_clip": 0.01084598, + "auxiliary_loss_mlp": 0.01027166, + "balance_loss_clip": 1.03168368, + "balance_loss_mlp": 1.01683068, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.530360373270066, + "language_loss": 0.69374061, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71485829, + "num_input_tokens_seen": 277005450, + "step": 12839, + "time_per_iteration": 2.582294464111328 + }, + { + "auxiliary_loss_clip": 0.01083986, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.03320837, + "balance_loss_mlp": 1.01636624, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 2.2063267936736106, + "language_loss": 0.80020523, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82131302, + "num_input_tokens_seen": 277023055, + "step": 12840, + "time_per_iteration": 2.582034111022949 + }, + { + "auxiliary_loss_clip": 0.01086098, + "auxiliary_loss_mlp": 0.00749376, + "balance_loss_clip": 1.03297508, + "balance_loss_mlp": 1.00031471, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.8057708242683286, + "language_loss": 0.79617512, + "learning_rate": 5.206709731573402e-07, + "loss": 0.81452984, + "num_input_tokens_seen": 277041150, + "step": 12841, + "time_per_iteration": 2.5668487548828125 + }, + { + "auxiliary_loss_clip": 0.01059964, + "auxiliary_loss_mlp": 0.01026197, + "balance_loss_clip": 1.03242314, + "balance_loss_mlp": 1.01473522, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.829335333478053, + "language_loss": 0.76264119, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78350282, + "num_input_tokens_seen": 277063895, + "step": 12842, + "time_per_iteration": 2.680755138397217 + }, + { + "auxiliary_loss_clip": 0.01042266, + "auxiliary_loss_mlp": 0.00749526, + "balance_loss_clip": 1.03276551, + "balance_loss_mlp": 1.00028145, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 1.4911642208407911, + "language_loss": 0.68281567, + "learning_rate": 5.201468888013445e-07, + "loss": 0.7007336, + "num_input_tokens_seen": 277084045, + "step": 12843, + "time_per_iteration": 2.769730567932129 + }, + { + "auxiliary_loss_clip": 0.01072096, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.02917409, + "balance_loss_mlp": 1.01686954, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 1.8995643485638896, + "language_loss": 0.73552501, + "learning_rate": 5.198849307926465e-07, + "loss": 0.75652325, + "num_input_tokens_seen": 277102625, + "step": 12844, + "time_per_iteration": 2.703962802886963 + }, + { + "auxiliary_loss_clip": 0.01078794, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.03131783, + "balance_loss_mlp": 1.02259922, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.4909494687187335, + "language_loss": 0.71509957, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73623466, + "num_input_tokens_seen": 277123210, + "step": 12845, + "time_per_iteration": 2.692437171936035 + }, + { + "auxiliary_loss_clip": 0.01094141, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.03262901, + "balance_loss_mlp": 1.01781988, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.9993415610199423, + "language_loss": 0.64716953, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66838974, + "num_input_tokens_seen": 277144895, + "step": 12846, + "time_per_iteration": 2.6171493530273438 + }, + { + "auxiliary_loss_clip": 0.01015186, + "auxiliary_loss_mlp": 0.00746652, + "balance_loss_clip": 1.00524652, + "balance_loss_mlp": 0.99986601, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7839101285694305, + "language_loss": 0.61680686, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63442528, + "num_input_tokens_seen": 277205160, + "step": 12847, + "time_per_iteration": 3.062124013900757 + }, + { + "auxiliary_loss_clip": 0.01095098, + "auxiliary_loss_mlp": 0.01027207, + "balance_loss_clip": 1.03128755, + "balance_loss_mlp": 1.01635873, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.705817104589175, + "language_loss": 0.79195452, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81317759, + "num_input_tokens_seen": 277223005, + "step": 12848, + "time_per_iteration": 2.537548065185547 + }, + { + "auxiliary_loss_clip": 0.0105801, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.03122663, + "balance_loss_mlp": 1.02242827, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 2.3470126941377565, + "language_loss": 0.72763979, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74856836, + "num_input_tokens_seen": 277241785, + "step": 12849, + "time_per_iteration": 4.133309602737427 + }, + { + "auxiliary_loss_clip": 0.01096204, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.03269005, + "balance_loss_mlp": 1.01894641, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.9670243502896974, + "language_loss": 0.78526485, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80652809, + "num_input_tokens_seen": 277259050, + "step": 12850, + "time_per_iteration": 2.4849531650543213 + }, + { + "auxiliary_loss_clip": 0.01036539, + "auxiliary_loss_mlp": 0.00749429, + "balance_loss_clip": 1.02600527, + "balance_loss_mlp": 1.00025225, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.4924830466714534, + "language_loss": 0.79794306, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81580269, + "num_input_tokens_seen": 277278235, + "step": 12851, + "time_per_iteration": 2.72499418258667 + }, + { + "auxiliary_loss_clip": 0.01082673, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.01603949, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.40754408039408, + "language_loss": 0.7398603, + "learning_rate": 5.177912880970474e-07, + "loss": 0.76096612, + "num_input_tokens_seen": 277298355, + "step": 12852, + "time_per_iteration": 2.555483102798462 + }, + { + "auxiliary_loss_clip": 0.01095319, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.03184223, + "balance_loss_mlp": 1.0212189, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.9451008006507875, + "language_loss": 0.82147515, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84274447, + "num_input_tokens_seen": 277316095, + "step": 12853, + "time_per_iteration": 2.555678606033325 + }, + { + "auxiliary_loss_clip": 0.01022973, + "auxiliary_loss_mlp": 0.01002656, + "balance_loss_clip": 1.00310695, + "balance_loss_mlp": 1.00171459, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 1.587303017593148, + "language_loss": 0.54482168, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56507796, + "num_input_tokens_seen": 277380130, + "step": 12854, + "time_per_iteration": 3.1321985721588135 + }, + { + "auxiliary_loss_clip": 0.01088034, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.03282249, + "balance_loss_mlp": 1.01469183, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.5195160994319328, + "language_loss": 0.71792781, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73907804, + "num_input_tokens_seen": 277404015, + "step": 12855, + "time_per_iteration": 2.6980061531066895 + }, + { + "auxiliary_loss_clip": 0.01097398, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.03290558, + "balance_loss_mlp": 1.01784396, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 3.2249604787590775, + "language_loss": 0.67442292, + "learning_rate": 5.167458153638254e-07, + "loss": 0.69569325, + "num_input_tokens_seen": 277421375, + "step": 12856, + "time_per_iteration": 2.589616298675537 + }, + { + "auxiliary_loss_clip": 0.01063674, + "auxiliary_loss_mlp": 0.01027035, + "balance_loss_clip": 1.03083098, + "balance_loss_mlp": 1.01641917, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.598589411379275, + "language_loss": 0.79051685, + "learning_rate": 5.164845877686162e-07, + "loss": 0.81142396, + "num_input_tokens_seen": 277440170, + "step": 12857, + "time_per_iteration": 2.7208142280578613 + }, + { + "auxiliary_loss_clip": 0.01051659, + "auxiliary_loss_mlp": 0.00749335, + "balance_loss_clip": 1.03771019, + "balance_loss_mlp": 1.00031543, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 1.8749962518740828, + "language_loss": 0.78213376, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80014372, + "num_input_tokens_seen": 277456880, + "step": 12858, + "time_per_iteration": 2.8079209327697754 + }, + { + "auxiliary_loss_clip": 0.01096981, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.03197694, + "balance_loss_mlp": 1.01588988, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 1.9109175475039823, + "language_loss": 0.76982951, + "learning_rate": 5.159623013532591e-07, + "loss": 0.7910713, + "num_input_tokens_seen": 277475365, + "step": 12859, + "time_per_iteration": 2.619173288345337 + }, + { + "auxiliary_loss_clip": 0.01084264, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.0346415, + "balance_loss_mlp": 1.01733661, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 2.00888905303567, + "language_loss": 0.67743111, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69854367, + "num_input_tokens_seen": 277494975, + "step": 12860, + "time_per_iteration": 2.621025562286377 + }, + { + "auxiliary_loss_clip": 0.01099928, + "auxiliary_loss_mlp": 0.01035028, + "balance_loss_clip": 1.03301287, + "balance_loss_mlp": 1.02331567, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.7164167372479167, + "language_loss": 0.74596238, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76731193, + "num_input_tokens_seen": 277510520, + "step": 12861, + "time_per_iteration": 2.5862326622009277 + }, + { + "auxiliary_loss_clip": 0.01091526, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.0352354, + "balance_loss_mlp": 1.01662612, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.5344886685016532, + "language_loss": 0.7440331, + "learning_rate": 5.15179293816405e-07, + "loss": 0.76523316, + "num_input_tokens_seen": 277530505, + "step": 12862, + "time_per_iteration": 4.086052417755127 + }, + { + "auxiliary_loss_clip": 0.01047332, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.02837312, + "balance_loss_mlp": 1.01961255, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.7576032544474331, + "language_loss": 0.82987857, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85065281, + "num_input_tokens_seen": 277550810, + "step": 12863, + "time_per_iteration": 2.758007049560547 + }, + { + "auxiliary_loss_clip": 0.01096731, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03293133, + "balance_loss_mlp": 1.0187155, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.6922286715178805, + "language_loss": 0.73310089, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75436175, + "num_input_tokens_seen": 277567680, + "step": 12864, + "time_per_iteration": 2.6255595684051514 + }, + { + "auxiliary_loss_clip": 0.0107245, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.03024673, + "balance_loss_mlp": 1.02142239, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.8519556565758855, + "language_loss": 0.82485723, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84590232, + "num_input_tokens_seen": 277588970, + "step": 12865, + "time_per_iteration": 2.6766490936279297 + }, + { + "auxiliary_loss_clip": 0.01103589, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.03623688, + "balance_loss_mlp": 1.02009726, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 2.1184432234583808, + "language_loss": 0.71707267, + "learning_rate": 5.141360720771077e-07, + "loss": 0.73843336, + "num_input_tokens_seen": 277605450, + "step": 12866, + "time_per_iteration": 2.508814573287964 + }, + { + "auxiliary_loss_clip": 0.01053515, + "auxiliary_loss_mlp": 0.00749365, + "balance_loss_clip": 1.03289592, + "balance_loss_mlp": 1.00019443, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 2.340669348907317, + "language_loss": 0.64665216, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66468096, + "num_input_tokens_seen": 277622530, + "step": 12867, + "time_per_iteration": 2.743983507156372 + }, + { + "auxiliary_loss_clip": 0.01085087, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.03330076, + "balance_loss_mlp": 1.02190089, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.692278398891075, + "language_loss": 0.70902538, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73020017, + "num_input_tokens_seen": 277642700, + "step": 12868, + "time_per_iteration": 2.5532875061035156 + }, + { + "auxiliary_loss_clip": 0.0109135, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.03616023, + "balance_loss_mlp": 1.01494431, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.3808046962536475, + "language_loss": 0.78030181, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80147576, + "num_input_tokens_seen": 277660005, + "step": 12869, + "time_per_iteration": 2.5884084701538086 + }, + { + "auxiliary_loss_clip": 0.01084396, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.03222775, + "balance_loss_mlp": 1.01569676, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.5834045711731861, + "language_loss": 0.73500156, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75611305, + "num_input_tokens_seen": 277682890, + "step": 12870, + "time_per_iteration": 4.335521221160889 + }, + { + "auxiliary_loss_clip": 0.01087756, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.03354132, + "balance_loss_mlp": 1.02009058, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 2.6331034336923533, + "language_loss": 0.75891721, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78010929, + "num_input_tokens_seen": 277699330, + "step": 12871, + "time_per_iteration": 2.648024082183838 + }, + { + "auxiliary_loss_clip": 0.01070397, + "auxiliary_loss_mlp": 0.01029495, + "balance_loss_clip": 1.0304811, + "balance_loss_mlp": 1.01912975, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.5716408751160247, + "language_loss": 0.69170022, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71269917, + "num_input_tokens_seen": 277718750, + "step": 12872, + "time_per_iteration": 2.6417906284332275 + }, + { + "auxiliary_loss_clip": 0.01099134, + "auxiliary_loss_mlp": 0.01030992, + "balance_loss_clip": 1.03436542, + "balance_loss_mlp": 1.01873708, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.5251990713310994, + "language_loss": 0.85307503, + "learning_rate": 5.123126036618804e-07, + "loss": 0.8743763, + "num_input_tokens_seen": 277734645, + "step": 12873, + "time_per_iteration": 2.486949920654297 + }, + { + "auxiliary_loss_clip": 0.01099581, + "auxiliary_loss_mlp": 0.01031926, + "balance_loss_clip": 1.0348022, + "balance_loss_mlp": 1.02085698, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 3.0663459606979253, + "language_loss": 0.64896113, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67027617, + "num_input_tokens_seen": 277755535, + "step": 12874, + "time_per_iteration": 2.591625690460205 + }, + { + "auxiliary_loss_clip": 0.01058706, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.03620458, + "balance_loss_mlp": 1.01750541, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.6378691608835, + "language_loss": 0.62412786, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64500165, + "num_input_tokens_seen": 277775585, + "step": 12875, + "time_per_iteration": 2.721031665802002 + }, + { + "auxiliary_loss_clip": 0.01088799, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.03338563, + "balance_loss_mlp": 1.0209341, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 1.8689100014680078, + "language_loss": 0.6549058, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67611837, + "num_input_tokens_seen": 277794795, + "step": 12876, + "time_per_iteration": 2.5796573162078857 + }, + { + "auxiliary_loss_clip": 0.01071341, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03198171, + "balance_loss_mlp": 1.01927423, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.0898655528254495, + "language_loss": 0.71149004, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73250341, + "num_input_tokens_seen": 277813235, + "step": 12877, + "time_per_iteration": 2.5974786281585693 + }, + { + "auxiliary_loss_clip": 0.01055445, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.0306623, + "balance_loss_mlp": 1.02189255, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.677330880475671, + "language_loss": 0.82913691, + "learning_rate": 5.110118184224736e-07, + "loss": 0.8500421, + "num_input_tokens_seen": 277832560, + "step": 12878, + "time_per_iteration": 2.6305291652679443 + }, + { + "auxiliary_loss_clip": 0.01077776, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.03411674, + "balance_loss_mlp": 1.02195311, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 2.0327061515582723, + "language_loss": 0.73526227, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75637907, + "num_input_tokens_seen": 277850120, + "step": 12879, + "time_per_iteration": 4.037978410720825 + }, + { + "auxiliary_loss_clip": 0.01070865, + "auxiliary_loss_mlp": 0.01026795, + "balance_loss_clip": 1.03036654, + "balance_loss_mlp": 1.01587582, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 2.296187997351965, + "language_loss": 0.79544449, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81642109, + "num_input_tokens_seen": 277871020, + "step": 12880, + "time_per_iteration": 2.711566925048828 + }, + { + "auxiliary_loss_clip": 0.01073082, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03259826, + "balance_loss_mlp": 1.02111471, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.7680746259257714, + "language_loss": 0.70048177, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72153383, + "num_input_tokens_seen": 277891525, + "step": 12881, + "time_per_iteration": 2.59812068939209 + }, + { + "auxiliary_loss_clip": 0.01076205, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.0312959, + "balance_loss_mlp": 1.02380443, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 1.8331291191358836, + "language_loss": 0.84200072, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86312133, + "num_input_tokens_seen": 277910425, + "step": 12882, + "time_per_iteration": 2.569749116897583 + }, + { + "auxiliary_loss_clip": 0.01000147, + "auxiliary_loss_mlp": 0.01001528, + "balance_loss_clip": 1.01583374, + "balance_loss_mlp": 1.00047326, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7689058750634872, + "language_loss": 0.60506195, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62507874, + "num_input_tokens_seen": 277972795, + "step": 12883, + "time_per_iteration": 3.2068910598754883 + }, + { + "auxiliary_loss_clip": 0.01048423, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.0302732, + "balance_loss_mlp": 1.01955831, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.8050104625443737, + "language_loss": 0.72578859, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74659741, + "num_input_tokens_seen": 277990675, + "step": 12884, + "time_per_iteration": 2.8534297943115234 + }, + { + "auxiliary_loss_clip": 0.01086687, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.03396845, + "balance_loss_mlp": 1.02128673, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 1.632571284551118, + "language_loss": 0.8099674, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83114809, + "num_input_tokens_seen": 278010050, + "step": 12885, + "time_per_iteration": 2.628556489944458 + }, + { + "auxiliary_loss_clip": 0.01094959, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.03312302, + "balance_loss_mlp": 1.01986599, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 2.069016073750763, + "language_loss": 0.64102024, + "learning_rate": 5.089334986059029e-07, + "loss": 0.6622709, + "num_input_tokens_seen": 278030660, + "step": 12886, + "time_per_iteration": 2.597842216491699 + }, + { + "auxiliary_loss_clip": 0.01058789, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.03029776, + "balance_loss_mlp": 1.02161598, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 2.4692159947846704, + "language_loss": 0.69579351, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71670055, + "num_input_tokens_seen": 278047645, + "step": 12887, + "time_per_iteration": 2.625070333480835 + }, + { + "auxiliary_loss_clip": 0.01084651, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.03192735, + "balance_loss_mlp": 1.01578939, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.6488114662219306, + "language_loss": 0.70595711, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72706604, + "num_input_tokens_seen": 278066170, + "step": 12888, + "time_per_iteration": 2.6692862510681152 + }, + { + "auxiliary_loss_clip": 0.01087585, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.03211653, + "balance_loss_mlp": 1.01898813, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.6717473577691617, + "language_loss": 0.81272793, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83390838, + "num_input_tokens_seen": 278085545, + "step": 12889, + "time_per_iteration": 4.116791725158691 + }, + { + "auxiliary_loss_clip": 0.01061754, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.03108191, + "balance_loss_mlp": 1.01892424, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 2.178092584696889, + "language_loss": 0.79276329, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81367636, + "num_input_tokens_seen": 278102995, + "step": 12890, + "time_per_iteration": 2.6438398361206055 + }, + { + "auxiliary_loss_clip": 0.01071165, + "auxiliary_loss_mlp": 0.01027595, + "balance_loss_clip": 1.03538346, + "balance_loss_mlp": 1.01643074, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 2.948951231011037, + "language_loss": 0.65971106, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68069863, + "num_input_tokens_seen": 278121460, + "step": 12891, + "time_per_iteration": 2.6832897663116455 + }, + { + "auxiliary_loss_clip": 0.01085415, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.03151417, + "balance_loss_mlp": 1.01784158, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.4639537698700018, + "language_loss": 0.78769088, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80883265, + "num_input_tokens_seen": 278143905, + "step": 12892, + "time_per_iteration": 2.671933174133301 + }, + { + "auxiliary_loss_clip": 0.0108848, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.03400314, + "balance_loss_mlp": 1.01859605, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 1.8453769055367266, + "language_loss": 0.67064893, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69183552, + "num_input_tokens_seen": 278160850, + "step": 12893, + "time_per_iteration": 2.532799243927002 + }, + { + "auxiliary_loss_clip": 0.01014981, + "auxiliary_loss_mlp": 0.01003689, + "balance_loss_clip": 1.00479019, + "balance_loss_mlp": 1.0027535, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8249354981810423, + "language_loss": 0.58499062, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60517728, + "num_input_tokens_seen": 278219950, + "step": 12894, + "time_per_iteration": 3.152299642562866 + }, + { + "auxiliary_loss_clip": 0.0107965, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.03435683, + "balance_loss_mlp": 1.02059793, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.531882532918161, + "language_loss": 0.77876085, + "learning_rate": 5.065997144786895e-07, + "loss": 0.79987872, + "num_input_tokens_seen": 278237805, + "step": 12895, + "time_per_iteration": 2.56796932220459 + }, + { + "auxiliary_loss_clip": 0.0105595, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.03192973, + "balance_loss_mlp": 1.01950955, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.6248107890785721, + "language_loss": 0.6738385, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69471824, + "num_input_tokens_seen": 278257660, + "step": 12896, + "time_per_iteration": 2.6764280796051025 + }, + { + "auxiliary_loss_clip": 0.01073322, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.0314765, + "balance_loss_mlp": 1.02463961, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.7845423459136178, + "language_loss": 0.68652344, + "learning_rate": 5.060817184602629e-07, + "loss": 0.70760322, + "num_input_tokens_seen": 278275110, + "step": 12897, + "time_per_iteration": 2.5758442878723145 + }, + { + "auxiliary_loss_clip": 0.01101128, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.03631616, + "balance_loss_mlp": 1.02467203, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.9483797371779303, + "language_loss": 0.74836755, + "learning_rate": 5.058228054204364e-07, + "loss": 0.76974368, + "num_input_tokens_seen": 278293035, + "step": 12898, + "time_per_iteration": 2.576728105545044 + }, + { + "auxiliary_loss_clip": 0.01086494, + "auxiliary_loss_mlp": 0.00749561, + "balance_loss_clip": 1.03253317, + "balance_loss_mlp": 1.00023305, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 2.1735424879604355, + "language_loss": 0.70324236, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72160292, + "num_input_tokens_seen": 278311010, + "step": 12899, + "time_per_iteration": 2.546936273574829 + }, + { + "auxiliary_loss_clip": 0.01060248, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.03041911, + "balance_loss_mlp": 1.01984668, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 1.897078999671869, + "language_loss": 0.74800676, + "learning_rate": 5.053051493286453e-07, + "loss": 0.76892507, + "num_input_tokens_seen": 278329900, + "step": 12900, + "time_per_iteration": 2.726780414581299 + }, + { + "auxiliary_loss_clip": 0.01079239, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.03257072, + "balance_loss_mlp": 1.02563715, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 1.8007145053843667, + "language_loss": 0.77258271, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79373139, + "num_input_tokens_seen": 278349980, + "step": 12901, + "time_per_iteration": 2.785689353942871 + }, + { + "auxiliary_loss_clip": 0.01089153, + "auxiliary_loss_mlp": 0.01032403, + "balance_loss_clip": 1.03613567, + "balance_loss_mlp": 1.02047634, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.5971012346628346, + "language_loss": 0.77095354, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79216909, + "num_input_tokens_seen": 278372485, + "step": 12902, + "time_per_iteration": 2.5712993144989014 + }, + { + "auxiliary_loss_clip": 0.01086948, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.03282523, + "balance_loss_mlp": 1.01801586, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.9284125094874676, + "language_loss": 0.73588276, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75704157, + "num_input_tokens_seen": 278391660, + "step": 12903, + "time_per_iteration": 4.126187086105347 + }, + { + "auxiliary_loss_clip": 0.01076994, + "auxiliary_loss_mlp": 0.01024278, + "balance_loss_clip": 1.03453338, + "balance_loss_mlp": 1.01304293, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 2.497570959344915, + "language_loss": 0.7627902, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78380299, + "num_input_tokens_seen": 278409125, + "step": 12904, + "time_per_iteration": 2.5768792629241943 + }, + { + "auxiliary_loss_clip": 0.01094566, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.0332799, + "balance_loss_mlp": 1.01484144, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.4499490817589096, + "language_loss": 0.68266559, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70386267, + "num_input_tokens_seen": 278429450, + "step": 12905, + "time_per_iteration": 2.5071115493774414 + }, + { + "auxiliary_loss_clip": 0.01084679, + "auxiliary_loss_mlp": 0.00749232, + "balance_loss_clip": 1.03569937, + "balance_loss_mlp": 1.00016236, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.767217109356477, + "language_loss": 0.67477477, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69311392, + "num_input_tokens_seen": 278449925, + "step": 12906, + "time_per_iteration": 2.5667619705200195 + }, + { + "auxiliary_loss_clip": 0.01053746, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.02884328, + "balance_loss_mlp": 1.01649952, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 2.168114109329765, + "language_loss": 0.81121159, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83202803, + "num_input_tokens_seen": 278467255, + "step": 12907, + "time_per_iteration": 2.5973198413848877 + }, + { + "auxiliary_loss_clip": 0.01080813, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.03233826, + "balance_loss_mlp": 1.02049994, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.798149568511232, + "language_loss": 0.66967237, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69079304, + "num_input_tokens_seen": 278484250, + "step": 12908, + "time_per_iteration": 2.4738106727600098 + }, + { + "auxiliary_loss_clip": 0.01064206, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.03157377, + "balance_loss_mlp": 1.02354991, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.6561318965469245, + "language_loss": 0.70203412, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72302604, + "num_input_tokens_seen": 278502740, + "step": 12909, + "time_per_iteration": 2.583042621612549 + }, + { + "auxiliary_loss_clip": 0.01086212, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.03426385, + "balance_loss_mlp": 1.01949096, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.7701135851220924, + "language_loss": 0.68031067, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70147049, + "num_input_tokens_seen": 278523890, + "step": 12910, + "time_per_iteration": 4.012547492980957 + }, + { + "auxiliary_loss_clip": 0.01050917, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.03205061, + "balance_loss_mlp": 1.02099562, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 3.5798550536763525, + "language_loss": 0.71752334, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73834848, + "num_input_tokens_seen": 278543185, + "step": 12911, + "time_per_iteration": 2.610240936279297 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.00749343, + "balance_loss_clip": 1.03559482, + "balance_loss_mlp": 1.00018764, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.288949406523426, + "language_loss": 0.63237661, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65088236, + "num_input_tokens_seen": 278559220, + "step": 12912, + "time_per_iteration": 2.5899970531463623 + }, + { + "auxiliary_loss_clip": 0.00993302, + "auxiliary_loss_mlp": 0.01003574, + "balance_loss_clip": 1.00397468, + "balance_loss_mlp": 1.00261998, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.8528991832169801, + "language_loss": 0.53260708, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55257589, + "num_input_tokens_seen": 278618185, + "step": 12913, + "time_per_iteration": 3.2211334705352783 + }, + { + "auxiliary_loss_clip": 0.01078763, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.03504384, + "balance_loss_mlp": 1.01785851, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 1.8876283053697729, + "language_loss": 0.61856055, + "learning_rate": 5.016879091243338e-07, + "loss": 0.63964182, + "num_input_tokens_seen": 278636210, + "step": 12914, + "time_per_iteration": 2.6114675998687744 + }, + { + "auxiliary_loss_clip": 0.01074672, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.03184569, + "balance_loss_mlp": 1.01856947, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.6452588179051038, + "language_loss": 0.82238895, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84343278, + "num_input_tokens_seen": 278653305, + "step": 12915, + "time_per_iteration": 2.6339728832244873 + }, + { + "auxiliary_loss_clip": 0.01086638, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.03356993, + "balance_loss_mlp": 1.02101088, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.8037797747626005, + "language_loss": 0.74684846, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76804507, + "num_input_tokens_seen": 278671850, + "step": 12916, + "time_per_iteration": 2.604682683944702 + }, + { + "auxiliary_loss_clip": 0.01037823, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.02917707, + "balance_loss_mlp": 1.01835847, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.985057073991622, + "language_loss": 0.65470874, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67539477, + "num_input_tokens_seen": 278697860, + "step": 12917, + "time_per_iteration": 2.963444948196411 + }, + { + "auxiliary_loss_clip": 0.01086242, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.03165233, + "balance_loss_mlp": 1.01988637, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.8453213199256526, + "language_loss": 0.64398754, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66515732, + "num_input_tokens_seen": 278720655, + "step": 12918, + "time_per_iteration": 2.745122194290161 + }, + { + "auxiliary_loss_clip": 0.01097156, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.0341773, + "balance_loss_mlp": 1.02145958, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.8282127798636245, + "language_loss": 0.73541224, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75670099, + "num_input_tokens_seen": 278737375, + "step": 12919, + "time_per_iteration": 3.9242358207702637 + }, + { + "auxiliary_loss_clip": 0.01055808, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.03301144, + "balance_loss_mlp": 1.01729536, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 2.6190846978102695, + "language_loss": 0.7914685, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81231564, + "num_input_tokens_seen": 278756510, + "step": 12920, + "time_per_iteration": 2.6698360443115234 + }, + { + "auxiliary_loss_clip": 0.0108921, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.03488374, + "balance_loss_mlp": 1.01745796, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.7105535983635016, + "language_loss": 0.70748794, + "learning_rate": 4.998834633291829e-07, + "loss": 0.72866547, + "num_input_tokens_seen": 278775410, + "step": 12921, + "time_per_iteration": 2.6142971515655518 + }, + { + "auxiliary_loss_clip": 0.01093007, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.03550291, + "balance_loss_mlp": 1.01844072, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.8830027283622832, + "language_loss": 0.76519823, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78643584, + "num_input_tokens_seen": 278794260, + "step": 12922, + "time_per_iteration": 2.553831100463867 + }, + { + "auxiliary_loss_clip": 0.01047846, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.03202558, + "balance_loss_mlp": 1.02231383, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.704694606250024, + "language_loss": 0.80076814, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82158446, + "num_input_tokens_seen": 278813290, + "step": 12923, + "time_per_iteration": 2.7107839584350586 + }, + { + "auxiliary_loss_clip": 0.01061849, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.03530908, + "balance_loss_mlp": 1.01915812, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 2.8710209934713378, + "language_loss": 0.92117524, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94209176, + "num_input_tokens_seen": 278830610, + "step": 12924, + "time_per_iteration": 2.6340928077697754 + }, + { + "auxiliary_loss_clip": 0.01086167, + "auxiliary_loss_mlp": 0.01026476, + "balance_loss_clip": 1.03231812, + "balance_loss_mlp": 1.01537752, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 2.98070328206182, + "language_loss": 0.65987456, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68100095, + "num_input_tokens_seen": 278849530, + "step": 12925, + "time_per_iteration": 2.5597527027130127 + }, + { + "auxiliary_loss_clip": 0.01065475, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.03320813, + "balance_loss_mlp": 1.02176023, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.9693941161996535, + "language_loss": 0.71898067, + "learning_rate": 4.985962798170314e-07, + "loss": 0.73996729, + "num_input_tokens_seen": 278869005, + "step": 12926, + "time_per_iteration": 2.8117525577545166 + }, + { + "auxiliary_loss_clip": 0.01090534, + "auxiliary_loss_mlp": 0.01026986, + "balance_loss_clip": 1.0345279, + "balance_loss_mlp": 1.01514864, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 2.530178657595197, + "language_loss": 0.65619612, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67737138, + "num_input_tokens_seen": 278888790, + "step": 12927, + "time_per_iteration": 2.6328370571136475 + }, + { + "auxiliary_loss_clip": 0.01075701, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.0330627, + "balance_loss_mlp": 1.02236152, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 2.8770913628511012, + "language_loss": 0.72417158, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74526942, + "num_input_tokens_seen": 278908150, + "step": 12928, + "time_per_iteration": 2.584130048751831 + }, + { + "auxiliary_loss_clip": 0.01055616, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03272772, + "balance_loss_mlp": 1.01673412, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.6545895069720389, + "language_loss": 0.74455214, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76538974, + "num_input_tokens_seen": 278927425, + "step": 12929, + "time_per_iteration": 4.181232690811157 + }, + { + "auxiliary_loss_clip": 0.01061698, + "auxiliary_loss_mlp": 0.01025912, + "balance_loss_clip": 1.03310108, + "balance_loss_mlp": 1.01480794, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 2.3680479442278672, + "language_loss": 0.77523768, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79611385, + "num_input_tokens_seen": 278946475, + "step": 12930, + "time_per_iteration": 2.6481690406799316 + }, + { + "auxiliary_loss_clip": 0.01100005, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.03650904, + "balance_loss_mlp": 1.01608598, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 2.3633889713904077, + "language_loss": 0.79733443, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81861025, + "num_input_tokens_seen": 278964345, + "step": 12931, + "time_per_iteration": 2.503070592880249 + }, + { + "auxiliary_loss_clip": 0.00997439, + "auxiliary_loss_mlp": 0.01010886, + "balance_loss_clip": 1.01761305, + "balance_loss_mlp": 1.00974739, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8388270193041414, + "language_loss": 0.597188, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61727118, + "num_input_tokens_seen": 279022380, + "step": 12932, + "time_per_iteration": 3.210470676422119 + }, + { + "auxiliary_loss_clip": 0.01087129, + "auxiliary_loss_mlp": 0.01030486, + "balance_loss_clip": 1.03403473, + "balance_loss_mlp": 1.02001905, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 2.1945282298959605, + "language_loss": 0.76285517, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78403133, + "num_input_tokens_seen": 279044275, + "step": 12933, + "time_per_iteration": 2.5896365642547607 + }, + { + "auxiliary_loss_clip": 0.01072695, + "auxiliary_loss_mlp": 0.01034113, + "balance_loss_clip": 1.0342989, + "balance_loss_mlp": 1.02181029, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.015586686143578, + "language_loss": 0.73035061, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75141871, + "num_input_tokens_seen": 279063375, + "step": 12934, + "time_per_iteration": 2.630464553833008 + }, + { + "auxiliary_loss_clip": 0.01053893, + "auxiliary_loss_mlp": 0.01027233, + "balance_loss_clip": 1.031775, + "balance_loss_mlp": 1.01551521, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 7.462724888564509, + "language_loss": 0.70648295, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72729421, + "num_input_tokens_seen": 279082680, + "step": 12935, + "time_per_iteration": 2.6265385150909424 + }, + { + "auxiliary_loss_clip": 0.01072785, + "auxiliary_loss_mlp": 0.00749477, + "balance_loss_clip": 1.03363943, + "balance_loss_mlp": 1.00021958, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.5855205259011729, + "language_loss": 0.83771574, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85593837, + "num_input_tokens_seen": 279099805, + "step": 12936, + "time_per_iteration": 2.57123064994812 + }, + { + "auxiliary_loss_clip": 0.01092512, + "auxiliary_loss_mlp": 0.01028952, + "balance_loss_clip": 1.03465438, + "balance_loss_mlp": 1.01804447, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 1.9450994400885515, + "language_loss": 0.67599845, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69721311, + "num_input_tokens_seen": 279117975, + "step": 12937, + "time_per_iteration": 2.5135984420776367 + }, + { + "auxiliary_loss_clip": 0.01100314, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.03531158, + "balance_loss_mlp": 1.01810551, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.464584986783015, + "language_loss": 0.87162936, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89292264, + "num_input_tokens_seen": 279137255, + "step": 12938, + "time_per_iteration": 2.4969801902770996 + }, + { + "auxiliary_loss_clip": 0.01089316, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.03475344, + "balance_loss_mlp": 1.01670969, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 2.06580078416173, + "language_loss": 0.85142863, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87260365, + "num_input_tokens_seen": 279154500, + "step": 12939, + "time_per_iteration": 2.480983257293701 + }, + { + "auxiliary_loss_clip": 0.01095044, + "auxiliary_loss_mlp": 0.00749223, + "balance_loss_clip": 1.03376305, + "balance_loss_mlp": 1.00018752, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.900274642616951, + "language_loss": 0.69033033, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70877302, + "num_input_tokens_seen": 279173635, + "step": 12940, + "time_per_iteration": 2.4959118366241455 + }, + { + "auxiliary_loss_clip": 0.01064685, + "auxiliary_loss_mlp": 0.01025546, + "balance_loss_clip": 1.03259635, + "balance_loss_mlp": 1.01525855, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.5211524808100967, + "language_loss": 0.77786338, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79876572, + "num_input_tokens_seen": 279194430, + "step": 12941, + "time_per_iteration": 2.6354455947875977 + }, + { + "auxiliary_loss_clip": 0.01090167, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.02320087, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.465793965657406, + "language_loss": 0.73257756, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75384104, + "num_input_tokens_seen": 279212920, + "step": 12942, + "time_per_iteration": 2.620897054672241 + }, + { + "auxiliary_loss_clip": 0.01041212, + "auxiliary_loss_mlp": 0.01033343, + "balance_loss_clip": 1.03033876, + "balance_loss_mlp": 1.0213685, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 2.059760116321012, + "language_loss": 0.68124849, + "learning_rate": 4.942305097079751e-07, + "loss": 0.70199406, + "num_input_tokens_seen": 279232310, + "step": 12943, + "time_per_iteration": 4.155431270599365 + }, + { + "auxiliary_loss_clip": 0.00991574, + "auxiliary_loss_mlp": 0.0101254, + "balance_loss_clip": 1.00324702, + "balance_loss_mlp": 1.01141357, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7770568690263503, + "language_loss": 0.58521634, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60525745, + "num_input_tokens_seen": 279295375, + "step": 12944, + "time_per_iteration": 3.3236002922058105 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.0343163, + "balance_loss_mlp": 1.02264845, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 1.8316089625733585, + "language_loss": 0.67503142, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69639009, + "num_input_tokens_seen": 279313660, + "step": 12945, + "time_per_iteration": 2.628552198410034 + }, + { + "auxiliary_loss_clip": 0.01082772, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.03319836, + "balance_loss_mlp": 1.02061713, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 5.409838908769421, + "language_loss": 0.69035518, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71150023, + "num_input_tokens_seen": 279334495, + "step": 12946, + "time_per_iteration": 2.634777545928955 + }, + { + "auxiliary_loss_clip": 0.0109853, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.03455961, + "balance_loss_mlp": 1.01775885, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 1.9512029485206963, + "language_loss": 0.6516844, + "learning_rate": 4.932056660665689e-07, + "loss": 0.6729607, + "num_input_tokens_seen": 279352985, + "step": 12947, + "time_per_iteration": 2.5095038414001465 + }, + { + "auxiliary_loss_clip": 0.01029534, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.02755737, + "balance_loss_mlp": 1.02646494, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 1.8523736897834497, + "language_loss": 0.65226424, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67295575, + "num_input_tokens_seen": 279371360, + "step": 12948, + "time_per_iteration": 2.6986336708068848 + }, + { + "auxiliary_loss_clip": 0.0109871, + "auxiliary_loss_mlp": 0.01029468, + "balance_loss_clip": 1.03459716, + "balance_loss_mlp": 1.0181787, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 2.566335644793116, + "language_loss": 0.75226671, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77354848, + "num_input_tokens_seen": 279389400, + "step": 12949, + "time_per_iteration": 2.514484405517578 + }, + { + "auxiliary_loss_clip": 0.01102448, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.03533673, + "balance_loss_mlp": 1.02155018, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.4801575566009164, + "language_loss": 0.68790078, + "learning_rate": 4.924376332483202e-07, + "loss": 0.70925272, + "num_input_tokens_seen": 279409715, + "step": 12950, + "time_per_iteration": 2.5290348529815674 + }, + { + "auxiliary_loss_clip": 0.01083299, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.03421354, + "balance_loss_mlp": 1.0173105, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.8384692601355723, + "language_loss": 0.72025704, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74137235, + "num_input_tokens_seen": 279427705, + "step": 12951, + "time_per_iteration": 4.0956361293792725 + }, + { + "auxiliary_loss_clip": 0.01070397, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.03083146, + "balance_loss_mlp": 1.01729131, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 2.853035627314933, + "language_loss": 0.65882367, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67981166, + "num_input_tokens_seen": 279448215, + "step": 12952, + "time_per_iteration": 2.5999984741210938 + }, + { + "auxiliary_loss_clip": 0.0106736, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.0300231, + "balance_loss_mlp": 1.01809859, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.4796819365433975, + "language_loss": 0.81095517, + "learning_rate": 4.916701149323022e-07, + "loss": 0.8319118, + "num_input_tokens_seen": 279466260, + "step": 12953, + "time_per_iteration": 2.5907957553863525 + }, + { + "auxiliary_loss_clip": 0.01104228, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03787589, + "balance_loss_mlp": 1.01803589, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 2.145950724559857, + "language_loss": 0.76386189, + "learning_rate": 4.91414389872737e-07, + "loss": 0.78519773, + "num_input_tokens_seen": 279484520, + "step": 12954, + "time_per_iteration": 2.4945123195648193 + }, + { + "auxiliary_loss_clip": 0.01083885, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.03182077, + "balance_loss_mlp": 1.01871181, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.5758224195162978, + "language_loss": 0.72982967, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7509616, + "num_input_tokens_seen": 279503130, + "step": 12955, + "time_per_iteration": 2.5550739765167236 + }, + { + "auxiliary_loss_clip": 0.0106325, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.03151095, + "balance_loss_mlp": 1.02217209, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.5041530581699913, + "language_loss": 0.68873155, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70970136, + "num_input_tokens_seen": 279521930, + "step": 12956, + "time_per_iteration": 2.6235463619232178 + }, + { + "auxiliary_loss_clip": 0.01059125, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.03041875, + "balance_loss_mlp": 1.01923835, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 2.0302580577192617, + "language_loss": 0.75755227, + "learning_rate": 4.906475579671252e-07, + "loss": 0.77844024, + "num_input_tokens_seen": 279542375, + "step": 12957, + "time_per_iteration": 2.724123001098633 + }, + { + "auxiliary_loss_clip": 0.0102877, + "auxiliary_loss_mlp": 0.01025742, + "balance_loss_clip": 1.03327858, + "balance_loss_mlp": 1.01473927, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 2.1596859457361286, + "language_loss": 0.77437258, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79491776, + "num_input_tokens_seen": 279561885, + "step": 12958, + "time_per_iteration": 2.860201358795166 + }, + { + "auxiliary_loss_clip": 0.01082571, + "auxiliary_loss_mlp": 0.01041066, + "balance_loss_clip": 1.03270733, + "balance_loss_mlp": 1.02739906, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 1.9905165549104928, + "language_loss": 0.71418411, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73542047, + "num_input_tokens_seen": 279579965, + "step": 12959, + "time_per_iteration": 4.183340787887573 + }, + { + "auxiliary_loss_clip": 0.01079932, + "auxiliary_loss_mlp": 0.0074941, + "balance_loss_clip": 1.03325987, + "balance_loss_mlp": 1.00027978, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.6425690563100888, + "language_loss": 0.77999806, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79829156, + "num_input_tokens_seen": 279599030, + "step": 12960, + "time_per_iteration": 2.5994625091552734 + }, + { + "auxiliary_loss_clip": 0.01090721, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.03495383, + "balance_loss_mlp": 1.02369952, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 6.0991334619891235, + "language_loss": 0.74781632, + "learning_rate": 4.896259167586385e-07, + "loss": 0.76908118, + "num_input_tokens_seen": 279614400, + "step": 12961, + "time_per_iteration": 2.549751043319702 + }, + { + "auxiliary_loss_clip": 0.01072693, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.03346646, + "balance_loss_mlp": 1.02155232, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.6614462279318403, + "language_loss": 0.73786777, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75891554, + "num_input_tokens_seen": 279633745, + "step": 12962, + "time_per_iteration": 2.6206464767456055 + }, + { + "auxiliary_loss_clip": 0.01089185, + "auxiliary_loss_mlp": 0.0102645, + "balance_loss_clip": 1.03582358, + "balance_loss_mlp": 1.01563776, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.842251394750079, + "language_loss": 0.69591177, + "learning_rate": 4.891154397568795e-07, + "loss": 0.7170682, + "num_input_tokens_seen": 279651165, + "step": 12963, + "time_per_iteration": 2.527585029602051 + }, + { + "auxiliary_loss_clip": 0.01088133, + "auxiliary_loss_mlp": 0.00749201, + "balance_loss_clip": 1.03520179, + "balance_loss_mlp": 1.00020576, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.9390427816944082, + "language_loss": 0.63374317, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65211642, + "num_input_tokens_seen": 279671175, + "step": 12964, + "time_per_iteration": 2.602482318878174 + }, + { + "auxiliary_loss_clip": 0.01078878, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.03395677, + "balance_loss_mlp": 1.02045369, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.9487555644808197, + "language_loss": 0.76535475, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78645474, + "num_input_tokens_seen": 279688675, + "step": 12965, + "time_per_iteration": 2.602778196334839 + }, + { + "auxiliary_loss_clip": 0.01074645, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.03059649, + "balance_loss_mlp": 1.02166533, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 1.4534426918578076, + "language_loss": 0.72568077, + "learning_rate": 4.883501539751289e-07, + "loss": 0.74675894, + "num_input_tokens_seen": 279710245, + "step": 12966, + "time_per_iteration": 2.577038526535034 + }, + { + "auxiliary_loss_clip": 0.01075501, + "auxiliary_loss_mlp": 0.00749078, + "balance_loss_clip": 1.0348506, + "balance_loss_mlp": 1.00022984, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.464395277995515, + "language_loss": 0.74141729, + "learning_rate": 4.880951733454768e-07, + "loss": 0.75966311, + "num_input_tokens_seen": 279729045, + "step": 12967, + "time_per_iteration": 2.5859110355377197 + }, + { + "auxiliary_loss_clip": 0.01099944, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.03532088, + "balance_loss_mlp": 1.01800466, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 6.247511193330654, + "language_loss": 0.71667695, + "learning_rate": 4.878402500474073e-07, + "loss": 0.73796904, + "num_input_tokens_seen": 279748350, + "step": 12968, + "time_per_iteration": 2.5102698802948 + }, + { + "auxiliary_loss_clip": 0.0107418, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.03424442, + "balance_loss_mlp": 1.02228951, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 1.8731076337090198, + "language_loss": 0.60881972, + "learning_rate": 4.875853840905874e-07, + "loss": 0.62989485, + "num_input_tokens_seen": 279765620, + "step": 12969, + "time_per_iteration": 2.631382942199707 + }, + { + "auxiliary_loss_clip": 0.01079954, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.03303552, + "balance_loss_mlp": 1.02091753, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.9935487727598984, + "language_loss": 0.70053065, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72164184, + "num_input_tokens_seen": 279782485, + "step": 12970, + "time_per_iteration": 4.010723829269409 + }, + { + "auxiliary_loss_clip": 0.01061902, + "auxiliary_loss_mlp": 0.00749413, + "balance_loss_clip": 1.0354116, + "balance_loss_mlp": 1.00024366, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.7135524618349818, + "language_loss": 0.72109675, + "learning_rate": 4.870758242393507e-07, + "loss": 0.73920989, + "num_input_tokens_seen": 279804170, + "step": 12971, + "time_per_iteration": 2.714226245880127 + }, + { + "auxiliary_loss_clip": 0.01050536, + "auxiliary_loss_mlp": 0.01032134, + "balance_loss_clip": 1.03000593, + "balance_loss_mlp": 1.02010596, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.6376936412453789, + "language_loss": 0.74114209, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76196873, + "num_input_tokens_seen": 279823730, + "step": 12972, + "time_per_iteration": 2.64863920211792 + }, + { + "auxiliary_loss_clip": 0.01097717, + "auxiliary_loss_mlp": 0.01023695, + "balance_loss_clip": 1.03389871, + "balance_loss_mlp": 1.01233411, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 2.740021122341962, + "language_loss": 0.7095865, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73080063, + "num_input_tokens_seen": 279843035, + "step": 12973, + "time_per_iteration": 2.5474796295166016 + }, + { + "auxiliary_loss_clip": 0.01084061, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.03263032, + "balance_loss_mlp": 1.02045131, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 1.8783465009550058, + "language_loss": 0.77780908, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79895294, + "num_input_tokens_seen": 279861450, + "step": 12974, + "time_per_iteration": 2.653385639190674 + }, + { + "auxiliary_loss_clip": 0.01062821, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.03125453, + "balance_loss_mlp": 1.016523, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.7128704507742964, + "language_loss": 0.69207108, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71297324, + "num_input_tokens_seen": 279878660, + "step": 12975, + "time_per_iteration": 2.659907579421997 + }, + { + "auxiliary_loss_clip": 0.0105992, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.03136849, + "balance_loss_mlp": 1.01732624, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 1.972076270356831, + "language_loss": 0.81667149, + "learning_rate": 4.858029287593739e-07, + "loss": 0.83754957, + "num_input_tokens_seen": 279895685, + "step": 12976, + "time_per_iteration": 2.5842442512512207 + }, + { + "auxiliary_loss_clip": 0.01075469, + "auxiliary_loss_mlp": 0.00749427, + "balance_loss_clip": 1.03087461, + "balance_loss_mlp": 1.00028276, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.3924745930284972, + "language_loss": 0.65864235, + "learning_rate": 4.85548521880289e-07, + "loss": 0.67689133, + "num_input_tokens_seen": 279917240, + "step": 12977, + "time_per_iteration": 2.6687092781066895 + }, + { + "auxiliary_loss_clip": 0.01074851, + "auxiliary_loss_mlp": 0.01024667, + "balance_loss_clip": 1.03333127, + "balance_loss_mlp": 1.01445067, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 1.4336179647789307, + "language_loss": 0.74949253, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77048773, + "num_input_tokens_seen": 279938665, + "step": 12978, + "time_per_iteration": 2.6764609813690186 + }, + { + "auxiliary_loss_clip": 0.0107065, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.03108788, + "balance_loss_mlp": 1.02499127, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 1.6745188102526132, + "language_loss": 0.62177765, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64287281, + "num_input_tokens_seen": 279957965, + "step": 12979, + "time_per_iteration": 2.65655255317688 + }, + { + "auxiliary_loss_clip": 0.0109853, + "auxiliary_loss_mlp": 0.01027082, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.01584029, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 2.586494179976858, + "language_loss": 0.7711798, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79243594, + "num_input_tokens_seen": 279977490, + "step": 12980, + "time_per_iteration": 2.6060967445373535 + }, + { + "auxiliary_loss_clip": 0.01099955, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.03496194, + "balance_loss_mlp": 1.01938105, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 2.2326449840470617, + "language_loss": 0.77782464, + "learning_rate": 4.845314687419046e-07, + "loss": 0.79912513, + "num_input_tokens_seen": 279994220, + "step": 12981, + "time_per_iteration": 2.524858236312866 + }, + { + "auxiliary_loss_clip": 0.01053306, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.03347445, + "balance_loss_mlp": 1.02206016, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 2.244406776767546, + "language_loss": 0.72827303, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74913895, + "num_input_tokens_seen": 280012590, + "step": 12982, + "time_per_iteration": 2.601585865020752 + }, + { + "auxiliary_loss_clip": 0.01071103, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.03189647, + "balance_loss_mlp": 1.02052283, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.697975767559689, + "language_loss": 0.7335549, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75457817, + "num_input_tokens_seen": 280033700, + "step": 12983, + "time_per_iteration": 4.135841131210327 + }, + { + "auxiliary_loss_clip": 0.01071374, + "auxiliary_loss_mlp": 0.01025155, + "balance_loss_clip": 1.03274059, + "balance_loss_mlp": 1.01442051, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 3.16292667891102, + "language_loss": 0.74995655, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77092183, + "num_input_tokens_seen": 280052215, + "step": 12984, + "time_per_iteration": 2.6139039993286133 + }, + { + "auxiliary_loss_clip": 0.01064973, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.02944434, + "balance_loss_mlp": 1.0226084, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 2.366457987070637, + "language_loss": 0.81411374, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83510095, + "num_input_tokens_seen": 280070525, + "step": 12985, + "time_per_iteration": 2.5522890090942383 + }, + { + "auxiliary_loss_clip": 0.01077209, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.03363013, + "balance_loss_mlp": 1.01789749, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.796160170245531, + "language_loss": 0.76895005, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79001087, + "num_input_tokens_seen": 280089855, + "step": 12986, + "time_per_iteration": 2.625642776489258 + }, + { + "auxiliary_loss_clip": 0.01087473, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.03299987, + "balance_loss_mlp": 1.0206778, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 1.9433094561474011, + "language_loss": 0.74339032, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76457787, + "num_input_tokens_seen": 280109960, + "step": 12987, + "time_per_iteration": 2.6562225818634033 + }, + { + "auxiliary_loss_clip": 0.01014896, + "auxiliary_loss_mlp": 0.01000814, + "balance_loss_clip": 1.00504434, + "balance_loss_mlp": 0.99980658, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7289829472145718, + "language_loss": 0.55011797, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57027507, + "num_input_tokens_seen": 280169805, + "step": 12988, + "time_per_iteration": 3.150269031524658 + }, + { + "auxiliary_loss_clip": 0.01063878, + "auxiliary_loss_mlp": 0.01034188, + "balance_loss_clip": 1.03176045, + "balance_loss_mlp": 1.02387607, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.6003955281405333, + "language_loss": 0.80629063, + "learning_rate": 4.82500121484009e-07, + "loss": 0.82727134, + "num_input_tokens_seen": 280184630, + "step": 12989, + "time_per_iteration": 2.6189348697662354 + }, + { + "auxiliary_loss_clip": 0.01063816, + "auxiliary_loss_mlp": 0.01025438, + "balance_loss_clip": 1.03213215, + "balance_loss_mlp": 1.01488161, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.4792186876547213, + "language_loss": 0.7013219, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72221446, + "num_input_tokens_seen": 280203880, + "step": 12990, + "time_per_iteration": 2.631406784057617 + }, + { + "auxiliary_loss_clip": 0.01073268, + "auxiliary_loss_mlp": 0.01028299, + "balance_loss_clip": 1.03302717, + "balance_loss_mlp": 1.01598454, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 2.035955396628939, + "language_loss": 0.78032082, + "learning_rate": 4.819928599145184e-07, + "loss": 0.80133653, + "num_input_tokens_seen": 280220460, + "step": 12991, + "time_per_iteration": 4.118035793304443 + }, + { + "auxiliary_loss_clip": 0.01059595, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.03081453, + "balance_loss_mlp": 1.02512324, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.5102118350888867, + "language_loss": 0.65949064, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68045306, + "num_input_tokens_seen": 280242680, + "step": 12992, + "time_per_iteration": 2.811768054962158 + }, + { + "auxiliary_loss_clip": 0.01099536, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.03492558, + "balance_loss_mlp": 1.01836169, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.836157031582451, + "language_loss": 0.61937696, + "learning_rate": 4.814858285969578e-07, + "loss": 0.64066494, + "num_input_tokens_seen": 280260655, + "step": 12993, + "time_per_iteration": 2.5525784492492676 + }, + { + "auxiliary_loss_clip": 0.01071306, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.03056335, + "balance_loss_mlp": 1.01817322, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.7800371617429909, + "language_loss": 0.68363225, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70464325, + "num_input_tokens_seen": 280281185, + "step": 12994, + "time_per_iteration": 2.6408028602600098 + }, + { + "auxiliary_loss_clip": 0.01095868, + "auxiliary_loss_mlp": 0.01026459, + "balance_loss_clip": 1.0339179, + "balance_loss_mlp": 1.01607037, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.9734543099942095, + "language_loss": 0.68936378, + "learning_rate": 4.809790276082335e-07, + "loss": 0.71058702, + "num_input_tokens_seen": 280298255, + "step": 12995, + "time_per_iteration": 2.5287435054779053 + }, + { + "auxiliary_loss_clip": 0.01052472, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.03143954, + "balance_loss_mlp": 1.01529694, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.7548864372854847, + "language_loss": 0.7503345, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77111238, + "num_input_tokens_seen": 280319000, + "step": 12996, + "time_per_iteration": 2.7444753646850586 + }, + { + "auxiliary_loss_clip": 0.01102644, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.03532839, + "balance_loss_mlp": 1.01835561, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.822763204466924, + "language_loss": 0.68204021, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70336986, + "num_input_tokens_seen": 280336375, + "step": 12997, + "time_per_iteration": 2.4848477840423584 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.03502393, + "balance_loss_mlp": 1.01906407, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.7420805274871487, + "language_loss": 0.82011569, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84145504, + "num_input_tokens_seen": 280358760, + "step": 12998, + "time_per_iteration": 2.5095126628875732 + }, + { + "auxiliary_loss_clip": 0.01066308, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.02905393, + "balance_loss_mlp": 1.02076077, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 2.02889107195166, + "language_loss": 0.748335, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76932567, + "num_input_tokens_seen": 280377085, + "step": 12999, + "time_per_iteration": 4.104809761047363 + }, + { + "auxiliary_loss_clip": 0.01081916, + "auxiliary_loss_mlp": 0.01039599, + "balance_loss_clip": 1.03207684, + "balance_loss_mlp": 1.02678406, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.4975640786458493, + "language_loss": 0.84720069, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86841583, + "num_input_tokens_seen": 280395465, + "step": 13000, + "time_per_iteration": 2.5615084171295166 + }, + { + "auxiliary_loss_clip": 0.01089361, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.03462148, + "balance_loss_mlp": 1.01880765, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 1.826131878122029, + "language_loss": 0.66645503, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68764889, + "num_input_tokens_seen": 280412775, + "step": 13001, + "time_per_iteration": 2.5014877319335938 + }, + { + "auxiliary_loss_clip": 0.0105793, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.0316186, + "balance_loss_mlp": 1.02086139, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.6977370621460537, + "language_loss": 0.66845858, + "learning_rate": 4.792070390968027e-07, + "loss": 0.68935812, + "num_input_tokens_seen": 280432905, + "step": 13002, + "time_per_iteration": 2.7586281299591064 + }, + { + "auxiliary_loss_clip": 0.01091527, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.03570342, + "balance_loss_mlp": 1.02238989, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.6182358171656785, + "language_loss": 0.7288844, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75014484, + "num_input_tokens_seen": 280450785, + "step": 13003, + "time_per_iteration": 2.5594887733459473 + }, + { + "auxiliary_loss_clip": 0.01087424, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.03449106, + "balance_loss_mlp": 1.01987016, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.6064947382554262, + "language_loss": 0.61795795, + "learning_rate": 4.787012755386233e-07, + "loss": 0.6391418, + "num_input_tokens_seen": 280468400, + "step": 13004, + "time_per_iteration": 2.576782703399658 + }, + { + "auxiliary_loss_clip": 0.01093133, + "auxiliary_loss_mlp": 0.0102805, + "balance_loss_clip": 1.0330925, + "balance_loss_mlp": 1.01801825, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 2.6067175056350695, + "language_loss": 0.82595438, + "learning_rate": 4.784484802864403e-07, + "loss": 0.84716618, + "num_input_tokens_seen": 280483930, + "step": 13005, + "time_per_iteration": 2.4821622371673584 + }, + { + "auxiliary_loss_clip": 0.01055256, + "auxiliary_loss_mlp": 0.0074948, + "balance_loss_clip": 1.02938986, + "balance_loss_mlp": 1.00024772, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 2.0480502622347383, + "language_loss": 0.72594833, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74399567, + "num_input_tokens_seen": 280503465, + "step": 13006, + "time_per_iteration": 2.621333360671997 + }, + { + "auxiliary_loss_clip": 0.01088612, + "auxiliary_loss_mlp": 0.00749571, + "balance_loss_clip": 1.03365684, + "balance_loss_mlp": 1.00024438, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.8516302553646913, + "language_loss": 0.72177678, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74015862, + "num_input_tokens_seen": 280523375, + "step": 13007, + "time_per_iteration": 2.538686752319336 + }, + { + "auxiliary_loss_clip": 0.01098995, + "auxiliary_loss_mlp": 0.01027535, + "balance_loss_clip": 1.0327425, + "balance_loss_mlp": 1.01594245, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 1.9365863703534425, + "language_loss": 0.68581533, + "learning_rate": 4.776904407525397e-07, + "loss": 0.70708072, + "num_input_tokens_seen": 280542920, + "step": 13008, + "time_per_iteration": 2.485785722732544 + }, + { + "auxiliary_loss_clip": 0.01071263, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.03244317, + "balance_loss_mlp": 1.01735449, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.9520921240508484, + "language_loss": 0.69760066, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71860403, + "num_input_tokens_seen": 280561700, + "step": 13009, + "time_per_iteration": 4.1552393436431885 + }, + { + "auxiliary_loss_clip": 0.01055104, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.02950025, + "balance_loss_mlp": 1.01636147, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 2.0298043468081284, + "language_loss": 0.81564164, + "learning_rate": 4.771853696779586e-07, + "loss": 0.83646888, + "num_input_tokens_seen": 280580605, + "step": 13010, + "time_per_iteration": 2.632211685180664 + }, + { + "auxiliary_loss_clip": 0.01083151, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.03113151, + "balance_loss_mlp": 1.02282572, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.5494289278158058, + "language_loss": 0.62208372, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64324689, + "num_input_tokens_seen": 280601495, + "step": 13011, + "time_per_iteration": 2.6091578006744385 + }, + { + "auxiliary_loss_clip": 0.01085699, + "auxiliary_loss_mlp": 0.01026618, + "balance_loss_clip": 1.03279138, + "balance_loss_mlp": 1.01695609, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.7719128141016611, + "language_loss": 0.70539612, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72651929, + "num_input_tokens_seen": 280622760, + "step": 13012, + "time_per_iteration": 2.6103012561798096 + }, + { + "auxiliary_loss_clip": 0.01023978, + "auxiliary_loss_mlp": 0.00999953, + "balance_loss_clip": 1.00395608, + "balance_loss_mlp": 0.99909449, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 1.5330355877313708, + "language_loss": 0.55081373, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57105303, + "num_input_tokens_seen": 280687115, + "step": 13013, + "time_per_iteration": 3.1522955894470215 + }, + { + "auxiliary_loss_clip": 0.01072335, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.02254832, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.9052329280286882, + "language_loss": 0.65320635, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67427033, + "num_input_tokens_seen": 280705000, + "step": 13014, + "time_per_iteration": 2.65604567527771 + }, + { + "auxiliary_loss_clip": 0.01004818, + "auxiliary_loss_mlp": 0.01001273, + "balance_loss_clip": 1.00780451, + "balance_loss_mlp": 1.00014067, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.7785093897976257, + "language_loss": 0.58453017, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60459107, + "num_input_tokens_seen": 280773525, + "step": 13015, + "time_per_iteration": 3.229757070541382 + }, + { + "auxiliary_loss_clip": 0.01069137, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.03369772, + "balance_loss_mlp": 1.01935875, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.5898709047693145, + "language_loss": 0.7426008, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76358902, + "num_input_tokens_seen": 280791915, + "step": 13016, + "time_per_iteration": 2.6014342308044434 + }, + { + "auxiliary_loss_clip": 0.01099224, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.03397369, + "balance_loss_mlp": 1.0183754, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.5690570845497074, + "language_loss": 0.75026202, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77156222, + "num_input_tokens_seen": 280811460, + "step": 13017, + "time_per_iteration": 2.526719331741333 + }, + { + "auxiliary_loss_clip": 0.0107384, + "auxiliary_loss_mlp": 0.01031332, + "balance_loss_clip": 1.03161955, + "balance_loss_mlp": 1.01932812, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 2.016816214242161, + "language_loss": 0.75254452, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77359623, + "num_input_tokens_seen": 280825415, + "step": 13018, + "time_per_iteration": 2.5766427516937256 + }, + { + "auxiliary_loss_clip": 0.0109629, + "auxiliary_loss_mlp": 0.01026491, + "balance_loss_clip": 1.03283536, + "balance_loss_mlp": 1.01495123, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 2.173654519522647, + "language_loss": 0.77223754, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79346538, + "num_input_tokens_seen": 280845335, + "step": 13019, + "time_per_iteration": 2.5289812088012695 + }, + { + "auxiliary_loss_clip": 0.01052628, + "auxiliary_loss_mlp": 0.01023294, + "balance_loss_clip": 1.03274512, + "balance_loss_mlp": 1.01259518, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.375001655736195, + "language_loss": 0.67653978, + "learning_rate": 4.746634805529852e-07, + "loss": 0.697299, + "num_input_tokens_seen": 280867145, + "step": 13020, + "time_per_iteration": 2.753213405609131 + }, + { + "auxiliary_loss_clip": 0.01086074, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.03560853, + "balance_loss_mlp": 1.01733184, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 2.006460533981, + "language_loss": 0.6262536, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64739949, + "num_input_tokens_seen": 280886185, + "step": 13021, + "time_per_iteration": 2.598081350326538 + }, + { + "auxiliary_loss_clip": 0.01095307, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.03306794, + "balance_loss_mlp": 1.02108252, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.8202591938145176, + "language_loss": 0.6923666, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.7136358, + "num_input_tokens_seen": 280907665, + "step": 13022, + "time_per_iteration": 2.581937789916992 + }, + { + "auxiliary_loss_clip": 0.00975524, + "auxiliary_loss_mlp": 0.01003748, + "balance_loss_clip": 1.00947595, + "balance_loss_mlp": 1.00272894, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.641660590595451, + "language_loss": 0.5614481, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58124077, + "num_input_tokens_seen": 280971405, + "step": 13023, + "time_per_iteration": 4.835548400878906 + }, + { + "auxiliary_loss_clip": 0.0106392, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.02819991, + "balance_loss_mlp": 1.02052021, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 2.0527647586086366, + "language_loss": 0.67276675, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69372487, + "num_input_tokens_seen": 280989615, + "step": 13024, + "time_per_iteration": 2.625718593597412 + }, + { + "auxiliary_loss_clip": 0.01099881, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.03482437, + "balance_loss_mlp": 1.0148797, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.702665772565967, + "language_loss": 0.77810347, + "learning_rate": 4.734047044272498e-07, + "loss": 0.7993651, + "num_input_tokens_seen": 281009450, + "step": 13025, + "time_per_iteration": 2.6185426712036133 + }, + { + "auxiliary_loss_clip": 0.01070582, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.03284359, + "balance_loss_mlp": 1.02275133, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.7175941578269276, + "language_loss": 0.78714049, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80818081, + "num_input_tokens_seen": 281028120, + "step": 13026, + "time_per_iteration": 2.655100107192993 + }, + { + "auxiliary_loss_clip": 0.01085159, + "auxiliary_loss_mlp": 0.01025092, + "balance_loss_clip": 1.03505445, + "balance_loss_mlp": 1.01447642, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 4.0263217864816925, + "language_loss": 0.7499401, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77104259, + "num_input_tokens_seen": 281042130, + "step": 13027, + "time_per_iteration": 2.576840400695801 + }, + { + "auxiliary_loss_clip": 0.01088974, + "auxiliary_loss_mlp": 0.0102513, + "balance_loss_clip": 1.03499401, + "balance_loss_mlp": 1.01479483, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.8231542379802526, + "language_loss": 0.70344996, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72459108, + "num_input_tokens_seen": 281060945, + "step": 13028, + "time_per_iteration": 2.5402233600616455 + }, + { + "auxiliary_loss_clip": 0.01050574, + "auxiliary_loss_mlp": 0.01039216, + "balance_loss_clip": 1.0346024, + "balance_loss_mlp": 1.02755165, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 2.006543536282768, + "language_loss": 0.68720138, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.70809925, + "num_input_tokens_seen": 281079270, + "step": 13029, + "time_per_iteration": 2.7377636432647705 + }, + { + "auxiliary_loss_clip": 0.01070186, + "auxiliary_loss_mlp": 0.01027184, + "balance_loss_clip": 1.03401554, + "balance_loss_mlp": 1.01552558, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.8965540784747195, + "language_loss": 0.80805391, + "learning_rate": 4.721473755175698e-07, + "loss": 0.82902753, + "num_input_tokens_seen": 281099500, + "step": 13030, + "time_per_iteration": 2.802244186401367 + }, + { + "auxiliary_loss_clip": 0.01091269, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.03421819, + "balance_loss_mlp": 1.01707029, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.751147444384061, + "language_loss": 0.70601904, + "learning_rate": 4.71896083506476e-07, + "loss": 0.7272169, + "num_input_tokens_seen": 281121250, + "step": 13031, + "time_per_iteration": 2.692859411239624 + }, + { + "auxiliary_loss_clip": 0.01059041, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.03095388, + "balance_loss_mlp": 1.0222553, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 1.7559000337353676, + "language_loss": 0.78859127, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80951738, + "num_input_tokens_seen": 281138760, + "step": 13032, + "time_per_iteration": 4.181833744049072 + }, + { + "auxiliary_loss_clip": 0.01093642, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.03685892, + "balance_loss_mlp": 1.02496731, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.650954255924689, + "language_loss": 0.62931895, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.65061855, + "num_input_tokens_seen": 281157420, + "step": 13033, + "time_per_iteration": 2.5905189514160156 + }, + { + "auxiliary_loss_clip": 0.01086069, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.03286064, + "balance_loss_mlp": 1.01958847, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.764935335829537, + "language_loss": 0.71951592, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74068564, + "num_input_tokens_seen": 281174620, + "step": 13034, + "time_per_iteration": 2.639228343963623 + }, + { + "auxiliary_loss_clip": 0.01100531, + "auxiliary_loss_mlp": 0.00749548, + "balance_loss_clip": 1.03522933, + "balance_loss_mlp": 1.00025177, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.7889475930810819, + "language_loss": 0.7209698, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.7394706, + "num_input_tokens_seen": 281193865, + "step": 13035, + "time_per_iteration": 2.6353259086608887 + }, + { + "auxiliary_loss_clip": 0.01099286, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.03472972, + "balance_loss_mlp": 1.0228529, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 2.6210582222331564, + "language_loss": 0.66527629, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68661582, + "num_input_tokens_seen": 281212250, + "step": 13036, + "time_per_iteration": 2.627842664718628 + }, + { + "auxiliary_loss_clip": 0.01093501, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.03544319, + "balance_loss_mlp": 1.0239048, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.3341266409903163, + "language_loss": 0.73391211, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75520647, + "num_input_tokens_seen": 281230850, + "step": 13037, + "time_per_iteration": 2.6162948608398438 + }, + { + "auxiliary_loss_clip": 0.01053662, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.02775884, + "balance_loss_mlp": 1.02153683, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.368365289337961, + "language_loss": 0.59964347, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62051243, + "num_input_tokens_seen": 281249810, + "step": 13038, + "time_per_iteration": 2.631896495819092 + }, + { + "auxiliary_loss_clip": 0.01075472, + "auxiliary_loss_mlp": 0.01025479, + "balance_loss_clip": 1.0333848, + "balance_loss_mlp": 1.01514912, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 2.3071739042594146, + "language_loss": 0.68126857, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70227808, + "num_input_tokens_seen": 281273730, + "step": 13039, + "time_per_iteration": 4.105385065078735 + }, + { + "auxiliary_loss_clip": 0.01057793, + "auxiliary_loss_mlp": 0.01023304, + "balance_loss_clip": 1.02898622, + "balance_loss_mlp": 1.01373768, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 2.2523576044522766, + "language_loss": 0.6923517, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71316266, + "num_input_tokens_seen": 281293670, + "step": 13040, + "time_per_iteration": 2.6812729835510254 + }, + { + "auxiliary_loss_clip": 0.01054761, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.03431082, + "balance_loss_mlp": 1.02084172, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 2.059011848354032, + "language_loss": 0.67526579, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69614291, + "num_input_tokens_seen": 281313070, + "step": 13041, + "time_per_iteration": 2.6930880546569824 + }, + { + "auxiliary_loss_clip": 0.01015248, + "auxiliary_loss_mlp": 0.00746531, + "balance_loss_clip": 1.00529242, + "balance_loss_mlp": 0.99971658, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6592183714851688, + "language_loss": 0.57412291, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59174067, + "num_input_tokens_seen": 281374880, + "step": 13042, + "time_per_iteration": 3.1664769649505615 + }, + { + "auxiliary_loss_clip": 0.01070636, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.03159976, + "balance_loss_mlp": 1.01934791, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 2.835540964537229, + "language_loss": 0.83696306, + "learning_rate": 4.688851018730369e-07, + "loss": 0.85797697, + "num_input_tokens_seen": 281392620, + "step": 13043, + "time_per_iteration": 2.874720573425293 + }, + { + "auxiliary_loss_clip": 0.01084924, + "auxiliary_loss_mlp": 0.01024488, + "balance_loss_clip": 1.03452802, + "balance_loss_mlp": 1.01392019, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.3964595303999017, + "language_loss": 0.88702118, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90811533, + "num_input_tokens_seen": 281413140, + "step": 13044, + "time_per_iteration": 2.697390079498291 + }, + { + "auxiliary_loss_clip": 0.01078926, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03260207, + "balance_loss_mlp": 1.01890254, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.78074383364826, + "language_loss": 0.78707743, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.80816936, + "num_input_tokens_seen": 281430860, + "step": 13045, + "time_per_iteration": 2.6233270168304443 + }, + { + "auxiliary_loss_clip": 0.0107149, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.03203988, + "balance_loss_mlp": 1.01830816, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.4235899424084852, + "language_loss": 0.72512758, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74613231, + "num_input_tokens_seen": 281451385, + "step": 13046, + "time_per_iteration": 2.6163110733032227 + }, + { + "auxiliary_loss_clip": 0.01054545, + "auxiliary_loss_mlp": 0.01034163, + "balance_loss_clip": 1.03552079, + "balance_loss_mlp": 1.02192664, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.8146973801727717, + "language_loss": 0.63174301, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65263009, + "num_input_tokens_seen": 281472255, + "step": 13047, + "time_per_iteration": 2.737078905105591 + }, + { + "auxiliary_loss_clip": 0.0108817, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.03541112, + "balance_loss_mlp": 1.01749659, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.5752883721132667, + "language_loss": 0.72945237, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75061613, + "num_input_tokens_seen": 281492860, + "step": 13048, + "time_per_iteration": 2.7118782997131348 + }, + { + "auxiliary_loss_clip": 0.01075335, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.03438973, + "balance_loss_mlp": 1.01847816, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 2.1842266486716504, + "language_loss": 0.74053693, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76159024, + "num_input_tokens_seen": 281511815, + "step": 13049, + "time_per_iteration": 4.151455402374268 + }, + { + "auxiliary_loss_clip": 0.01101017, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.03366053, + "balance_loss_mlp": 1.01888645, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 3.535002394323985, + "language_loss": 0.72908449, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.75040829, + "num_input_tokens_seen": 281530090, + "step": 13050, + "time_per_iteration": 2.54844069480896 + }, + { + "auxiliary_loss_clip": 0.01083451, + "auxiliary_loss_mlp": 0.01033266, + "balance_loss_clip": 1.03186417, + "balance_loss_mlp": 1.0221262, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.13796627915212, + "language_loss": 0.74085146, + "learning_rate": 4.668824245713825e-07, + "loss": 0.76201868, + "num_input_tokens_seen": 281547075, + "step": 13051, + "time_per_iteration": 2.568241834640503 + }, + { + "auxiliary_loss_clip": 0.01100804, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.03593409, + "balance_loss_mlp": 1.02243304, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.015679553093839, + "language_loss": 0.72909689, + "learning_rate": 4.666323514209227e-07, + "loss": 0.75045025, + "num_input_tokens_seen": 281568080, + "step": 13052, + "time_per_iteration": 2.653505325317383 + }, + { + "auxiliary_loss_clip": 0.01072183, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.03285146, + "balance_loss_mlp": 1.02304864, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 1.902807022135397, + "language_loss": 0.6916343, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71269315, + "num_input_tokens_seen": 281586925, + "step": 13053, + "time_per_iteration": 2.5698719024658203 + }, + { + "auxiliary_loss_clip": 0.0107932, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.03278089, + "balance_loss_mlp": 1.01754642, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 1.9317375124330416, + "language_loss": 0.70183802, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72291303, + "num_input_tokens_seen": 281603915, + "step": 13054, + "time_per_iteration": 2.5592000484466553 + }, + { + "auxiliary_loss_clip": 0.0108879, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.03398895, + "balance_loss_mlp": 1.02059865, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.7821242930663053, + "language_loss": 0.75579631, + "learning_rate": 4.658824808801938e-07, + "loss": 0.777004, + "num_input_tokens_seen": 281624220, + "step": 13055, + "time_per_iteration": 2.5856947898864746 + }, + { + "auxiliary_loss_clip": 0.01103631, + "auxiliary_loss_mlp": 0.01034765, + "balance_loss_clip": 1.03603375, + "balance_loss_mlp": 1.02302265, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 1.8876855271543285, + "language_loss": 0.74515057, + "learning_rate": 4.656326403684283e-07, + "loss": 0.76653451, + "num_input_tokens_seen": 281642325, + "step": 13056, + "time_per_iteration": 2.575216770172119 + }, + { + "auxiliary_loss_clip": 0.01036219, + "auxiliary_loss_mlp": 0.01027946, + "balance_loss_clip": 1.03289747, + "balance_loss_mlp": 1.01640666, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.907812010829499, + "language_loss": 0.69820309, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.71884471, + "num_input_tokens_seen": 281663065, + "step": 13057, + "time_per_iteration": 2.7510077953338623 + }, + { + "auxiliary_loss_clip": 0.01052734, + "auxiliary_loss_mlp": 0.01032661, + "balance_loss_clip": 1.03394079, + "balance_loss_mlp": 1.02170551, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 3.1601303175462934, + "language_loss": 0.7694186, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.79027259, + "num_input_tokens_seen": 281681005, + "step": 13058, + "time_per_iteration": 2.692715883255005 + }, + { + "auxiliary_loss_clip": 0.01086322, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.03335083, + "balance_loss_mlp": 1.02175057, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.842843624220876, + "language_loss": 0.70682627, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72802091, + "num_input_tokens_seen": 281697965, + "step": 13059, + "time_per_iteration": 2.6546390056610107 + }, + { + "auxiliary_loss_clip": 0.01065958, + "auxiliary_loss_mlp": 0.01037756, + "balance_loss_clip": 1.03144145, + "balance_loss_mlp": 1.02538812, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 2.035230643193086, + "language_loss": 0.76884091, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78987801, + "num_input_tokens_seen": 281716035, + "step": 13060, + "time_per_iteration": 2.6633520126342773 + }, + { + "auxiliary_loss_clip": 0.01070576, + "auxiliary_loss_mlp": 0.01027668, + "balance_loss_clip": 1.03385973, + "balance_loss_mlp": 1.01577091, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 1.9706210581136088, + "language_loss": 0.76882815, + "learning_rate": 4.643843107494654e-07, + "loss": 0.78981054, + "num_input_tokens_seen": 281732815, + "step": 13061, + "time_per_iteration": 2.6162471771240234 + }, + { + "auxiliary_loss_clip": 0.01060865, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.03023815, + "balance_loss_mlp": 1.0189836, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 12.131121858772733, + "language_loss": 0.74629056, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76720834, + "num_input_tokens_seen": 281751980, + "step": 13062, + "time_per_iteration": 2.696916103363037 + }, + { + "auxiliary_loss_clip": 0.01083751, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.03166556, + "balance_loss_mlp": 1.01954818, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 2.069451688814598, + "language_loss": 0.68448919, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70563143, + "num_input_tokens_seen": 281772670, + "step": 13063, + "time_per_iteration": 2.561878204345703 + }, + { + "auxiliary_loss_clip": 0.0108567, + "auxiliary_loss_mlp": 0.0102966, + "balance_loss_clip": 1.03631854, + "balance_loss_mlp": 1.01831722, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 1.7631811970498972, + "language_loss": 0.72559315, + "learning_rate": 4.636360116707625e-07, + "loss": 0.74674648, + "num_input_tokens_seen": 281792930, + "step": 13064, + "time_per_iteration": 2.676856756210327 + }, + { + "auxiliary_loss_clip": 0.01069323, + "auxiliary_loss_mlp": 0.01032445, + "balance_loss_clip": 1.03396225, + "balance_loss_mlp": 1.02122164, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 1.669933656097659, + "language_loss": 0.68120253, + "learning_rate": 4.633866951500718e-07, + "loss": 0.7022202, + "num_input_tokens_seen": 281811805, + "step": 13065, + "time_per_iteration": 4.132867813110352 + }, + { + "auxiliary_loss_clip": 0.0108726, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.03712487, + "balance_loss_mlp": 1.02238154, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.8146041825144903, + "language_loss": 0.76347315, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78468335, + "num_input_tokens_seen": 281831885, + "step": 13066, + "time_per_iteration": 2.560377597808838 + }, + { + "auxiliary_loss_clip": 0.01024206, + "auxiliary_loss_mlp": 0.01001564, + "balance_loss_clip": 1.0042268, + "balance_loss_mlp": 1.00064039, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.69921390162706, + "language_loss": 0.53380364, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55406135, + "num_input_tokens_seen": 281900310, + "step": 13067, + "time_per_iteration": 3.141031503677368 + }, + { + "auxiliary_loss_clip": 0.01054999, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.03317714, + "balance_loss_mlp": 1.01660275, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 2.0481096227930142, + "language_loss": 0.67586631, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69669843, + "num_input_tokens_seen": 281918870, + "step": 13068, + "time_per_iteration": 2.6917672157287598 + }, + { + "auxiliary_loss_clip": 0.01064967, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.03362405, + "balance_loss_mlp": 1.01985002, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 1.973180844901316, + "language_loss": 0.68224573, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70320261, + "num_input_tokens_seen": 281936905, + "step": 13069, + "time_per_iteration": 2.6460630893707275 + }, + { + "auxiliary_loss_clip": 0.01090896, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.03597188, + "balance_loss_mlp": 1.02203918, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.7723313884302088, + "language_loss": 0.76984048, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79108608, + "num_input_tokens_seen": 281955625, + "step": 13070, + "time_per_iteration": 2.624246120452881 + }, + { + "auxiliary_loss_clip": 0.0103465, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.02754092, + "balance_loss_mlp": 1.01965809, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.5502791761402386, + "language_loss": 0.6583873, + "learning_rate": 4.618920199958083e-07, + "loss": 0.67905724, + "num_input_tokens_seen": 281973285, + "step": 13071, + "time_per_iteration": 2.7248952388763428 + }, + { + "auxiliary_loss_clip": 0.01045946, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.02791882, + "balance_loss_mlp": 1.01858473, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.699371969686211, + "language_loss": 0.74090469, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76166105, + "num_input_tokens_seen": 281991410, + "step": 13072, + "time_per_iteration": 4.2648680210113525 + }, + { + "auxiliary_loss_clip": 0.01086937, + "auxiliary_loss_mlp": 0.01027009, + "balance_loss_clip": 1.03698301, + "balance_loss_mlp": 1.01510036, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 2.158890320150035, + "language_loss": 0.71533465, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73647416, + "num_input_tokens_seen": 282010845, + "step": 13073, + "time_per_iteration": 2.5954134464263916 + }, + { + "auxiliary_loss_clip": 0.0107276, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.0346092, + "balance_loss_mlp": 1.02049994, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.6871840281740498, + "language_loss": 0.7651161, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78616804, + "num_input_tokens_seen": 282029635, + "step": 13074, + "time_per_iteration": 2.6043295860290527 + }, + { + "auxiliary_loss_clip": 0.01052267, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.03104496, + "balance_loss_mlp": 1.0174824, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.5796370706705607, + "language_loss": 0.75084478, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77165121, + "num_input_tokens_seen": 282050285, + "step": 13075, + "time_per_iteration": 2.635038375854492 + }, + { + "auxiliary_loss_clip": 0.0107073, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.03589296, + "balance_loss_mlp": 1.01588345, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.8152738356397, + "language_loss": 0.69151175, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71248007, + "num_input_tokens_seen": 282071040, + "step": 13076, + "time_per_iteration": 2.6516737937927246 + }, + { + "auxiliary_loss_clip": 0.01082433, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.0327704, + "balance_loss_mlp": 1.01728582, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 2.1473647105229934, + "language_loss": 0.79836547, + "learning_rate": 4.603994445488282e-07, + "loss": 0.81947911, + "num_input_tokens_seen": 282086610, + "step": 13077, + "time_per_iteration": 2.535564661026001 + }, + { + "auxiliary_loss_clip": 0.01087066, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.03434527, + "balance_loss_mlp": 1.02005458, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.7717838369035428, + "language_loss": 0.70893073, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.730115, + "num_input_tokens_seen": 282107440, + "step": 13078, + "time_per_iteration": 2.6302075386047363 + }, + { + "auxiliary_loss_clip": 0.01082916, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.03408122, + "balance_loss_mlp": 1.01925063, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.5358466024009416, + "language_loss": 0.81053078, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83166301, + "num_input_tokens_seen": 282127290, + "step": 13079, + "time_per_iteration": 4.19733190536499 + }, + { + "auxiliary_loss_clip": 0.01062792, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.03518295, + "balance_loss_mlp": 1.01790345, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.5569875540868898, + "language_loss": 0.68416566, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70508087, + "num_input_tokens_seen": 282147505, + "step": 13080, + "time_per_iteration": 2.684400796890259 + }, + { + "auxiliary_loss_clip": 0.01086904, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.03407001, + "balance_loss_mlp": 1.02140605, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.7347773579665313, + "language_loss": 0.69434309, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71554315, + "num_input_tokens_seen": 282166450, + "step": 13081, + "time_per_iteration": 2.5332627296447754 + }, + { + "auxiliary_loss_clip": 0.01075409, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.03282785, + "balance_loss_mlp": 1.02280545, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.8954160314418478, + "language_loss": 0.68361008, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70469815, + "num_input_tokens_seen": 282186465, + "step": 13082, + "time_per_iteration": 2.631516933441162 + }, + { + "auxiliary_loss_clip": 0.01067659, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.03230631, + "balance_loss_mlp": 1.0224402, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.8332379875291351, + "language_loss": 0.66101742, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68203193, + "num_input_tokens_seen": 282207180, + "step": 13083, + "time_per_iteration": 2.609575033187866 + }, + { + "auxiliary_loss_clip": 0.01078204, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.03425527, + "balance_loss_mlp": 1.01945639, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.260426741101082, + "language_loss": 0.74180096, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76289654, + "num_input_tokens_seen": 282225865, + "step": 13084, + "time_per_iteration": 2.6292872428894043 + }, + { + "auxiliary_loss_clip": 0.01068514, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.03150368, + "balance_loss_mlp": 1.0232017, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 2.305657013268059, + "language_loss": 0.70535994, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72639596, + "num_input_tokens_seen": 282242895, + "step": 13085, + "time_per_iteration": 2.5745933055877686 + }, + { + "auxiliary_loss_clip": 0.01073289, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.03218758, + "balance_loss_mlp": 1.01894009, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 2.123238971981659, + "language_loss": 0.72849566, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.7495352, + "num_input_tokens_seen": 282260425, + "step": 13086, + "time_per_iteration": 2.6272380352020264 + }, + { + "auxiliary_loss_clip": 0.01096135, + "auxiliary_loss_mlp": 0.01026076, + "balance_loss_clip": 1.0323751, + "balance_loss_mlp": 1.01505482, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.8041849397520853, + "language_loss": 0.74617553, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.76739764, + "num_input_tokens_seen": 282279335, + "step": 13087, + "time_per_iteration": 2.542023181915283 + }, + { + "auxiliary_loss_clip": 0.01074962, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.03253388, + "balance_loss_mlp": 1.01821637, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.637018877888885, + "language_loss": 0.71525097, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73628891, + "num_input_tokens_seen": 282299905, + "step": 13088, + "time_per_iteration": 2.6362826824188232 + }, + { + "auxiliary_loss_clip": 0.01023731, + "auxiliary_loss_mlp": 0.01002286, + "balance_loss_clip": 1.00382483, + "balance_loss_mlp": 1.00134981, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6794462530536347, + "language_loss": 0.55448043, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57474065, + "num_input_tokens_seen": 282367620, + "step": 13089, + "time_per_iteration": 4.7193567752838135 + }, + { + "auxiliary_loss_clip": 0.01015232, + "auxiliary_loss_mlp": 0.01001933, + "balance_loss_clip": 1.00561261, + "balance_loss_mlp": 1.00102127, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7281553393762508, + "language_loss": 0.49952894, + "learning_rate": 4.571727439470976e-07, + "loss": 0.51970065, + "num_input_tokens_seen": 282435695, + "step": 13090, + "time_per_iteration": 3.171635389328003 + }, + { + "auxiliary_loss_clip": 0.01088445, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.03564262, + "balance_loss_mlp": 1.01822352, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.714154151897209, + "language_loss": 0.83797693, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.85914767, + "num_input_tokens_seen": 282456025, + "step": 13091, + "time_per_iteration": 2.5775420665740967 + }, + { + "auxiliary_loss_clip": 0.01014172, + "auxiliary_loss_mlp": 0.01001761, + "balance_loss_clip": 1.00448656, + "balance_loss_mlp": 1.00088489, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7152765800714898, + "language_loss": 0.63924295, + "learning_rate": 4.566772055150947e-07, + "loss": 0.65940231, + "num_input_tokens_seen": 282520995, + "step": 13092, + "time_per_iteration": 3.125236749649048 + }, + { + "auxiliary_loss_clip": 0.01078734, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.03604984, + "balance_loss_mlp": 1.0237509, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.0876340949280525, + "language_loss": 0.79135227, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81249619, + "num_input_tokens_seen": 282539355, + "step": 13093, + "time_per_iteration": 2.602855682373047 + }, + { + "auxiliary_loss_clip": 0.01071315, + "auxiliary_loss_mlp": 0.01025289, + "balance_loss_clip": 1.03582549, + "balance_loss_mlp": 1.01470351, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 2.2772561635775914, + "language_loss": 0.75443453, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77540052, + "num_input_tokens_seen": 282555735, + "step": 13094, + "time_per_iteration": 2.6046810150146484 + }, + { + "auxiliary_loss_clip": 0.01045021, + "auxiliary_loss_mlp": 0.01040401, + "balance_loss_clip": 1.02892065, + "balance_loss_mlp": 1.02774704, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.7296378208936687, + "language_loss": 0.79782611, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81868035, + "num_input_tokens_seen": 282574550, + "step": 13095, + "time_per_iteration": 2.6923937797546387 + }, + { + "auxiliary_loss_clip": 0.01087391, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.03267944, + "balance_loss_mlp": 1.02062583, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 1.8897891947740943, + "language_loss": 0.67975783, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70094979, + "num_input_tokens_seen": 282596520, + "step": 13096, + "time_per_iteration": 2.6413166522979736 + }, + { + "auxiliary_loss_clip": 0.01067892, + "auxiliary_loss_mlp": 0.01022472, + "balance_loss_clip": 1.02977443, + "balance_loss_mlp": 1.01272702, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.5099772811863812, + "language_loss": 0.70553088, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72643459, + "num_input_tokens_seen": 282620560, + "step": 13097, + "time_per_iteration": 2.802647352218628 + }, + { + "auxiliary_loss_clip": 0.01079081, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.03552318, + "balance_loss_mlp": 1.02195525, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.6300328120906087, + "language_loss": 0.80400562, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82512802, + "num_input_tokens_seen": 282639830, + "step": 13098, + "time_per_iteration": 2.607403516769409 + }, + { + "auxiliary_loss_clip": 0.01054129, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.03005552, + "balance_loss_mlp": 1.0192225, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.7365598555136619, + "language_loss": 0.74641025, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76724964, + "num_input_tokens_seen": 282660130, + "step": 13099, + "time_per_iteration": 2.6084258556365967 + }, + { + "auxiliary_loss_clip": 0.01073298, + "auxiliary_loss_mlp": 0.01025596, + "balance_loss_clip": 1.03140211, + "balance_loss_mlp": 1.01387215, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.5853461276971486, + "language_loss": 0.78458005, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80556905, + "num_input_tokens_seen": 282681125, + "step": 13100, + "time_per_iteration": 2.6194117069244385 + }, + { + "auxiliary_loss_clip": 0.010873, + "auxiliary_loss_mlp": 0.00749546, + "balance_loss_clip": 1.03378451, + "balance_loss_mlp": 1.00020289, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.4831139755859177, + "language_loss": 0.66228032, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.6806488, + "num_input_tokens_seen": 282696690, + "step": 13101, + "time_per_iteration": 2.5604376792907715 + }, + { + "auxiliary_loss_clip": 0.01075082, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.0331769, + "balance_loss_mlp": 1.01778817, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.521006414995975, + "language_loss": 0.77893126, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79996288, + "num_input_tokens_seen": 282721210, + "step": 13102, + "time_per_iteration": 2.7369656562805176 + }, + { + "auxiliary_loss_clip": 0.01082473, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.03195953, + "balance_loss_mlp": 1.02308297, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 1.8043365430179528, + "language_loss": 0.82095528, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84211552, + "num_input_tokens_seen": 282738505, + "step": 13103, + "time_per_iteration": 2.5126945972442627 + }, + { + "auxiliary_loss_clip": 0.01090061, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.03452253, + "balance_loss_mlp": 1.02040052, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 2.0382875301941605, + "language_loss": 0.80430627, + "learning_rate": 4.537088934794913e-07, + "loss": 0.8255288, + "num_input_tokens_seen": 282756895, + "step": 13104, + "time_per_iteration": 4.031643390655518 + }, + { + "auxiliary_loss_clip": 0.01099892, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.03450942, + "balance_loss_mlp": 1.02218533, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.6707973799070246, + "language_loss": 0.74080491, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76214051, + "num_input_tokens_seen": 282774955, + "step": 13105, + "time_per_iteration": 2.518738031387329 + }, + { + "auxiliary_loss_clip": 0.01038521, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.02979386, + "balance_loss_mlp": 1.02239037, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.6447637664881083, + "language_loss": 0.76420009, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.78492153, + "num_input_tokens_seen": 282793165, + "step": 13106, + "time_per_iteration": 2.6944546699523926 + }, + { + "auxiliary_loss_clip": 0.01048372, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.03228092, + "balance_loss_mlp": 1.02029824, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.4309488282609197, + "language_loss": 0.73169494, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75248659, + "num_input_tokens_seen": 282809820, + "step": 13107, + "time_per_iteration": 2.6058857440948486 + }, + { + "auxiliary_loss_clip": 0.01096932, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.03332269, + "balance_loss_mlp": 1.01815748, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.670368099724553, + "language_loss": 0.73285711, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75411797, + "num_input_tokens_seen": 282828600, + "step": 13108, + "time_per_iteration": 2.4599268436431885 + }, + { + "auxiliary_loss_clip": 0.01023531, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00355744, + "balance_loss_mlp": 1.00215662, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8833145497312732, + "language_loss": 0.60345435, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62372142, + "num_input_tokens_seen": 282882775, + "step": 13109, + "time_per_iteration": 2.9742562770843506 + }, + { + "auxiliary_loss_clip": 0.01052045, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.03198934, + "balance_loss_mlp": 1.01849103, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.7252323281162836, + "language_loss": 0.72536695, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74618334, + "num_input_tokens_seen": 282902680, + "step": 13110, + "time_per_iteration": 2.553821563720703 + }, + { + "auxiliary_loss_clip": 0.01045733, + "auxiliary_loss_mlp": 0.01028055, + "balance_loss_clip": 1.03088999, + "balance_loss_mlp": 1.0180769, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3681313273087379, + "language_loss": 0.7513063, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77204418, + "num_input_tokens_seen": 282923625, + "step": 13111, + "time_per_iteration": 4.144834995269775 + }, + { + "auxiliary_loss_clip": 0.01080156, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.03302097, + "balance_loss_mlp": 1.02065206, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 2.542918263327984, + "language_loss": 0.61369097, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63481623, + "num_input_tokens_seen": 282941955, + "step": 13112, + "time_per_iteration": 2.6252989768981934 + }, + { + "auxiliary_loss_clip": 0.01077024, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.0338521, + "balance_loss_mlp": 1.01632738, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.7879788171471487, + "language_loss": 0.673527, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69457459, + "num_input_tokens_seen": 282961280, + "step": 13113, + "time_per_iteration": 2.667740821838379 + }, + { + "auxiliary_loss_clip": 0.01056609, + "auxiliary_loss_mlp": 0.01030574, + "balance_loss_clip": 1.03084147, + "balance_loss_mlp": 1.01846242, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 2.7926845534669353, + "language_loss": 0.57723486, + "learning_rate": 4.5124174933361e-07, + "loss": 0.59810668, + "num_input_tokens_seen": 282978210, + "step": 13114, + "time_per_iteration": 2.6026315689086914 + }, + { + "auxiliary_loss_clip": 0.01049864, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.03261054, + "balance_loss_mlp": 1.01764345, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.5301397217443038, + "language_loss": 0.66894013, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.68973494, + "num_input_tokens_seen": 282998845, + "step": 13115, + "time_per_iteration": 2.7140042781829834 + }, + { + "auxiliary_loss_clip": 0.01070248, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.03183901, + "balance_loss_mlp": 1.01869428, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 1.9316579290874532, + "language_loss": 0.88574576, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90675467, + "num_input_tokens_seen": 283015200, + "step": 13116, + "time_per_iteration": 2.524517059326172 + }, + { + "auxiliary_loss_clip": 0.0108051, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.03245938, + "balance_loss_mlp": 1.02064502, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 1.7736654306152453, + "language_loss": 0.72657204, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74770898, + "num_input_tokens_seen": 283033680, + "step": 13117, + "time_per_iteration": 2.5629611015319824 + }, + { + "auxiliary_loss_clip": 0.0108392, + "auxiliary_loss_mlp": 0.01025309, + "balance_loss_clip": 1.03276598, + "balance_loss_mlp": 1.01533127, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.6446431024561297, + "language_loss": 0.79875833, + "learning_rate": 4.502565355654926e-07, + "loss": 0.81985068, + "num_input_tokens_seen": 283050620, + "step": 13118, + "time_per_iteration": 3.9756789207458496 + }, + { + "auxiliary_loss_clip": 0.0108581, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_clip": 1.03356314, + "balance_loss_mlp": 1.01524901, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.737078896930347, + "language_loss": 0.72911882, + "learning_rate": 4.500103790161878e-07, + "loss": 0.7502377, + "num_input_tokens_seen": 283070215, + "step": 13119, + "time_per_iteration": 2.5807745456695557 + }, + { + "auxiliary_loss_clip": 0.01081447, + "auxiliary_loss_mlp": 0.01024974, + "balance_loss_clip": 1.03168523, + "balance_loss_mlp": 1.01363742, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.4118486890070074, + "language_loss": 0.71861053, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.73967475, + "num_input_tokens_seen": 283091485, + "step": 13120, + "time_per_iteration": 2.539675235748291 + }, + { + "auxiliary_loss_clip": 0.01067953, + "auxiliary_loss_mlp": 0.00749624, + "balance_loss_clip": 1.03032184, + "balance_loss_mlp": 1.00023913, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.4616219682268359, + "language_loss": 0.78632736, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80450308, + "num_input_tokens_seen": 283115040, + "step": 13121, + "time_per_iteration": 2.6807358264923096 + }, + { + "auxiliary_loss_clip": 0.0108179, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.0314424, + "balance_loss_mlp": 1.01709676, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.3924956475305756, + "language_loss": 0.80171645, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82281625, + "num_input_tokens_seen": 283136925, + "step": 13122, + "time_per_iteration": 2.585319995880127 + }, + { + "auxiliary_loss_clip": 0.01069566, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.03272355, + "balance_loss_mlp": 1.01608276, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 2.230672335196501, + "language_loss": 0.7804448, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80140674, + "num_input_tokens_seen": 283155725, + "step": 13123, + "time_per_iteration": 2.5696449279785156 + }, + { + "auxiliary_loss_clip": 0.01077379, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.03754163, + "balance_loss_mlp": 1.02070451, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.820431379731183, + "language_loss": 0.67300051, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69409788, + "num_input_tokens_seen": 283173845, + "step": 13124, + "time_per_iteration": 2.6193859577178955 + }, + { + "auxiliary_loss_clip": 0.01073826, + "auxiliary_loss_mlp": 0.01026829, + "balance_loss_clip": 1.03281188, + "balance_loss_mlp": 1.01511097, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 3.3203050817813504, + "language_loss": 0.72439235, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.74539888, + "num_input_tokens_seen": 283191985, + "step": 13125, + "time_per_iteration": 2.5881118774414062 + }, + { + "auxiliary_loss_clip": 0.01079264, + "auxiliary_loss_mlp": 0.01027278, + "balance_loss_clip": 1.03192735, + "balance_loss_mlp": 1.01548266, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 1.9132818329641328, + "language_loss": 0.72456652, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74563193, + "num_input_tokens_seen": 283210855, + "step": 13126, + "time_per_iteration": 2.617619514465332 + }, + { + "auxiliary_loss_clip": 0.01077559, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.03214526, + "balance_loss_mlp": 1.0184449, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 2.5875783504243066, + "language_loss": 0.76409417, + "learning_rate": 4.480432433327845e-07, + "loss": 0.78517294, + "num_input_tokens_seen": 283229665, + "step": 13127, + "time_per_iteration": 2.5524322986602783 + }, + { + "auxiliary_loss_clip": 0.01079408, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.03274703, + "balance_loss_mlp": 1.02246571, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 2.210623280866358, + "language_loss": 0.85608447, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87722123, + "num_input_tokens_seen": 283248615, + "step": 13128, + "time_per_iteration": 2.6219191551208496 + }, + { + "auxiliary_loss_clip": 0.01084853, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.03257322, + "balance_loss_mlp": 1.02352345, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 4.3115267604132, + "language_loss": 0.69367695, + "learning_rate": 4.475520477290904e-07, + "loss": 0.71486276, + "num_input_tokens_seen": 283267135, + "step": 13129, + "time_per_iteration": 4.068050861358643 + }, + { + "auxiliary_loss_clip": 0.01013562, + "auxiliary_loss_mlp": 0.01006702, + "balance_loss_clip": 1.00376487, + "balance_loss_mlp": 1.0053786, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7441317530483843, + "language_loss": 0.61650932, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63671196, + "num_input_tokens_seen": 283328940, + "step": 13130, + "time_per_iteration": 3.133321762084961 + }, + { + "auxiliary_loss_clip": 0.01089239, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.03565407, + "balance_loss_mlp": 1.01813006, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.551831461267575, + "language_loss": 0.74011648, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.76129287, + "num_input_tokens_seen": 283350000, + "step": 13131, + "time_per_iteration": 2.591108560562134 + }, + { + "auxiliary_loss_clip": 0.01075788, + "auxiliary_loss_mlp": 0.01025737, + "balance_loss_clip": 1.03326821, + "balance_loss_mlp": 1.0128628, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.429041343475568, + "language_loss": 0.68830037, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.7093156, + "num_input_tokens_seen": 283368020, + "step": 13132, + "time_per_iteration": 2.5399770736694336 + }, + { + "auxiliary_loss_clip": 0.01089851, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.03438222, + "balance_loss_mlp": 1.02439094, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 1.9909945803682858, + "language_loss": 0.6213271, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64258397, + "num_input_tokens_seen": 283387030, + "step": 13133, + "time_per_iteration": 2.4905996322631836 + }, + { + "auxiliary_loss_clip": 0.01069043, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.03330851, + "balance_loss_mlp": 1.02390063, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.1521862422135447, + "language_loss": 0.79314834, + "learning_rate": 4.463250890899195e-07, + "loss": 0.81420428, + "num_input_tokens_seen": 283402090, + "step": 13134, + "time_per_iteration": 2.522310972213745 + }, + { + "auxiliary_loss_clip": 0.01083637, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.03073001, + "balance_loss_mlp": 1.0205729, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.9325548509648565, + "language_loss": 0.79940975, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82056415, + "num_input_tokens_seen": 283421035, + "step": 13135, + "time_per_iteration": 2.541059970855713 + }, + { + "auxiliary_loss_clip": 0.01086322, + "auxiliary_loss_mlp": 0.01028629, + "balance_loss_clip": 1.03390181, + "balance_loss_mlp": 1.01720929, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.5521418194838135, + "language_loss": 0.72474468, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74589413, + "num_input_tokens_seen": 283441830, + "step": 13136, + "time_per_iteration": 2.6379337310791016 + }, + { + "auxiliary_loss_clip": 0.01104784, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.03523278, + "balance_loss_mlp": 1.02116334, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 5.642992142929285, + "language_loss": 0.70566082, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72703987, + "num_input_tokens_seen": 283459540, + "step": 13137, + "time_per_iteration": 2.4771084785461426 + }, + { + "auxiliary_loss_clip": 0.01097367, + "auxiliary_loss_mlp": 0.01033154, + "balance_loss_clip": 1.03411472, + "balance_loss_mlp": 1.0212208, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.6711444156155388, + "language_loss": 0.74357677, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.76488203, + "num_input_tokens_seen": 283478790, + "step": 13138, + "time_per_iteration": 2.4646694660186768 + }, + { + "auxiliary_loss_clip": 0.01058808, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.0337081, + "balance_loss_mlp": 1.01876974, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 2.917398490979954, + "language_loss": 0.69070518, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.71158946, + "num_input_tokens_seen": 283495720, + "step": 13139, + "time_per_iteration": 2.6589040756225586 + }, + { + "auxiliary_loss_clip": 0.0101426, + "auxiliary_loss_mlp": 0.0100762, + "balance_loss_clip": 1.00370479, + "balance_loss_mlp": 1.0067023, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8459435729322113, + "language_loss": 0.60283512, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62305391, + "num_input_tokens_seen": 283558795, + "step": 13140, + "time_per_iteration": 3.240189552307129 + }, + { + "auxiliary_loss_clip": 0.01099694, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.03516257, + "balance_loss_mlp": 1.02252412, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.773555619703708, + "language_loss": 0.75864482, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.77998269, + "num_input_tokens_seen": 283579305, + "step": 13141, + "time_per_iteration": 2.563168525695801 + }, + { + "auxiliary_loss_clip": 0.01088853, + "auxiliary_loss_mlp": 0.01032351, + "balance_loss_clip": 1.0339036, + "balance_loss_mlp": 1.02081156, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 3.3226711966543134, + "language_loss": 0.68228877, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70350087, + "num_input_tokens_seen": 283597840, + "step": 13142, + "time_per_iteration": 2.592756509780884 + }, + { + "auxiliary_loss_clip": 0.00976993, + "auxiliary_loss_mlp": 0.01004939, + "balance_loss_clip": 1.00836921, + "balance_loss_mlp": 1.00349104, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8229302921238688, + "language_loss": 0.60050666, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62032598, + "num_input_tokens_seen": 283647950, + "step": 13143, + "time_per_iteration": 3.059338331222534 + }, + { + "auxiliary_loss_clip": 0.01069514, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.03451419, + "balance_loss_mlp": 1.01846671, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.749682852943323, + "language_loss": 0.74408865, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76508176, + "num_input_tokens_seen": 283670645, + "step": 13144, + "time_per_iteration": 2.788278818130493 + }, + { + "auxiliary_loss_clip": 0.01091895, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.0346055, + "balance_loss_mlp": 1.0166285, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.8672914841705475, + "language_loss": 0.83086336, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85206616, + "num_input_tokens_seen": 283688830, + "step": 13145, + "time_per_iteration": 4.09409499168396 + }, + { + "auxiliary_loss_clip": 0.01083151, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.03215206, + "balance_loss_mlp": 1.01895452, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.696299854936624, + "language_loss": 0.72620809, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.74733317, + "num_input_tokens_seen": 283708625, + "step": 13146, + "time_per_iteration": 2.547469139099121 + }, + { + "auxiliary_loss_clip": 0.01098646, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.03307438, + "balance_loss_mlp": 1.01811147, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 2.0180536291143443, + "language_loss": 0.75603306, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77731037, + "num_input_tokens_seen": 283725710, + "step": 13147, + "time_per_iteration": 2.4761438369750977 + }, + { + "auxiliary_loss_clip": 0.01082321, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.03299642, + "balance_loss_mlp": 1.01962471, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.9885665496285043, + "language_loss": 0.72270888, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74384695, + "num_input_tokens_seen": 283744150, + "step": 13148, + "time_per_iteration": 2.543644666671753 + }, + { + "auxiliary_loss_clip": 0.01081689, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.01615489, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 1.7928829964103605, + "language_loss": 0.7137959, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73488903, + "num_input_tokens_seen": 283764170, + "step": 13149, + "time_per_iteration": 2.5777785778045654 + }, + { + "auxiliary_loss_clip": 0.01059778, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.03058529, + "balance_loss_mlp": 1.02041388, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 2.894828277750414, + "language_loss": 0.65443718, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67537475, + "num_input_tokens_seen": 283784305, + "step": 13150, + "time_per_iteration": 2.633639097213745 + }, + { + "auxiliary_loss_clip": 0.01096542, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.0332725, + "balance_loss_mlp": 1.01963735, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 1.9922029322572323, + "language_loss": 0.70086062, + "learning_rate": 4.421644538650231e-07, + "loss": 0.72212946, + "num_input_tokens_seen": 283804040, + "step": 13151, + "time_per_iteration": 4.021931886672974 + }, + { + "auxiliary_loss_clip": 0.01079941, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.03428102, + "balance_loss_mlp": 1.02170897, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 2.750189577255607, + "language_loss": 0.70031965, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72145337, + "num_input_tokens_seen": 283827120, + "step": 13152, + "time_per_iteration": 2.748460531234741 + }, + { + "auxiliary_loss_clip": 0.01062471, + "auxiliary_loss_mlp": 0.0074922, + "balance_loss_clip": 1.03077531, + "balance_loss_mlp": 1.00027347, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 3.1625839017194424, + "language_loss": 0.72895503, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74707186, + "num_input_tokens_seen": 283844820, + "step": 13153, + "time_per_iteration": 2.601700782775879 + }, + { + "auxiliary_loss_clip": 0.01098897, + "auxiliary_loss_mlp": 0.01024712, + "balance_loss_clip": 1.03370452, + "balance_loss_mlp": 1.01339352, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.5302883231446058, + "language_loss": 0.7886554, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.80989146, + "num_input_tokens_seen": 283862870, + "step": 13154, + "time_per_iteration": 2.518211841583252 + }, + { + "auxiliary_loss_clip": 0.0109163, + "auxiliary_loss_mlp": 0.01024439, + "balance_loss_clip": 1.03353417, + "balance_loss_mlp": 1.01180899, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 2.207308427717594, + "language_loss": 0.70079911, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72195983, + "num_input_tokens_seen": 283882405, + "step": 13155, + "time_per_iteration": 2.5383987426757812 + }, + { + "auxiliary_loss_clip": 0.01098074, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.0332588, + "balance_loss_mlp": 1.01753402, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 3.420316045545355, + "language_loss": 0.76826948, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.78953928, + "num_input_tokens_seen": 283902070, + "step": 13156, + "time_per_iteration": 2.5175588130950928 + }, + { + "auxiliary_loss_clip": 0.01063582, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.0312314, + "balance_loss_mlp": 1.02222204, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 2.0202811250079096, + "language_loss": 0.65316415, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67413545, + "num_input_tokens_seen": 283924100, + "step": 13157, + "time_per_iteration": 2.7160067558288574 + }, + { + "auxiliary_loss_clip": 0.01084717, + "auxiliary_loss_mlp": 0.01035413, + "balance_loss_clip": 1.0315311, + "balance_loss_mlp": 1.02322412, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.8055450065716712, + "language_loss": 0.73914611, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76034743, + "num_input_tokens_seen": 283944955, + "step": 13158, + "time_per_iteration": 2.61094331741333 + }, + { + "auxiliary_loss_clip": 0.01083777, + "auxiliary_loss_mlp": 0.01025577, + "balance_loss_clip": 1.03255081, + "balance_loss_mlp": 1.0154388, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.113368457786046, + "language_loss": 0.670977, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69207054, + "num_input_tokens_seen": 283963125, + "step": 13159, + "time_per_iteration": 4.051636457443237 + }, + { + "auxiliary_loss_clip": 0.010852, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.03340149, + "balance_loss_mlp": 1.02009702, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.7940506669403158, + "language_loss": 0.66777694, + "learning_rate": 4.399686733077206e-07, + "loss": 0.68893945, + "num_input_tokens_seen": 283982850, + "step": 13160, + "time_per_iteration": 2.570692539215088 + }, + { + "auxiliary_loss_clip": 0.01071753, + "auxiliary_loss_mlp": 0.01026267, + "balance_loss_clip": 1.03144789, + "balance_loss_mlp": 1.01674223, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 2.5735293391792746, + "language_loss": 0.72731954, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.74829972, + "num_input_tokens_seen": 283998275, + "step": 13161, + "time_per_iteration": 2.567274808883667 + }, + { + "auxiliary_loss_clip": 0.0106779, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.0329417, + "balance_loss_mlp": 1.01762474, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 1.7835666152917482, + "language_loss": 0.73118389, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75215322, + "num_input_tokens_seen": 284018750, + "step": 13162, + "time_per_iteration": 2.6813836097717285 + }, + { + "auxiliary_loss_clip": 0.01076656, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.03428173, + "balance_loss_mlp": 1.01831973, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 6.017806154014291, + "language_loss": 0.71895581, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74001551, + "num_input_tokens_seen": 284037850, + "step": 13163, + "time_per_iteration": 2.602478504180908 + }, + { + "auxiliary_loss_clip": 0.01059358, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.02946699, + "balance_loss_mlp": 1.01747215, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 12.161784496698397, + "language_loss": 0.69919157, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.72008157, + "num_input_tokens_seen": 284056380, + "step": 13164, + "time_per_iteration": 2.656808853149414 + }, + { + "auxiliary_loss_clip": 0.01060688, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.03320801, + "balance_loss_mlp": 1.0197953, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 2.4581874761233857, + "language_loss": 0.66796839, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68888319, + "num_input_tokens_seen": 284074945, + "step": 13165, + "time_per_iteration": 2.6599843502044678 + }, + { + "auxiliary_loss_clip": 0.01048904, + "auxiliary_loss_mlp": 0.01024235, + "balance_loss_clip": 1.03188848, + "balance_loss_mlp": 1.01394725, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 2.218900261320024, + "language_loss": 0.72395402, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74468541, + "num_input_tokens_seen": 284092070, + "step": 13166, + "time_per_iteration": 2.6743686199188232 + }, + { + "auxiliary_loss_clip": 0.01096934, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.03320718, + "balance_loss_mlp": 1.02052653, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.5800472157440657, + "language_loss": 0.77437699, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79567075, + "num_input_tokens_seen": 284112255, + "step": 13167, + "time_per_iteration": 2.5615956783294678 + }, + { + "auxiliary_loss_clip": 0.01064371, + "auxiliary_loss_mlp": 0.01028136, + "balance_loss_clip": 1.03280544, + "balance_loss_mlp": 1.01769328, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.5747254593975364, + "language_loss": 0.84327459, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86419964, + "num_input_tokens_seen": 284132330, + "step": 13168, + "time_per_iteration": 2.6821861267089844 + }, + { + "auxiliary_loss_clip": 0.01099331, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.03467751, + "balance_loss_mlp": 1.01611304, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 2.839988980740989, + "language_loss": 0.72366202, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74492681, + "num_input_tokens_seen": 284150640, + "step": 13169, + "time_per_iteration": 4.044264316558838 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.03455007, + "balance_loss_mlp": 1.02287149, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 3.5595181971141607, + "language_loss": 0.66896331, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69031429, + "num_input_tokens_seen": 284171910, + "step": 13170, + "time_per_iteration": 2.674539804458618 + }, + { + "auxiliary_loss_clip": 0.01085896, + "auxiliary_loss_mlp": 0.01023407, + "balance_loss_clip": 1.031829, + "balance_loss_mlp": 1.0128634, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.606139990035537, + "language_loss": 0.7059015, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72699451, + "num_input_tokens_seen": 284191340, + "step": 13171, + "time_per_iteration": 2.6668148040771484 + }, + { + "auxiliary_loss_clip": 0.01084669, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.03221333, + "balance_loss_mlp": 1.0159241, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 2.7969017185467098, + "language_loss": 0.67086905, + "learning_rate": 4.370484207842553e-07, + "loss": 0.69198531, + "num_input_tokens_seen": 284212495, + "step": 13172, + "time_per_iteration": 2.668419361114502 + }, + { + "auxiliary_loss_clip": 0.01070787, + "auxiliary_loss_mlp": 0.01033727, + "balance_loss_clip": 1.03219604, + "balance_loss_mlp": 1.02235448, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 2.356813467032105, + "language_loss": 0.79567659, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81672168, + "num_input_tokens_seen": 284230825, + "step": 13173, + "time_per_iteration": 2.618227481842041 + }, + { + "auxiliary_loss_clip": 0.0105685, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.0298723, + "balance_loss_mlp": 1.02135301, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.9524404935403339, + "language_loss": 0.76371783, + "learning_rate": 4.365625413419365e-07, + "loss": 0.78461635, + "num_input_tokens_seen": 284250365, + "step": 13174, + "time_per_iteration": 2.7292912006378174 + }, + { + "auxiliary_loss_clip": 0.01068579, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.02911985, + "balance_loss_mlp": 1.02195477, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.8243831025151833, + "language_loss": 0.71584994, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73686481, + "num_input_tokens_seen": 284269635, + "step": 13175, + "time_per_iteration": 2.6636598110198975 + }, + { + "auxiliary_loss_clip": 0.01084374, + "auxiliary_loss_mlp": 0.01027087, + "balance_loss_clip": 1.0313282, + "balance_loss_mlp": 1.01592278, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 2.0663044517657876, + "language_loss": 0.59429038, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61540496, + "num_input_tokens_seen": 284288380, + "step": 13176, + "time_per_iteration": 2.525705337524414 + }, + { + "auxiliary_loss_clip": 0.01098884, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.03668106, + "balance_loss_mlp": 1.0211482, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 2.589971538334111, + "language_loss": 0.73541403, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75672269, + "num_input_tokens_seen": 284306920, + "step": 13177, + "time_per_iteration": 2.498584747314453 + }, + { + "auxiliary_loss_clip": 0.01077288, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.03183389, + "balance_loss_mlp": 1.02016985, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 1.8697711501452279, + "language_loss": 0.63846976, + "learning_rate": 4.355914939594174e-07, + "loss": 0.65955859, + "num_input_tokens_seen": 284324700, + "step": 13178, + "time_per_iteration": 2.554647445678711 + }, + { + "auxiliary_loss_clip": 0.01072094, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.03002059, + "balance_loss_mlp": 1.02017963, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.7141973886935544, + "language_loss": 0.68656546, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70758796, + "num_input_tokens_seen": 284345985, + "step": 13179, + "time_per_iteration": 2.654243230819702 + }, + { + "auxiliary_loss_clip": 0.01096248, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.03303158, + "balance_loss_mlp": 1.01475954, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 2.293857651402089, + "language_loss": 0.74154294, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76276457, + "num_input_tokens_seen": 284364475, + "step": 13180, + "time_per_iteration": 2.559100866317749 + }, + { + "auxiliary_loss_clip": 0.01088323, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.03564644, + "balance_loss_mlp": 1.02415931, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.0642562589072053, + "language_loss": 0.81479514, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.83604032, + "num_input_tokens_seen": 284382125, + "step": 13181, + "time_per_iteration": 2.5859012603759766 + }, + { + "auxiliary_loss_clip": 0.01066365, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.03021991, + "balance_loss_mlp": 1.02182341, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.8111156309358916, + "language_loss": 0.77511144, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79610747, + "num_input_tokens_seen": 284401585, + "step": 13182, + "time_per_iteration": 2.5848069190979004 + }, + { + "auxiliary_loss_clip": 0.01082943, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.03366089, + "balance_loss_mlp": 1.02234149, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 1.8094737274925055, + "language_loss": 0.73978436, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76095772, + "num_input_tokens_seen": 284419125, + "step": 13183, + "time_per_iteration": 2.5487606525421143 + }, + { + "auxiliary_loss_clip": 0.01063999, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.03187799, + "balance_loss_mlp": 1.01917493, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.6862585880137708, + "language_loss": 0.68169427, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70265293, + "num_input_tokens_seen": 284440445, + "step": 13184, + "time_per_iteration": 4.266161918640137 + }, + { + "auxiliary_loss_clip": 0.01059523, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.03330851, + "balance_loss_mlp": 1.01919246, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.8951514728041365, + "language_loss": 0.70892388, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72982001, + "num_input_tokens_seen": 284459370, + "step": 13185, + "time_per_iteration": 2.569530963897705 + }, + { + "auxiliary_loss_clip": 0.01083935, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.03383815, + "balance_loss_mlp": 1.01601028, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 2.1109691565851967, + "language_loss": 0.65407556, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.6751889, + "num_input_tokens_seen": 284477525, + "step": 13186, + "time_per_iteration": 2.5400285720825195 + }, + { + "auxiliary_loss_clip": 0.01078502, + "auxiliary_loss_mlp": 0.0102793, + "balance_loss_clip": 1.03238964, + "balance_loss_mlp": 1.01696289, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 1.6468990128425918, + "language_loss": 0.76961935, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79068363, + "num_input_tokens_seen": 284496590, + "step": 13187, + "time_per_iteration": 2.5630075931549072 + }, + { + "auxiliary_loss_clip": 0.01075767, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.03285527, + "balance_loss_mlp": 1.01849973, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.2940894610496785, + "language_loss": 0.72688723, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74793822, + "num_input_tokens_seen": 284511470, + "step": 13188, + "time_per_iteration": 2.553344488143921 + }, + { + "auxiliary_loss_clip": 0.01098719, + "auxiliary_loss_mlp": 0.00749641, + "balance_loss_clip": 1.0333451, + "balance_loss_mlp": 1.00023055, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 2.2508873548382304, + "language_loss": 0.63139087, + "learning_rate": 4.329260095357725e-07, + "loss": 0.64987445, + "num_input_tokens_seen": 284531125, + "step": 13189, + "time_per_iteration": 2.503720998764038 + }, + { + "auxiliary_loss_clip": 0.01051113, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.0311532, + "balance_loss_mlp": 1.01930857, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 2.7920617055662547, + "language_loss": 0.72586107, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74666727, + "num_input_tokens_seen": 284549340, + "step": 13190, + "time_per_iteration": 2.646250009536743 + }, + { + "auxiliary_loss_clip": 0.01081286, + "auxiliary_loss_mlp": 0.01025507, + "balance_loss_clip": 1.03294265, + "balance_loss_mlp": 1.01613092, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.8772074691424936, + "language_loss": 0.73135787, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75242579, + "num_input_tokens_seen": 284567060, + "step": 13191, + "time_per_iteration": 4.14415168762207 + }, + { + "auxiliary_loss_clip": 0.01086046, + "auxiliary_loss_mlp": 0.01036125, + "balance_loss_clip": 1.03309584, + "balance_loss_mlp": 1.02487755, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.6887619029752299, + "language_loss": 0.69295084, + "learning_rate": 4.322003066198219e-07, + "loss": 0.7141726, + "num_input_tokens_seen": 284586600, + "step": 13192, + "time_per_iteration": 2.63181471824646 + }, + { + "auxiliary_loss_clip": 0.01061258, + "auxiliary_loss_mlp": 0.01034075, + "balance_loss_clip": 1.03133154, + "balance_loss_mlp": 1.02285707, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.9887467324302415, + "language_loss": 0.75059414, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77154744, + "num_input_tokens_seen": 284605715, + "step": 13193, + "time_per_iteration": 2.652920961380005 + }, + { + "auxiliary_loss_clip": 0.01085856, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.0346508, + "balance_loss_mlp": 1.02000165, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.4502592117004582, + "language_loss": 0.7212702, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74245054, + "num_input_tokens_seen": 284628540, + "step": 13194, + "time_per_iteration": 2.6566317081451416 + }, + { + "auxiliary_loss_clip": 0.01101996, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.03500366, + "balance_loss_mlp": 1.01936293, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 1.8129285301113436, + "language_loss": 0.70204055, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72337079, + "num_input_tokens_seen": 284646040, + "step": 13195, + "time_per_iteration": 2.521878480911255 + }, + { + "auxiliary_loss_clip": 0.01047423, + "auxiliary_loss_mlp": 0.01025016, + "balance_loss_clip": 1.03313196, + "balance_loss_mlp": 1.01363182, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 3.221887330002134, + "language_loss": 0.77529228, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79601669, + "num_input_tokens_seen": 284665110, + "step": 13196, + "time_per_iteration": 2.708508253097534 + }, + { + "auxiliary_loss_clip": 0.01067964, + "auxiliary_loss_mlp": 0.01038201, + "balance_loss_clip": 1.03460836, + "balance_loss_mlp": 1.02756715, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.5610824242191708, + "language_loss": 0.68905795, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71011961, + "num_input_tokens_seen": 284686515, + "step": 13197, + "time_per_iteration": 2.7611870765686035 + }, + { + "auxiliary_loss_clip": 0.01085017, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.03271663, + "balance_loss_mlp": 1.01884615, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 2.146663657236616, + "language_loss": 0.65033162, + "learning_rate": 4.30750506215646e-07, + "loss": 0.67148018, + "num_input_tokens_seen": 284707300, + "step": 13198, + "time_per_iteration": 2.6555330753326416 + }, + { + "auxiliary_loss_clip": 0.01048638, + "auxiliary_loss_mlp": 0.01039753, + "balance_loss_clip": 1.03198779, + "balance_loss_mlp": 1.02659225, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.179456951074488, + "language_loss": 0.72387213, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74475604, + "num_input_tokens_seen": 284723545, + "step": 13199, + "time_per_iteration": 4.106444835662842 + }, + { + "auxiliary_loss_clip": 0.01066272, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.03003657, + "balance_loss_mlp": 1.01897168, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 1.8690835043118676, + "language_loss": 0.80777705, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82874328, + "num_input_tokens_seen": 284742650, + "step": 13200, + "time_per_iteration": 2.6591196060180664 + }, + { + "auxiliary_loss_clip": 0.01085829, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.03442955, + "balance_loss_mlp": 1.01923454, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 5.88806361065971, + "language_loss": 0.7746352, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79579079, + "num_input_tokens_seen": 284760955, + "step": 13201, + "time_per_iteration": 2.585472345352173 + }, + { + "auxiliary_loss_clip": 0.01095468, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.03254616, + "balance_loss_mlp": 1.02028823, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.6028914678968798, + "language_loss": 0.670569, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69183356, + "num_input_tokens_seen": 284780745, + "step": 13202, + "time_per_iteration": 2.5609021186828613 + }, + { + "auxiliary_loss_clip": 0.01087797, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.03447819, + "balance_loss_mlp": 1.0192548, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.9992684369871736, + "language_loss": 0.7493279, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77051485, + "num_input_tokens_seen": 284799000, + "step": 13203, + "time_per_iteration": 2.550478458404541 + }, + { + "auxiliary_loss_clip": 0.01043579, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.03224564, + "balance_loss_mlp": 1.01899922, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 3.1042833653876527, + "language_loss": 0.66364855, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68438721, + "num_input_tokens_seen": 284817450, + "step": 13204, + "time_per_iteration": 2.7692129611968994 + }, + { + "auxiliary_loss_clip": 0.01037599, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.03001416, + "balance_loss_mlp": 1.01779222, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.6193854596236124, + "language_loss": 0.79644835, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81710696, + "num_input_tokens_seen": 284838865, + "step": 13205, + "time_per_iteration": 2.7907207012176514 + }, + { + "auxiliary_loss_clip": 0.01058454, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.02926707, + "balance_loss_mlp": 1.01576948, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.8613778993643222, + "language_loss": 0.77646482, + "learning_rate": 4.28820771692858e-07, + "loss": 0.79732329, + "num_input_tokens_seen": 284857975, + "step": 13206, + "time_per_iteration": 2.668447494506836 + }, + { + "auxiliary_loss_clip": 0.01071244, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.03254747, + "balance_loss_mlp": 1.02154469, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 2.109860398864022, + "language_loss": 0.78808755, + "learning_rate": 4.285798228882456e-07, + "loss": 0.80914962, + "num_input_tokens_seen": 284877145, + "step": 13207, + "time_per_iteration": 2.6075925827026367 + }, + { + "auxiliary_loss_clip": 0.01063798, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.03431869, + "balance_loss_mlp": 1.01964664, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 2.290955059721334, + "language_loss": 0.83886361, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.85981047, + "num_input_tokens_seen": 284895560, + "step": 13208, + "time_per_iteration": 2.663466453552246 + }, + { + "auxiliary_loss_clip": 0.00984647, + "auxiliary_loss_mlp": 0.01009593, + "balance_loss_clip": 1.00420618, + "balance_loss_mlp": 1.0087347, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7264300268765118, + "language_loss": 0.58342379, + "learning_rate": 4.280981040533875e-07, + "loss": 0.6033662, + "num_input_tokens_seen": 284963135, + "step": 13209, + "time_per_iteration": 4.768787384033203 + }, + { + "auxiliary_loss_clip": 0.01059524, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.03116632, + "balance_loss_mlp": 1.0170244, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 2.2597973819753046, + "language_loss": 0.63516128, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.65604639, + "num_input_tokens_seen": 284981755, + "step": 13210, + "time_per_iteration": 2.636502742767334 + }, + { + "auxiliary_loss_clip": 0.01082505, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.03385687, + "balance_loss_mlp": 1.02304637, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5109917783589975, + "language_loss": 0.69300061, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.7141633, + "num_input_tokens_seen": 285003060, + "step": 13211, + "time_per_iteration": 2.7492778301239014 + }, + { + "auxiliary_loss_clip": 0.01088505, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.03338325, + "balance_loss_mlp": 1.02431917, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.5855157143051974, + "language_loss": 0.72309279, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.7443409, + "num_input_tokens_seen": 285021640, + "step": 13212, + "time_per_iteration": 2.571389675140381 + }, + { + "auxiliary_loss_clip": 0.01082812, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.03162563, + "balance_loss_mlp": 1.01658154, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.6797503989893503, + "language_loss": 0.80759227, + "learning_rate": 4.271353817368246e-07, + "loss": 0.8286925, + "num_input_tokens_seen": 285040490, + "step": 13213, + "time_per_iteration": 2.5595757961273193 + }, + { + "auxiliary_loss_clip": 0.01091647, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.03517437, + "balance_loss_mlp": 1.01828122, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.0800762430068334, + "language_loss": 0.67960167, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70082355, + "num_input_tokens_seen": 285059270, + "step": 13214, + "time_per_iteration": 2.5723044872283936 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.03376639, + "balance_loss_mlp": 1.01694393, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 1.9287072208076252, + "language_loss": 0.72410387, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74534357, + "num_input_tokens_seen": 285075390, + "step": 13215, + "time_per_iteration": 2.502769708633423 + }, + { + "auxiliary_loss_clip": 0.01045176, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.03152215, + "balance_loss_mlp": 1.0194006, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5304464573705359, + "language_loss": 0.78691381, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.80768245, + "num_input_tokens_seen": 285096290, + "step": 13216, + "time_per_iteration": 2.689239501953125 + }, + { + "auxiliary_loss_clip": 0.01081706, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.03267467, + "balance_loss_mlp": 1.0185926, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.5458939685087616, + "language_loss": 0.74028242, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76139545, + "num_input_tokens_seen": 285116020, + "step": 13217, + "time_per_iteration": 2.6179535388946533 + }, + { + "auxiliary_loss_clip": 0.01068544, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.03175461, + "balance_loss_mlp": 1.01722085, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.8376756153049505, + "language_loss": 0.74394774, + "learning_rate": 4.259333208810907e-07, + "loss": 0.7649219, + "num_input_tokens_seen": 285133510, + "step": 13218, + "time_per_iteration": 2.5774965286254883 + }, + { + "auxiliary_loss_clip": 0.01088429, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.03295183, + "balance_loss_mlp": 1.02665424, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 2.2239636103630103, + "language_loss": 0.83164978, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85292345, + "num_input_tokens_seen": 285151690, + "step": 13219, + "time_per_iteration": 2.515735626220703 + }, + { + "auxiliary_loss_clip": 0.0108521, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.03397799, + "balance_loss_mlp": 1.02442217, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 1.8437586704059352, + "language_loss": 0.75586969, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77710462, + "num_input_tokens_seen": 285170485, + "step": 13220, + "time_per_iteration": 2.558588981628418 + }, + { + "auxiliary_loss_clip": 0.01072225, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.0331136, + "balance_loss_mlp": 1.02246857, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.7180447817144597, + "language_loss": 0.72338521, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74444902, + "num_input_tokens_seen": 285191050, + "step": 13221, + "time_per_iteration": 2.704303026199341 + }, + { + "auxiliary_loss_clip": 0.01088504, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.03608978, + "balance_loss_mlp": 1.01723647, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 2.2453149283039138, + "language_loss": 0.75133765, + "learning_rate": 4.249727465395634e-07, + "loss": 0.77250063, + "num_input_tokens_seen": 285208750, + "step": 13222, + "time_per_iteration": 2.5812854766845703 + }, + { + "auxiliary_loss_clip": 0.01004724, + "auxiliary_loss_mlp": 0.01002244, + "balance_loss_clip": 1.00493503, + "balance_loss_mlp": 1.00119519, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7799085448257884, + "language_loss": 0.67037845, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69044816, + "num_input_tokens_seen": 285264605, + "step": 13223, + "time_per_iteration": 2.99989914894104 + }, + { + "auxiliary_loss_clip": 0.0108501, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.03182888, + "balance_loss_mlp": 1.01850152, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 2.251817749809158, + "language_loss": 0.71286631, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73401695, + "num_input_tokens_seen": 285283940, + "step": 13224, + "time_per_iteration": 4.037494659423828 + }, + { + "auxiliary_loss_clip": 0.01024333, + "auxiliary_loss_mlp": 0.00997165, + "balance_loss_clip": 1.00450528, + "balance_loss_mlp": 0.99625874, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6683485331286684, + "language_loss": 0.55010009, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57031506, + "num_input_tokens_seen": 285349525, + "step": 13225, + "time_per_iteration": 3.126319646835327 + }, + { + "auxiliary_loss_clip": 0.01071676, + "auxiliary_loss_mlp": 0.01024937, + "balance_loss_clip": 1.03005934, + "balance_loss_mlp": 1.01437497, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 2.08786012489226, + "language_loss": 0.65434194, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67530811, + "num_input_tokens_seen": 285367355, + "step": 13226, + "time_per_iteration": 2.5949184894561768 + }, + { + "auxiliary_loss_clip": 0.01054969, + "auxiliary_loss_mlp": 0.01037427, + "balance_loss_clip": 1.03378749, + "balance_loss_mlp": 1.02632236, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 3.159486773877526, + "language_loss": 0.7023803, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72330427, + "num_input_tokens_seen": 285386190, + "step": 13227, + "time_per_iteration": 2.763690710067749 + }, + { + "auxiliary_loss_clip": 0.01049828, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.02936697, + "balance_loss_mlp": 1.01678801, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.691253691779206, + "language_loss": 0.69596004, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71672571, + "num_input_tokens_seen": 285406150, + "step": 13228, + "time_per_iteration": 2.7386562824249268 + }, + { + "auxiliary_loss_clip": 0.01047726, + "auxiliary_loss_mlp": 0.01043973, + "balance_loss_clip": 1.02918196, + "balance_loss_mlp": 1.0311048, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.725795086201759, + "language_loss": 0.70937181, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73028886, + "num_input_tokens_seen": 285429900, + "step": 13229, + "time_per_iteration": 2.849320650100708 + }, + { + "auxiliary_loss_clip": 0.01092715, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.03672338, + "balance_loss_mlp": 1.0199337, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.9206679713614356, + "language_loss": 0.71450651, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.73574823, + "num_input_tokens_seen": 285452555, + "step": 13230, + "time_per_iteration": 2.688788414001465 + }, + { + "auxiliary_loss_clip": 0.01003744, + "auxiliary_loss_mlp": 0.00999575, + "balance_loss_clip": 1.00444198, + "balance_loss_mlp": 0.99866909, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.9018127242376478, + "language_loss": 0.63564718, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65568036, + "num_input_tokens_seen": 285515700, + "step": 13231, + "time_per_iteration": 3.1551764011383057 + }, + { + "auxiliary_loss_clip": 0.01074791, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.03203869, + "balance_loss_mlp": 1.01757121, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.8967689221848976, + "language_loss": 0.69954813, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72058094, + "num_input_tokens_seen": 285533910, + "step": 13232, + "time_per_iteration": 4.00319242477417 + }, + { + "auxiliary_loss_clip": 0.01087106, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.03292382, + "balance_loss_mlp": 1.01751447, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 2.2520719866644074, + "language_loss": 0.7811389, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80229867, + "num_input_tokens_seen": 285554080, + "step": 13233, + "time_per_iteration": 2.5650341510772705 + }, + { + "auxiliary_loss_clip": 0.01088938, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.0337255, + "balance_loss_mlp": 1.01917815, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 2.1002498929880598, + "language_loss": 0.78930718, + "learning_rate": 4.220967594613769e-07, + "loss": 0.8105011, + "num_input_tokens_seen": 285572325, + "step": 13234, + "time_per_iteration": 2.580301523208618 + }, + { + "auxiliary_loss_clip": 0.01078215, + "auxiliary_loss_mlp": 0.00749348, + "balance_loss_clip": 1.03486013, + "balance_loss_mlp": 1.00024247, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 2.4655881853289596, + "language_loss": 0.70431143, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72258699, + "num_input_tokens_seen": 285589770, + "step": 13235, + "time_per_iteration": 2.5799691677093506 + }, + { + "auxiliary_loss_clip": 0.01058156, + "auxiliary_loss_mlp": 0.01027579, + "balance_loss_clip": 1.03170204, + "balance_loss_mlp": 1.01590276, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 2.015129041661635, + "language_loss": 0.68032575, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70118314, + "num_input_tokens_seen": 285610065, + "step": 13236, + "time_per_iteration": 2.670621871948242 + }, + { + "auxiliary_loss_clip": 0.01045689, + "auxiliary_loss_mlp": 0.01025817, + "balance_loss_clip": 1.03117287, + "balance_loss_mlp": 1.01448619, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.6133898714258454, + "language_loss": 0.75043988, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77115488, + "num_input_tokens_seen": 285628480, + "step": 13237, + "time_per_iteration": 2.681211471557617 + }, + { + "auxiliary_loss_clip": 0.01088365, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.03507161, + "balance_loss_mlp": 1.02217758, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 2.909632891942786, + "language_loss": 0.71512949, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73635221, + "num_input_tokens_seen": 285647805, + "step": 13238, + "time_per_iteration": 2.5201334953308105 + }, + { + "auxiliary_loss_clip": 0.01083893, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.03225267, + "balance_loss_mlp": 1.01491976, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 2.637557069757887, + "language_loss": 0.7401101, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.76120913, + "num_input_tokens_seen": 285665505, + "step": 13239, + "time_per_iteration": 2.6897575855255127 + }, + { + "auxiliary_loss_clip": 0.01102651, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.0354259, + "balance_loss_mlp": 1.02008355, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.9510492362396967, + "language_loss": 0.69261104, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71395212, + "num_input_tokens_seen": 285685855, + "step": 13240, + "time_per_iteration": 4.048253059387207 + }, + { + "auxiliary_loss_clip": 0.01015788, + "auxiliary_loss_mlp": 0.0100039, + "balance_loss_clip": 1.00540638, + "balance_loss_mlp": 0.99939507, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8909862161986567, + "language_loss": 0.58704066, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60720247, + "num_input_tokens_seen": 285735710, + "step": 13241, + "time_per_iteration": 2.8601014614105225 + }, + { + "auxiliary_loss_clip": 0.0106872, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03527975, + "balance_loss_mlp": 1.01903474, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 1.777826657650411, + "language_loss": 0.63978553, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66076505, + "num_input_tokens_seen": 285757045, + "step": 13242, + "time_per_iteration": 2.7651383876800537 + }, + { + "auxiliary_loss_clip": 0.0109936, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03450847, + "balance_loss_mlp": 1.0195893, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 1.9533249608060885, + "language_loss": 0.76001549, + "learning_rate": 4.199454226296526e-07, + "loss": 0.7813226, + "num_input_tokens_seen": 285776050, + "step": 13243, + "time_per_iteration": 2.544447422027588 + }, + { + "auxiliary_loss_clip": 0.0107065, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.03455901, + "balance_loss_mlp": 1.01831818, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.7459056059584157, + "language_loss": 0.79928112, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.82028902, + "num_input_tokens_seen": 285796830, + "step": 13244, + "time_per_iteration": 2.6624388694763184 + }, + { + "auxiliary_loss_clip": 0.01089805, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.03252029, + "balance_loss_mlp": 1.01698232, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.1751331373040372, + "language_loss": 0.6840381, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70522624, + "num_input_tokens_seen": 285814755, + "step": 13245, + "time_per_iteration": 2.493844747543335 + }, + { + "auxiliary_loss_clip": 0.01076372, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.03222108, + "balance_loss_mlp": 1.0202105, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 4.251041723448844, + "language_loss": 0.7914691, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81254697, + "num_input_tokens_seen": 285834255, + "step": 13246, + "time_per_iteration": 2.6137948036193848 + }, + { + "auxiliary_loss_clip": 0.01078318, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.03309357, + "balance_loss_mlp": 1.01683688, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 2.0537117167401675, + "language_loss": 0.66153526, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.6826002, + "num_input_tokens_seen": 285853540, + "step": 13247, + "time_per_iteration": 2.616403818130493 + }, + { + "auxiliary_loss_clip": 0.01074859, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.03281534, + "balance_loss_mlp": 1.01722217, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 1.79630461773789, + "language_loss": 0.71524042, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73626482, + "num_input_tokens_seen": 285872705, + "step": 13248, + "time_per_iteration": 2.640376329421997 + }, + { + "auxiliary_loss_clip": 0.01082263, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.03479743, + "balance_loss_mlp": 1.01827049, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 1.9427561179954969, + "language_loss": 0.76172924, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.7828536, + "num_input_tokens_seen": 285890290, + "step": 13249, + "time_per_iteration": 2.604990243911743 + }, + { + "auxiliary_loss_clip": 0.01072076, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.03518963, + "balance_loss_mlp": 1.01959383, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.370284755532375, + "language_loss": 0.61349213, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63451856, + "num_input_tokens_seen": 285909190, + "step": 13250, + "time_per_iteration": 4.1154563426971436 + }, + { + "auxiliary_loss_clip": 0.01075884, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.03268242, + "balance_loss_mlp": 1.01582968, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.020297054274996, + "language_loss": 0.7181685, + "learning_rate": 4.180371972938206e-07, + "loss": 0.73919725, + "num_input_tokens_seen": 285927570, + "step": 13251, + "time_per_iteration": 2.703719139099121 + }, + { + "auxiliary_loss_clip": 0.0110316, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.03581095, + "balance_loss_mlp": 1.01688802, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 1.9362878265626813, + "language_loss": 0.72773266, + "learning_rate": 4.177989389787624e-07, + "loss": 0.74905956, + "num_input_tokens_seen": 285945810, + "step": 13252, + "time_per_iteration": 2.541271448135376 + }, + { + "auxiliary_loss_clip": 0.01095481, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.03409386, + "balance_loss_mlp": 1.01868689, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.6261295468866142, + "language_loss": 0.66078281, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68203187, + "num_input_tokens_seen": 285964235, + "step": 13253, + "time_per_iteration": 2.572338104248047 + }, + { + "auxiliary_loss_clip": 0.01073031, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.03598523, + "balance_loss_mlp": 1.02314651, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 3.8328200752795363, + "language_loss": 0.68016899, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.70124996, + "num_input_tokens_seen": 285983710, + "step": 13254, + "time_per_iteration": 2.6451196670532227 + }, + { + "auxiliary_loss_clip": 0.01086838, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.03304362, + "balance_loss_mlp": 1.02060843, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.8906036911069235, + "language_loss": 0.69281018, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71399701, + "num_input_tokens_seen": 286003425, + "step": 13255, + "time_per_iteration": 2.600472927093506 + }, + { + "auxiliary_loss_clip": 0.01096037, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.03265786, + "balance_loss_mlp": 1.01987553, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 1.8112599150791053, + "language_loss": 0.79380286, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81506836, + "num_input_tokens_seen": 286020130, + "step": 13256, + "time_per_iteration": 2.5242345333099365 + }, + { + "auxiliary_loss_clip": 0.01086148, + "auxiliary_loss_mlp": 0.01027366, + "balance_loss_clip": 1.03440475, + "balance_loss_mlp": 1.01616669, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.7879029823987482, + "language_loss": 0.65817398, + "learning_rate": 4.166085475424315e-07, + "loss": 0.67930913, + "num_input_tokens_seen": 286040230, + "step": 13257, + "time_per_iteration": 2.569720506668091 + }, + { + "auxiliary_loss_clip": 0.01078563, + "auxiliary_loss_mlp": 0.01034545, + "balance_loss_clip": 1.03325927, + "balance_loss_mlp": 1.02303493, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 1.9118365358252594, + "language_loss": 0.72157192, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74270308, + "num_input_tokens_seen": 286059475, + "step": 13258, + "time_per_iteration": 2.6820695400238037 + }, + { + "auxiliary_loss_clip": 0.01089399, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.03346086, + "balance_loss_mlp": 1.02131295, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.7194206888456935, + "language_loss": 0.687599, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.70882422, + "num_input_tokens_seen": 286077820, + "step": 13259, + "time_per_iteration": 2.6078920364379883 + }, + { + "auxiliary_loss_clip": 0.01084459, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.03283, + "balance_loss_mlp": 1.02002275, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.7509572554440216, + "language_loss": 0.73507833, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75622392, + "num_input_tokens_seen": 286097285, + "step": 13260, + "time_per_iteration": 2.6037445068359375 + }, + { + "auxiliary_loss_clip": 0.01069126, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.02989674, + "balance_loss_mlp": 1.01823616, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.6873071182859267, + "language_loss": 0.78331572, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80429661, + "num_input_tokens_seen": 286116000, + "step": 13261, + "time_per_iteration": 2.622814655303955 + }, + { + "auxiliary_loss_clip": 0.01084751, + "auxiliary_loss_mlp": 0.01030756, + "balance_loss_clip": 1.03442121, + "balance_loss_mlp": 1.02123094, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.539331899314594, + "language_loss": 0.76302624, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78418124, + "num_input_tokens_seen": 286135110, + "step": 13262, + "time_per_iteration": 2.590702533721924 + }, + { + "auxiliary_loss_clip": 0.01071214, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.03627515, + "balance_loss_mlp": 1.01784849, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 4.001100241379787, + "language_loss": 0.70581818, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72683549, + "num_input_tokens_seen": 286152835, + "step": 13263, + "time_per_iteration": 2.6543450355529785 + }, + { + "auxiliary_loss_clip": 0.01091846, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.03357196, + "balance_loss_mlp": 1.02101803, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 2.635902798123518, + "language_loss": 0.70991611, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73117125, + "num_input_tokens_seen": 286171785, + "step": 13264, + "time_per_iteration": 4.072413444519043 + }, + { + "auxiliary_loss_clip": 0.01096694, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.03449059, + "balance_loss_mlp": 1.02151322, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.7360413364298575, + "language_loss": 0.77332836, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79461706, + "num_input_tokens_seen": 286190420, + "step": 13265, + "time_per_iteration": 2.493380308151245 + }, + { + "auxiliary_loss_clip": 0.01067488, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.03400767, + "balance_loss_mlp": 1.01629138, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 2.357086316181963, + "language_loss": 0.75941008, + "learning_rate": 4.144696263830285e-07, + "loss": 0.78035688, + "num_input_tokens_seen": 286210105, + "step": 13266, + "time_per_iteration": 2.6775777339935303 + }, + { + "auxiliary_loss_clip": 0.01070499, + "auxiliary_loss_mlp": 0.01024425, + "balance_loss_clip": 1.03090954, + "balance_loss_mlp": 1.01360083, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 1.5756484947987333, + "language_loss": 0.83845335, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.8594026, + "num_input_tokens_seen": 286228180, + "step": 13267, + "time_per_iteration": 2.5803780555725098 + }, + { + "auxiliary_loss_clip": 0.01086375, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.0331254, + "balance_loss_mlp": 1.02131033, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.809646352524198, + "language_loss": 0.76473278, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78592348, + "num_input_tokens_seen": 286247305, + "step": 13268, + "time_per_iteration": 2.5419225692749023 + }, + { + "auxiliary_loss_clip": 0.01098368, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.03387415, + "balance_loss_mlp": 1.01802075, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.6552044390219092, + "language_loss": 0.77891451, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.80018908, + "num_input_tokens_seen": 286268145, + "step": 13269, + "time_per_iteration": 2.5669310092926025 + }, + { + "auxiliary_loss_clip": 0.01076084, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.02967024, + "balance_loss_mlp": 1.02794755, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.665558451581459, + "language_loss": 0.82121116, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84236789, + "num_input_tokens_seen": 286286775, + "step": 13270, + "time_per_iteration": 2.5134437084198 + }, + { + "auxiliary_loss_clip": 0.01056824, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.03231502, + "balance_loss_mlp": 1.02146077, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.871331412185605, + "language_loss": 0.59607506, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.6169858, + "num_input_tokens_seen": 286305590, + "step": 13271, + "time_per_iteration": 2.6292884349823 + }, + { + "auxiliary_loss_clip": 0.01080006, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.03375483, + "balance_loss_mlp": 1.02359951, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.687936647004167, + "language_loss": 0.73459649, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75574541, + "num_input_tokens_seen": 286328050, + "step": 13272, + "time_per_iteration": 4.123977899551392 + }, + { + "auxiliary_loss_clip": 0.0103134, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.02895927, + "balance_loss_mlp": 1.01998889, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 2.1221111843346154, + "language_loss": 0.71628278, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73691809, + "num_input_tokens_seen": 286345265, + "step": 13273, + "time_per_iteration": 2.6920015811920166 + }, + { + "auxiliary_loss_clip": 0.01075067, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.03307772, + "balance_loss_mlp": 1.02319336, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 2.2093187623383113, + "language_loss": 0.75501013, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77611333, + "num_input_tokens_seen": 286364465, + "step": 13274, + "time_per_iteration": 2.5782387256622314 + }, + { + "auxiliary_loss_clip": 0.0104358, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.02957582, + "balance_loss_mlp": 1.01955783, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.4344834003455709, + "language_loss": 0.77770257, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79843748, + "num_input_tokens_seen": 286385565, + "step": 13275, + "time_per_iteration": 2.7213010787963867 + }, + { + "auxiliary_loss_clip": 0.01089275, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.03395891, + "balance_loss_mlp": 1.02328813, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 1.892519618087325, + "language_loss": 0.64478046, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66602576, + "num_input_tokens_seen": 286403950, + "step": 13276, + "time_per_iteration": 2.623075008392334 + }, + { + "auxiliary_loss_clip": 0.01055666, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.03066647, + "balance_loss_mlp": 1.01881683, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.7733128273624617, + "language_loss": 0.60939062, + "learning_rate": 4.118620036501945e-07, + "loss": 0.63024533, + "num_input_tokens_seen": 286426160, + "step": 13277, + "time_per_iteration": 2.7649195194244385 + }, + { + "auxiliary_loss_clip": 0.01070832, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.03282535, + "balance_loss_mlp": 1.02094996, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 3.039904656002855, + "language_loss": 0.79386604, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81490433, + "num_input_tokens_seen": 286446610, + "step": 13278, + "time_per_iteration": 2.669285535812378 + }, + { + "auxiliary_loss_clip": 0.01077574, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.03338063, + "balance_loss_mlp": 1.02411473, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.888051495755792, + "language_loss": 0.63655305, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65768766, + "num_input_tokens_seen": 286465460, + "step": 13279, + "time_per_iteration": 4.085223913192749 + }, + { + "auxiliary_loss_clip": 0.01082553, + "auxiliary_loss_mlp": 0.01024312, + "balance_loss_clip": 1.03250515, + "balance_loss_mlp": 1.01398873, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 2.1008953171439955, + "language_loss": 0.7103101, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73137879, + "num_input_tokens_seen": 286485720, + "step": 13280, + "time_per_iteration": 2.5887339115142822 + }, + { + "auxiliary_loss_clip": 0.01056186, + "auxiliary_loss_mlp": 0.0103671, + "balance_loss_clip": 1.03163528, + "balance_loss_mlp": 1.02285206, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 2.209851343338304, + "language_loss": 0.62829411, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64922309, + "num_input_tokens_seen": 286507465, + "step": 13281, + "time_per_iteration": 2.710651397705078 + }, + { + "auxiliary_loss_clip": 0.01086304, + "auxiliary_loss_mlp": 0.01033393, + "balance_loss_clip": 1.03181219, + "balance_loss_mlp": 1.02156711, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 1.809510766158866, + "language_loss": 0.8065455, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.8277424, + "num_input_tokens_seen": 286526345, + "step": 13282, + "time_per_iteration": 2.5766918659210205 + }, + { + "auxiliary_loss_clip": 0.01061, + "auxiliary_loss_mlp": 0.00749436, + "balance_loss_clip": 1.02961779, + "balance_loss_mlp": 1.00025928, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 2.1938722211897543, + "language_loss": 0.71692932, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73503363, + "num_input_tokens_seen": 286544095, + "step": 13283, + "time_per_iteration": 2.5689144134521484 + }, + { + "auxiliary_loss_clip": 0.01083287, + "auxiliary_loss_mlp": 0.01025391, + "balance_loss_clip": 1.03169847, + "balance_loss_mlp": 1.01502609, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 2.1570613999366937, + "language_loss": 0.73429132, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75537813, + "num_input_tokens_seen": 286560960, + "step": 13284, + "time_per_iteration": 2.536964178085327 + }, + { + "auxiliary_loss_clip": 0.01063412, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.02940595, + "balance_loss_mlp": 1.01911652, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.813832751176554, + "language_loss": 0.70279408, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72372425, + "num_input_tokens_seen": 286579865, + "step": 13285, + "time_per_iteration": 2.659219980239868 + }, + { + "auxiliary_loss_clip": 0.01066609, + "auxiliary_loss_mlp": 0.01026122, + "balance_loss_clip": 1.03041077, + "balance_loss_mlp": 1.01532793, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.8253119531130322, + "language_loss": 0.73957813, + "learning_rate": 4.097339136128437e-07, + "loss": 0.76050544, + "num_input_tokens_seen": 286597295, + "step": 13286, + "time_per_iteration": 2.570255756378174 + }, + { + "auxiliary_loss_clip": 0.0107645, + "auxiliary_loss_mlp": 0.01028574, + "balance_loss_clip": 1.03272498, + "balance_loss_mlp": 1.01771975, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 2.271871790046803, + "language_loss": 0.75146711, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77251726, + "num_input_tokens_seen": 286616270, + "step": 13287, + "time_per_iteration": 2.6133484840393066 + }, + { + "auxiliary_loss_clip": 0.01075143, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.03458285, + "balance_loss_mlp": 1.0203644, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 1.6478356748802996, + "language_loss": 0.61869973, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63976431, + "num_input_tokens_seen": 286638315, + "step": 13288, + "time_per_iteration": 2.6995010375976562 + }, + { + "auxiliary_loss_clip": 0.0108728, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.03508139, + "balance_loss_mlp": 1.01733279, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 1.9203675174520907, + "language_loss": 0.70127714, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72242844, + "num_input_tokens_seen": 286658630, + "step": 13289, + "time_per_iteration": 2.5914435386657715 + }, + { + "auxiliary_loss_clip": 0.01066284, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.03574371, + "balance_loss_mlp": 1.02043629, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 2.947525838108501, + "language_loss": 0.62477309, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64575636, + "num_input_tokens_seen": 286676870, + "step": 13290, + "time_per_iteration": 4.18906831741333 + }, + { + "auxiliary_loss_clip": 0.0109098, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.03542495, + "balance_loss_mlp": 1.01665044, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 2.551368988771108, + "language_loss": 0.71473169, + "learning_rate": 4.08553751558248e-07, + "loss": 0.7359246, + "num_input_tokens_seen": 286694300, + "step": 13291, + "time_per_iteration": 2.6100096702575684 + }, + { + "auxiliary_loss_clip": 0.01060365, + "auxiliary_loss_mlp": 0.0102542, + "balance_loss_clip": 1.03173983, + "balance_loss_mlp": 1.01482236, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 2.284441166369212, + "language_loss": 0.63849199, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65934992, + "num_input_tokens_seen": 286714545, + "step": 13292, + "time_per_iteration": 2.6209914684295654 + }, + { + "auxiliary_loss_clip": 0.01086761, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.03388214, + "balance_loss_mlp": 1.01866698, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 2.0216555628948365, + "language_loss": 0.56223941, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58339959, + "num_input_tokens_seen": 286734525, + "step": 13293, + "time_per_iteration": 2.6555745601654053 + }, + { + "auxiliary_loss_clip": 0.01072727, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.03511786, + "balance_loss_mlp": 1.01933265, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.8746138792756937, + "language_loss": 0.71428061, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73531711, + "num_input_tokens_seen": 286753430, + "step": 13294, + "time_per_iteration": 2.8030030727386475 + }, + { + "auxiliary_loss_clip": 0.01056124, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.02983284, + "balance_loss_mlp": 1.02102947, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 2.342622461353021, + "language_loss": 0.723831, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74471736, + "num_input_tokens_seen": 286771915, + "step": 13295, + "time_per_iteration": 2.6060757637023926 + }, + { + "auxiliary_loss_clip": 0.01062902, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.03467178, + "balance_loss_mlp": 1.01973915, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.8067317772392264, + "language_loss": 0.76476562, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78569847, + "num_input_tokens_seen": 286789835, + "step": 13296, + "time_per_iteration": 2.668292760848999 + }, + { + "auxiliary_loss_clip": 0.00995892, + "auxiliary_loss_mlp": 0.01002807, + "balance_loss_clip": 1.0076654, + "balance_loss_mlp": 1.00175846, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.7049010412759932, + "language_loss": 0.60833961, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62832659, + "num_input_tokens_seen": 286855580, + "step": 13297, + "time_per_iteration": 3.2461161613464355 + }, + { + "auxiliary_loss_clip": 0.01074711, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.03285789, + "balance_loss_mlp": 1.0216527, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.1911185597975025, + "language_loss": 0.70065534, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72172356, + "num_input_tokens_seen": 286874360, + "step": 13298, + "time_per_iteration": 2.5538909435272217 + }, + { + "auxiliary_loss_clip": 0.01059657, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.03246796, + "balance_loss_mlp": 1.02070904, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 1.9317028979524422, + "language_loss": 0.75553179, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77647185, + "num_input_tokens_seen": 286891950, + "step": 13299, + "time_per_iteration": 2.595334768295288 + }, + { + "auxiliary_loss_clip": 0.01068555, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.0305872, + "balance_loss_mlp": 1.01840591, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.7645527093833586, + "language_loss": 0.77381891, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79479611, + "num_input_tokens_seen": 286911725, + "step": 13300, + "time_per_iteration": 2.709429979324341 + }, + { + "auxiliary_loss_clip": 0.01050034, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02839839, + "balance_loss_mlp": 1.0230484, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 1.762558701527738, + "language_loss": 0.63720727, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65807664, + "num_input_tokens_seen": 286931400, + "step": 13301, + "time_per_iteration": 2.657330274581909 + }, + { + "auxiliary_loss_clip": 0.01086578, + "auxiliary_loss_mlp": 0.01034418, + "balance_loss_clip": 1.03411996, + "balance_loss_mlp": 1.02283096, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 2.153971380113537, + "language_loss": 0.71677637, + "learning_rate": 4.059627072173928e-07, + "loss": 0.73798633, + "num_input_tokens_seen": 286949795, + "step": 13302, + "time_per_iteration": 2.579831123352051 + }, + { + "auxiliary_loss_clip": 0.0109948, + "auxiliary_loss_mlp": 0.00749612, + "balance_loss_clip": 1.03372765, + "balance_loss_mlp": 1.00028157, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 2.190810836039716, + "language_loss": 0.83397174, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85246259, + "num_input_tokens_seen": 286968805, + "step": 13303, + "time_per_iteration": 2.614036798477173 + }, + { + "auxiliary_loss_clip": 0.01095288, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.03353119, + "balance_loss_mlp": 1.01856494, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 2.4086281193363446, + "language_loss": 0.58746111, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60869986, + "num_input_tokens_seen": 286990235, + "step": 13304, + "time_per_iteration": 4.03274393081665 + }, + { + "auxiliary_loss_clip": 0.01097817, + "auxiliary_loss_mlp": 0.01026204, + "balance_loss_clip": 1.0322113, + "balance_loss_mlp": 1.01441455, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.5999884106289897, + "language_loss": 0.69277179, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71401203, + "num_input_tokens_seen": 287011060, + "step": 13305, + "time_per_iteration": 2.5934982299804688 + }, + { + "auxiliary_loss_clip": 0.01063193, + "auxiliary_loss_mlp": 0.01025431, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.01544702, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5612197789938276, + "language_loss": 0.6903069, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71119308, + "num_input_tokens_seen": 287029215, + "step": 13306, + "time_per_iteration": 2.6255581378936768 + }, + { + "auxiliary_loss_clip": 0.01087596, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.03411984, + "balance_loss_mlp": 1.02133918, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 2.0723389002078427, + "language_loss": 0.69601989, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71721518, + "num_input_tokens_seen": 287050855, + "step": 13307, + "time_per_iteration": 2.638026475906372 + }, + { + "auxiliary_loss_clip": 0.01073715, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.033077, + "balance_loss_mlp": 1.02394116, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 2.499234705228126, + "language_loss": 0.771478, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.79256415, + "num_input_tokens_seen": 287069915, + "step": 13308, + "time_per_iteration": 2.6299092769622803 + }, + { + "auxiliary_loss_clip": 0.01051396, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.03222132, + "balance_loss_mlp": 1.0216428, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.694216268032632, + "language_loss": 0.79144037, + "learning_rate": 4.0431766816972e-07, + "loss": 0.81229579, + "num_input_tokens_seen": 287091450, + "step": 13309, + "time_per_iteration": 2.7899563312530518 + }, + { + "auxiliary_loss_clip": 0.0102444, + "auxiliary_loss_mlp": 0.0100329, + "balance_loss_clip": 1.00469553, + "balance_loss_mlp": 1.00225282, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9276158421087239, + "language_loss": 0.6473437, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66762102, + "num_input_tokens_seen": 287148365, + "step": 13310, + "time_per_iteration": 3.0333759784698486 + }, + { + "auxiliary_loss_clip": 0.01086311, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.03336585, + "balance_loss_mlp": 1.02366424, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 2.0035764559269893, + "language_loss": 0.82820237, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.84941077, + "num_input_tokens_seen": 287168280, + "step": 13311, + "time_per_iteration": 2.624918222427368 + }, + { + "auxiliary_loss_clip": 0.01088502, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.03483629, + "balance_loss_mlp": 1.02219653, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.868315539458999, + "language_loss": 0.66389948, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68511832, + "num_input_tokens_seen": 287185980, + "step": 13312, + "time_per_iteration": 4.078034400939941 + }, + { + "auxiliary_loss_clip": 0.01103421, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.037251, + "balance_loss_mlp": 1.02015865, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.6136861926687953, + "language_loss": 0.75435746, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77571052, + "num_input_tokens_seen": 287203875, + "step": 13313, + "time_per_iteration": 2.5952908992767334 + }, + { + "auxiliary_loss_clip": 0.01082978, + "auxiliary_loss_mlp": 0.01028733, + "balance_loss_clip": 1.03071237, + "balance_loss_mlp": 1.01726496, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3924175094074627, + "language_loss": 0.75679255, + "learning_rate": 4.031444553532575e-07, + "loss": 0.7779097, + "num_input_tokens_seen": 287226445, + "step": 13314, + "time_per_iteration": 2.722083330154419 + }, + { + "auxiliary_loss_clip": 0.00987277, + "auxiliary_loss_mlp": 0.00999741, + "balance_loss_clip": 1.00883508, + "balance_loss_mlp": 0.99863189, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8295934592754709, + "language_loss": 0.53759336, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55746353, + "num_input_tokens_seen": 287286240, + "step": 13315, + "time_per_iteration": 3.1624951362609863 + }, + { + "auxiliary_loss_clip": 0.01074851, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.03301096, + "balance_loss_mlp": 1.01838195, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 1.5862818657460198, + "language_loss": 0.71198744, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73303026, + "num_input_tokens_seen": 287310265, + "step": 13316, + "time_per_iteration": 2.775210380554199 + }, + { + "auxiliary_loss_clip": 0.01078456, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.03498876, + "balance_loss_mlp": 1.01767814, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 2.003078933670772, + "language_loss": 0.64429379, + "learning_rate": 4.024412542272706e-07, + "loss": 0.66536522, + "num_input_tokens_seen": 287331610, + "step": 13317, + "time_per_iteration": 2.7185299396514893 + }, + { + "auxiliary_loss_clip": 0.01024208, + "auxiliary_loss_mlp": 0.01002799, + "balance_loss_clip": 1.00432074, + "balance_loss_mlp": 1.00190449, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7659770505686126, + "language_loss": 0.5904603, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61073035, + "num_input_tokens_seen": 287394795, + "step": 13318, + "time_per_iteration": 3.171548843383789 + }, + { + "auxiliary_loss_clip": 0.01064057, + "auxiliary_loss_mlp": 0.01021717, + "balance_loss_clip": 1.03333426, + "balance_loss_mlp": 1.01094103, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.8064658401561948, + "language_loss": 0.66355097, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68440878, + "num_input_tokens_seen": 287414595, + "step": 13319, + "time_per_iteration": 4.234827518463135 + }, + { + "auxiliary_loss_clip": 0.01099355, + "auxiliary_loss_mlp": 0.00749525, + "balance_loss_clip": 1.03323913, + "balance_loss_mlp": 1.00023651, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 1.8760135729469136, + "language_loss": 0.74196744, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.7604562, + "num_input_tokens_seen": 287434395, + "step": 13320, + "time_per_iteration": 2.598381757736206 + }, + { + "auxiliary_loss_clip": 0.01086934, + "auxiliary_loss_mlp": 0.01024681, + "balance_loss_clip": 1.03409386, + "balance_loss_mlp": 1.01364851, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 1.905429745802323, + "language_loss": 0.80382013, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82493639, + "num_input_tokens_seen": 287450590, + "step": 13321, + "time_per_iteration": 2.6498711109161377 + }, + { + "auxiliary_loss_clip": 0.01014769, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.0238682, + "balance_loss_mlp": 1.01898718, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.790522680019217, + "language_loss": 0.65864992, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.67909801, + "num_input_tokens_seen": 287468455, + "step": 13322, + "time_per_iteration": 2.803884267807007 + }, + { + "auxiliary_loss_clip": 0.01089888, + "auxiliary_loss_mlp": 0.0102645, + "balance_loss_clip": 1.03479159, + "balance_loss_mlp": 1.01532722, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 2.3551854634379925, + "language_loss": 0.7766735, + "learning_rate": 4.010364878639265e-07, + "loss": 0.79783684, + "num_input_tokens_seen": 287486485, + "step": 13323, + "time_per_iteration": 2.578784465789795 + }, + { + "auxiliary_loss_clip": 0.01099377, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.03369236, + "balance_loss_mlp": 1.01882184, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 2.452597958327808, + "language_loss": 0.71139991, + "learning_rate": 4.00802572299932e-07, + "loss": 0.73269635, + "num_input_tokens_seen": 287503940, + "step": 13324, + "time_per_iteration": 2.6056411266326904 + }, + { + "auxiliary_loss_clip": 0.01055114, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0297122, + "balance_loss_mlp": 1.02142966, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.9047274775723717, + "language_loss": 0.76629835, + "learning_rate": 4.005687173776635e-07, + "loss": 0.7871809, + "num_input_tokens_seen": 287521660, + "step": 13325, + "time_per_iteration": 2.6036853790283203 + }, + { + "auxiliary_loss_clip": 0.01072462, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.03117251, + "balance_loss_mlp": 1.01776326, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.5531221236434904, + "language_loss": 0.79619712, + "learning_rate": 4.003349231059898e-07, + "loss": 0.81719697, + "num_input_tokens_seen": 287541505, + "step": 13326, + "time_per_iteration": 2.7395663261413574 + }, + { + "auxiliary_loss_clip": 0.01086616, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.03432119, + "balance_loss_mlp": 1.02377427, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 1.9346520239137548, + "language_loss": 0.65970087, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68090612, + "num_input_tokens_seen": 287560015, + "step": 13327, + "time_per_iteration": 2.6582789421081543 + }, + { + "auxiliary_loss_clip": 0.01082014, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.03239644, + "balance_loss_mlp": 1.01781678, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 1.7220908554617165, + "language_loss": 0.73665202, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75775301, + "num_input_tokens_seen": 287579150, + "step": 13328, + "time_per_iteration": 2.6265511512756348 + }, + { + "auxiliary_loss_clip": 0.0103628, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.03039074, + "balance_loss_mlp": 1.0200572, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 13.398848306770626, + "language_loss": 0.73861492, + "learning_rate": 3.996339042831798e-07, + "loss": 0.75930536, + "num_input_tokens_seen": 287597420, + "step": 13329, + "time_per_iteration": 4.216986894607544 + }, + { + "auxiliary_loss_clip": 0.01015502, + "auxiliary_loss_mlp": 0.01000012, + "balance_loss_clip": 1.00510359, + "balance_loss_mlp": 0.99908811, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.691262315106842, + "language_loss": 0.52913082, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54928589, + "num_input_tokens_seen": 287667280, + "step": 13330, + "time_per_iteration": 3.218142509460449 + }, + { + "auxiliary_loss_clip": 0.01076173, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.03311789, + "balance_loss_mlp": 1.02394199, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.8566472279605442, + "language_loss": 0.73259616, + "learning_rate": 3.991668618167519e-07, + "loss": 0.75372386, + "num_input_tokens_seen": 287687375, + "step": 13331, + "time_per_iteration": 2.6262948513031006 + }, + { + "auxiliary_loss_clip": 0.01082442, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.03302491, + "balance_loss_mlp": 1.0194087, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 2.161951231785414, + "language_loss": 0.77416885, + "learning_rate": 3.989334316347401e-07, + "loss": 0.7952832, + "num_input_tokens_seen": 287707895, + "step": 13332, + "time_per_iteration": 2.6682472229003906 + }, + { + "auxiliary_loss_clip": 0.01099198, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.03493154, + "balance_loss_mlp": 1.01665664, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 2.25494201329744, + "language_loss": 0.83733141, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85860389, + "num_input_tokens_seen": 287723990, + "step": 13333, + "time_per_iteration": 2.56612229347229 + }, + { + "auxiliary_loss_clip": 0.01076659, + "auxiliary_loss_mlp": 0.01024457, + "balance_loss_clip": 1.03250456, + "balance_loss_mlp": 1.01279187, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.883178779290438, + "language_loss": 0.73486698, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75587809, + "num_input_tokens_seen": 287742380, + "step": 13334, + "time_per_iteration": 2.5906870365142822 + }, + { + "auxiliary_loss_clip": 0.01056427, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.03161061, + "balance_loss_mlp": 1.0208509, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 2.0924266090900687, + "language_loss": 0.74836141, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76925492, + "num_input_tokens_seen": 287760130, + "step": 13335, + "time_per_iteration": 2.6788930892944336 + }, + { + "auxiliary_loss_clip": 0.01053392, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.02828538, + "balance_loss_mlp": 1.01752853, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 1.9132541213655125, + "language_loss": 0.75571501, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.7765398, + "num_input_tokens_seen": 287777565, + "step": 13336, + "time_per_iteration": 2.637129306793213 + }, + { + "auxiliary_loss_clip": 0.01068189, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.03440452, + "balance_loss_mlp": 1.0212307, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 3.0962454501695715, + "language_loss": 0.74928284, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77030057, + "num_input_tokens_seen": 287796310, + "step": 13337, + "time_per_iteration": 2.6243577003479004 + }, + { + "auxiliary_loss_clip": 0.01038155, + "auxiliary_loss_mlp": 0.00749656, + "balance_loss_clip": 1.03281689, + "balance_loss_mlp": 1.00028515, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.7152354287298452, + "language_loss": 0.80079341, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.81867146, + "num_input_tokens_seen": 287817330, + "step": 13338, + "time_per_iteration": 2.8104395866394043 + }, + { + "auxiliary_loss_clip": 0.01060099, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.03042078, + "balance_loss_mlp": 1.016101, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 2.031415660271632, + "language_loss": 0.74496496, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.76584893, + "num_input_tokens_seen": 287835095, + "step": 13339, + "time_per_iteration": 2.6285440921783447 + }, + { + "auxiliary_loss_clip": 0.01084774, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.03352869, + "balance_loss_mlp": 1.01636457, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.8011395788699442, + "language_loss": 0.78775197, + "learning_rate": 3.970681765754775e-07, + "loss": 0.80886668, + "num_input_tokens_seen": 287854595, + "step": 13340, + "time_per_iteration": 2.608863115310669 + }, + { + "auxiliary_loss_clip": 0.01069309, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.03469169, + "balance_loss_mlp": 1.01946831, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.8515692028521846, + "language_loss": 0.68082523, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70181918, + "num_input_tokens_seen": 287876960, + "step": 13341, + "time_per_iteration": 2.723050355911255 + }, + { + "auxiliary_loss_clip": 0.01005389, + "auxiliary_loss_mlp": 0.01003979, + "balance_loss_clip": 1.00465536, + "balance_loss_mlp": 1.00298345, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8083874485785764, + "language_loss": 0.61646086, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63655454, + "num_input_tokens_seen": 287936530, + "step": 13342, + "time_per_iteration": 3.1406235694885254 + }, + { + "auxiliary_loss_clip": 0.01080337, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.03541183, + "balance_loss_mlp": 1.0201925, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 1.9883522549369173, + "language_loss": 0.63617724, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65730095, + "num_input_tokens_seen": 287954285, + "step": 13343, + "time_per_iteration": 2.6499876976013184 + }, + { + "auxiliary_loss_clip": 0.01070924, + "auxiliary_loss_mlp": 0.01025751, + "balance_loss_clip": 1.0314939, + "balance_loss_mlp": 1.01536751, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 2.0760543713996245, + "language_loss": 0.69049239, + "learning_rate": 3.96137007563051e-07, + "loss": 0.71145916, + "num_input_tokens_seen": 287971595, + "step": 13344, + "time_per_iteration": 4.188579559326172 + }, + { + "auxiliary_loss_clip": 0.01088303, + "auxiliary_loss_mlp": 0.01026387, + "balance_loss_clip": 1.03491616, + "balance_loss_mlp": 1.01517558, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.6059103661091336, + "language_loss": 0.70008123, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72122806, + "num_input_tokens_seen": 287992540, + "step": 13345, + "time_per_iteration": 2.633235454559326 + }, + { + "auxiliary_loss_clip": 0.01004466, + "auxiliary_loss_mlp": 0.01007095, + "balance_loss_clip": 1.00462699, + "balance_loss_mlp": 1.0060637, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 1.0401480154842604, + "language_loss": 0.6294508, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64956635, + "num_input_tokens_seen": 288052810, + "step": 13346, + "time_per_iteration": 3.207489013671875 + }, + { + "auxiliary_loss_clip": 0.01074979, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.03579974, + "balance_loss_mlp": 1.01875103, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.419223364424878, + "language_loss": 0.72754449, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74859309, + "num_input_tokens_seen": 288073045, + "step": 13347, + "time_per_iteration": 2.682007074356079 + }, + { + "auxiliary_loss_clip": 0.01088071, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.03427804, + "balance_loss_mlp": 1.01785207, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 1.9947822755350133, + "language_loss": 0.72995889, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.75113255, + "num_input_tokens_seen": 288091165, + "step": 13348, + "time_per_iteration": 2.5575695037841797 + }, + { + "auxiliary_loss_clip": 0.01070874, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.03565812, + "balance_loss_mlp": 1.01643991, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 1.76408303630871, + "language_loss": 0.76048923, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.78147417, + "num_input_tokens_seen": 288110595, + "step": 13349, + "time_per_iteration": 2.610689163208008 + }, + { + "auxiliary_loss_clip": 0.01100476, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.03623509, + "balance_loss_mlp": 1.02208781, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 2.4873260660729466, + "language_loss": 0.83601463, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85734367, + "num_input_tokens_seen": 288128995, + "step": 13350, + "time_per_iteration": 4.103788375854492 + }, + { + "auxiliary_loss_clip": 0.01088478, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.03507423, + "balance_loss_mlp": 1.02167475, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 2.4411962830834026, + "language_loss": 0.71060401, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73181581, + "num_input_tokens_seen": 288149265, + "step": 13351, + "time_per_iteration": 2.6734321117401123 + }, + { + "auxiliary_loss_clip": 0.01054808, + "auxiliary_loss_mlp": 0.01026848, + "balance_loss_clip": 1.03159559, + "balance_loss_mlp": 1.01523149, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 3.0076169356083535, + "language_loss": 0.61657411, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63739067, + "num_input_tokens_seen": 288170745, + "step": 13352, + "time_per_iteration": 2.6799612045288086 + }, + { + "auxiliary_loss_clip": 0.01087235, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.03548098, + "balance_loss_mlp": 1.02148116, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.9924099137837856, + "language_loss": 0.76786268, + "learning_rate": 3.940454360354046e-07, + "loss": 0.78905559, + "num_input_tokens_seen": 288189415, + "step": 13353, + "time_per_iteration": 2.566800594329834 + }, + { + "auxiliary_loss_clip": 0.01039894, + "auxiliary_loss_mlp": 0.01028081, + "balance_loss_clip": 1.03219032, + "balance_loss_mlp": 1.01555836, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.4551153260875895, + "language_loss": 0.73727167, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75795144, + "num_input_tokens_seen": 288206900, + "step": 13354, + "time_per_iteration": 2.748063087463379 + }, + { + "auxiliary_loss_clip": 0.01056589, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.03122115, + "balance_loss_mlp": 1.0271306, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 1.7496704541203905, + "language_loss": 0.65536284, + "learning_rate": 3.935813120140714e-07, + "loss": 0.67632264, + "num_input_tokens_seen": 288224800, + "step": 13355, + "time_per_iteration": 2.7457282543182373 + }, + { + "auxiliary_loss_clip": 0.01058624, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.03015852, + "balance_loss_mlp": 1.02084291, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 2.1698975887168928, + "language_loss": 0.68689942, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70781839, + "num_input_tokens_seen": 288249400, + "step": 13356, + "time_per_iteration": 2.9316935539245605 + }, + { + "auxiliary_loss_clip": 0.01054472, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.03563714, + "balance_loss_mlp": 1.01809239, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.5122479945443312, + "language_loss": 0.77154517, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79238617, + "num_input_tokens_seen": 288268780, + "step": 13357, + "time_per_iteration": 2.7147881984710693 + }, + { + "auxiliary_loss_clip": 0.01061255, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.03002977, + "balance_loss_mlp": 1.01825726, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.3790452894942593, + "language_loss": 0.76940119, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79032284, + "num_input_tokens_seen": 288290830, + "step": 13358, + "time_per_iteration": 2.700606107711792 + }, + { + "auxiliary_loss_clip": 0.01086847, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.03331184, + "balance_loss_mlp": 1.01678252, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.6916813330340346, + "language_loss": 0.84775716, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.86890423, + "num_input_tokens_seen": 288308865, + "step": 13359, + "time_per_iteration": 4.122483730316162 + }, + { + "auxiliary_loss_clip": 0.01071785, + "auxiliary_loss_mlp": 0.01026489, + "balance_loss_clip": 1.03420687, + "balance_loss_mlp": 1.01607025, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 1.9052250175207797, + "language_loss": 0.73408699, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75506973, + "num_input_tokens_seen": 288327325, + "step": 13360, + "time_per_iteration": 2.635106325149536 + }, + { + "auxiliary_loss_clip": 0.01098866, + "auxiliary_loss_mlp": 0.01026571, + "balance_loss_clip": 1.03446531, + "balance_loss_mlp": 1.01590228, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 1.7706409457348946, + "language_loss": 0.69787228, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71912658, + "num_input_tokens_seen": 288347285, + "step": 13361, + "time_per_iteration": 2.855076551437378 + }, + { + "auxiliary_loss_clip": 0.01101395, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.0343281, + "balance_loss_mlp": 1.02576458, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.5973607165275963, + "language_loss": 0.70453906, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72592765, + "num_input_tokens_seen": 288367785, + "step": 13362, + "time_per_iteration": 2.7621865272521973 + }, + { + "auxiliary_loss_clip": 0.01105709, + "auxiliary_loss_mlp": 0.01038233, + "balance_loss_clip": 1.0362184, + "balance_loss_mlp": 1.02493501, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 9.54855177175729, + "language_loss": 0.7856285, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80706787, + "num_input_tokens_seen": 288384135, + "step": 13363, + "time_per_iteration": 2.626847743988037 + }, + { + "auxiliary_loss_clip": 0.01086443, + "auxiliary_loss_mlp": 0.01026532, + "balance_loss_clip": 1.03292334, + "balance_loss_mlp": 1.01489091, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.1074080881341497, + "language_loss": 0.74830902, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.7694388, + "num_input_tokens_seen": 288403805, + "step": 13364, + "time_per_iteration": 2.632758140563965 + }, + { + "auxiliary_loss_clip": 0.01087736, + "auxiliary_loss_mlp": 0.01029, + "balance_loss_clip": 1.03756595, + "balance_loss_mlp": 1.01836634, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 2.066336007183229, + "language_loss": 0.60122281, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62239021, + "num_input_tokens_seen": 288424895, + "step": 13365, + "time_per_iteration": 2.6433417797088623 + }, + { + "auxiliary_loss_clip": 0.01075426, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.0321362, + "balance_loss_mlp": 1.02321219, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 1.7650659031975562, + "language_loss": 0.65830213, + "learning_rate": 3.910329872447706e-07, + "loss": 0.679407, + "num_input_tokens_seen": 288443865, + "step": 13366, + "time_per_iteration": 2.6187727451324463 + }, + { + "auxiliary_loss_clip": 0.01095891, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.03370094, + "balance_loss_mlp": 1.0154705, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.1291739161233627, + "language_loss": 0.75045645, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77167475, + "num_input_tokens_seen": 288461065, + "step": 13367, + "time_per_iteration": 2.4450464248657227 + }, + { + "auxiliary_loss_clip": 0.01096764, + "auxiliary_loss_mlp": 0.01023829, + "balance_loss_clip": 1.03375936, + "balance_loss_mlp": 1.01315343, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.4994632613941778, + "language_loss": 0.74112368, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76232952, + "num_input_tokens_seen": 288481865, + "step": 13368, + "time_per_iteration": 2.616638422012329 + }, + { + "auxiliary_loss_clip": 0.01100056, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.03391147, + "balance_loss_mlp": 1.01962638, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 2.2160152720345674, + "language_loss": 0.70064926, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72195709, + "num_input_tokens_seen": 288499345, + "step": 13369, + "time_per_iteration": 3.98479962348938 + }, + { + "auxiliary_loss_clip": 0.01077095, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.03298795, + "balance_loss_mlp": 1.01952517, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.6050089075632357, + "language_loss": 0.73852295, + "learning_rate": 3.901081534434312e-07, + "loss": 0.7595948, + "num_input_tokens_seen": 288517660, + "step": 13370, + "time_per_iteration": 2.597405195236206 + }, + { + "auxiliary_loss_clip": 0.01074468, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.03127217, + "balance_loss_mlp": 1.0201869, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.826760252300658, + "language_loss": 0.87267506, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89374971, + "num_input_tokens_seen": 288534180, + "step": 13371, + "time_per_iteration": 2.577620267868042 + }, + { + "auxiliary_loss_clip": 0.01090166, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.03283143, + "balance_loss_mlp": 1.01896203, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 1.9204881408135375, + "language_loss": 0.74733686, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76854837, + "num_input_tokens_seen": 288553350, + "step": 13372, + "time_per_iteration": 2.5487680435180664 + }, + { + "auxiliary_loss_clip": 0.01056559, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.03351474, + "balance_loss_mlp": 1.01703978, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 2.248668067152856, + "language_loss": 0.79394341, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81480336, + "num_input_tokens_seen": 288571325, + "step": 13373, + "time_per_iteration": 2.686314821243286 + }, + { + "auxiliary_loss_clip": 0.01068963, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.03100348, + "balance_loss_mlp": 1.02172637, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 1.9037197470103497, + "language_loss": 0.74746382, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76848125, + "num_input_tokens_seen": 288592100, + "step": 13374, + "time_per_iteration": 2.623628854751587 + }, + { + "auxiliary_loss_clip": 0.01044962, + "auxiliary_loss_mlp": 0.01036547, + "balance_loss_clip": 1.02777481, + "balance_loss_mlp": 1.0232017, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 1.941362008634436, + "language_loss": 0.68319893, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70401394, + "num_input_tokens_seen": 288612305, + "step": 13375, + "time_per_iteration": 2.6492958068847656 + }, + { + "auxiliary_loss_clip": 0.00999642, + "auxiliary_loss_mlp": 0.01001383, + "balance_loss_clip": 1.00879252, + "balance_loss_mlp": 1.00025678, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.7288908428014025, + "language_loss": 0.55646241, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57647264, + "num_input_tokens_seen": 288676015, + "step": 13376, + "time_per_iteration": 3.2137198448181152 + }, + { + "auxiliary_loss_clip": 0.01039784, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.02868271, + "balance_loss_mlp": 1.01980686, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.668674207186408, + "language_loss": 0.72775906, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.74848557, + "num_input_tokens_seen": 288696455, + "step": 13377, + "time_per_iteration": 2.66584849357605 + }, + { + "auxiliary_loss_clip": 0.01085675, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.03219461, + "balance_loss_mlp": 1.01554, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.8330949712995592, + "language_loss": 0.69872457, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.71985233, + "num_input_tokens_seen": 288715560, + "step": 13378, + "time_per_iteration": 2.7060821056365967 + }, + { + "auxiliary_loss_clip": 0.01089172, + "auxiliary_loss_mlp": 0.01024583, + "balance_loss_clip": 1.03336167, + "balance_loss_mlp": 1.01357377, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.4040615252377833, + "language_loss": 0.69297338, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71411097, + "num_input_tokens_seen": 288739485, + "step": 13379, + "time_per_iteration": 2.663337230682373 + }, + { + "auxiliary_loss_clip": 0.01103486, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.0366745, + "balance_loss_mlp": 1.01848567, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.7426138789687826, + "language_loss": 0.76377594, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78512013, + "num_input_tokens_seen": 288757420, + "step": 13380, + "time_per_iteration": 2.5611350536346436 + }, + { + "auxiliary_loss_clip": 0.01048919, + "auxiliary_loss_mlp": 0.01025432, + "balance_loss_clip": 1.03053975, + "balance_loss_mlp": 1.01455426, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 3.003548858278935, + "language_loss": 0.69143158, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71217507, + "num_input_tokens_seen": 288775535, + "step": 13381, + "time_per_iteration": 2.6817009449005127 + }, + { + "auxiliary_loss_clip": 0.01089167, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.03472865, + "balance_loss_mlp": 1.01897264, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 1.890400817845552, + "language_loss": 0.6380614, + "learning_rate": 3.873395148176135e-07, + "loss": 0.65925741, + "num_input_tokens_seen": 288795035, + "step": 13382, + "time_per_iteration": 2.595236301422119 + }, + { + "auxiliary_loss_clip": 0.01076761, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.03424442, + "balance_loss_mlp": 1.0261451, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 1.9512262126519486, + "language_loss": 0.76151979, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78265178, + "num_input_tokens_seen": 288816270, + "step": 13383, + "time_per_iteration": 2.673574924468994 + }, + { + "auxiliary_loss_clip": 0.0108101, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.03255916, + "balance_loss_mlp": 1.02248192, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 2.0063687859843036, + "language_loss": 0.69753766, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71869564, + "num_input_tokens_seen": 288836050, + "step": 13384, + "time_per_iteration": 2.6078226566314697 + }, + { + "auxiliary_loss_clip": 0.01088614, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.03239167, + "balance_loss_mlp": 1.02266765, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.0464747895389093, + "language_loss": 0.7953831, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81661904, + "num_input_tokens_seen": 288852900, + "step": 13385, + "time_per_iteration": 4.018157482147217 + }, + { + "auxiliary_loss_clip": 0.01098273, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.03318763, + "balance_loss_mlp": 1.02045131, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.8016920207561828, + "language_loss": 0.72315508, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74446309, + "num_input_tokens_seen": 288872625, + "step": 13386, + "time_per_iteration": 2.562385320663452 + }, + { + "auxiliary_loss_clip": 0.00993493, + "auxiliary_loss_mlp": 0.01000895, + "balance_loss_clip": 1.00542116, + "balance_loss_mlp": 1.00009084, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.667307754674109, + "language_loss": 0.51212931, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53207314, + "num_input_tokens_seen": 288939180, + "step": 13387, + "time_per_iteration": 3.177888870239258 + }, + { + "auxiliary_loss_clip": 0.01097904, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.03302717, + "balance_loss_mlp": 1.01777196, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.9156334266453028, + "language_loss": 0.74034023, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76161885, + "num_input_tokens_seen": 288958925, + "step": 13388, + "time_per_iteration": 2.518395185470581 + }, + { + "auxiliary_loss_clip": 0.01067592, + "auxiliary_loss_mlp": 0.01025711, + "balance_loss_clip": 1.03391159, + "balance_loss_mlp": 1.01492286, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.5096763324300635, + "language_loss": 0.71459293, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73552591, + "num_input_tokens_seen": 288980935, + "step": 13389, + "time_per_iteration": 2.725985527038574 + }, + { + "auxiliary_loss_clip": 0.01073117, + "auxiliary_loss_mlp": 0.01034545, + "balance_loss_clip": 1.03558588, + "balance_loss_mlp": 1.0231967, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 2.087146065726268, + "language_loss": 0.82664633, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.84772301, + "num_input_tokens_seen": 288996780, + "step": 13390, + "time_per_iteration": 4.021292448043823 + }, + { + "auxiliary_loss_clip": 0.01016088, + "auxiliary_loss_mlp": 0.01002546, + "balance_loss_clip": 1.00630784, + "balance_loss_mlp": 1.00161576, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7876542736865884, + "language_loss": 0.55511421, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57530051, + "num_input_tokens_seen": 289057590, + "step": 13391, + "time_per_iteration": 3.0989115238189697 + }, + { + "auxiliary_loss_clip": 0.01084297, + "auxiliary_loss_mlp": 0.01029707, + "balance_loss_clip": 1.03295779, + "balance_loss_mlp": 1.01895452, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.6427583767186031, + "language_loss": 0.84504509, + "learning_rate": 3.850390420667762e-07, + "loss": 0.86618507, + "num_input_tokens_seen": 289076285, + "step": 13392, + "time_per_iteration": 2.5749902725219727 + }, + { + "auxiliary_loss_clip": 0.01063608, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.03074908, + "balance_loss_mlp": 1.02064133, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.5415660201329107, + "language_loss": 0.70392966, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72487605, + "num_input_tokens_seen": 289097585, + "step": 13393, + "time_per_iteration": 2.6899590492248535 + }, + { + "auxiliary_loss_clip": 0.01088283, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.03373957, + "balance_loss_mlp": 1.01682234, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 2.027540333337947, + "language_loss": 0.76185787, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78302628, + "num_input_tokens_seen": 289116890, + "step": 13394, + "time_per_iteration": 2.602858066558838 + }, + { + "auxiliary_loss_clip": 0.01102039, + "auxiliary_loss_mlp": 0.01027885, + "balance_loss_clip": 1.03726399, + "balance_loss_mlp": 1.01728714, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.6849821690434255, + "language_loss": 0.65026885, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67156804, + "num_input_tokens_seen": 289136670, + "step": 13395, + "time_per_iteration": 2.5551199913024902 + }, + { + "auxiliary_loss_clip": 0.01013656, + "auxiliary_loss_mlp": 0.01009068, + "balance_loss_clip": 1.00345969, + "balance_loss_mlp": 1.0081557, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.8019504271037716, + "language_loss": 0.57343686, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59366405, + "num_input_tokens_seen": 289200150, + "step": 13396, + "time_per_iteration": 3.2631213665008545 + }, + { + "auxiliary_loss_clip": 0.01088236, + "auxiliary_loss_mlp": 0.01033448, + "balance_loss_clip": 1.03383803, + "balance_loss_mlp": 1.02123475, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.681031902396977, + "language_loss": 0.77450919, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79572606, + "num_input_tokens_seen": 289218125, + "step": 13397, + "time_per_iteration": 2.570416212081909 + }, + { + "auxiliary_loss_clip": 0.01084825, + "auxiliary_loss_mlp": 0.0102727, + "balance_loss_clip": 1.03690732, + "balance_loss_mlp": 1.01678538, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.589344001581762, + "language_loss": 0.70269686, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72381783, + "num_input_tokens_seen": 289237115, + "step": 13398, + "time_per_iteration": 2.54986572265625 + }, + { + "auxiliary_loss_clip": 0.01073801, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.03139234, + "balance_loss_mlp": 1.01981187, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.458590263904604, + "language_loss": 0.69350022, + "learning_rate": 3.834323543710805e-07, + "loss": 0.7145381, + "num_input_tokens_seen": 289253635, + "step": 13399, + "time_per_iteration": 2.581465244293213 + }, + { + "auxiliary_loss_clip": 0.01099272, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.03546882, + "balance_loss_mlp": 1.01871538, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.309624322719785, + "language_loss": 0.72713381, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74842131, + "num_input_tokens_seen": 289270085, + "step": 13400, + "time_per_iteration": 4.019591808319092 + }, + { + "auxiliary_loss_clip": 0.01082299, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.02966976, + "balance_loss_mlp": 1.0190742, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.7755338908129297, + "language_loss": 0.63791049, + "learning_rate": 3.829738523169037e-07, + "loss": 0.65903425, + "num_input_tokens_seen": 289289645, + "step": 13401, + "time_per_iteration": 2.5807149410247803 + }, + { + "auxiliary_loss_clip": 0.01084449, + "auxiliary_loss_mlp": 0.01027061, + "balance_loss_clip": 1.0316124, + "balance_loss_mlp": 1.01611161, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.1522506700673474, + "language_loss": 0.83698761, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.8581028, + "num_input_tokens_seen": 289306630, + "step": 13402, + "time_per_iteration": 2.562943696975708 + }, + { + "auxiliary_loss_clip": 0.01045001, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.03224993, + "balance_loss_mlp": 1.01920342, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 2.0231930366737494, + "language_loss": 0.67382085, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.69457871, + "num_input_tokens_seen": 289324960, + "step": 13403, + "time_per_iteration": 2.6490955352783203 + }, + { + "auxiliary_loss_clip": 0.01055771, + "auxiliary_loss_mlp": 0.00748965, + "balance_loss_clip": 1.03173697, + "balance_loss_mlp": 1.00013995, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.6394644302582908, + "language_loss": 0.84874129, + "learning_rate": 3.822865591408084e-07, + "loss": 0.86678863, + "num_input_tokens_seen": 289344980, + "step": 13404, + "time_per_iteration": 2.660135269165039 + }, + { + "auxiliary_loss_clip": 0.01055335, + "auxiliary_loss_mlp": 0.0102819, + "balance_loss_clip": 1.03212249, + "balance_loss_mlp": 1.01789021, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.7916921044374858, + "language_loss": 0.70349073, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72432601, + "num_input_tokens_seen": 289367500, + "step": 13405, + "time_per_iteration": 2.7113001346588135 + }, + { + "auxiliary_loss_clip": 0.01087826, + "auxiliary_loss_mlp": 0.01023758, + "balance_loss_clip": 1.03396773, + "balance_loss_mlp": 1.0133096, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 2.2976272443328694, + "language_loss": 0.75136405, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77247989, + "num_input_tokens_seen": 289385930, + "step": 13406, + "time_per_iteration": 2.606337547302246 + }, + { + "auxiliary_loss_clip": 0.01088506, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.03419304, + "balance_loss_mlp": 1.0199784, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.5099676049616684, + "language_loss": 0.75872469, + "learning_rate": 3.815998180594018e-07, + "loss": 0.77992558, + "num_input_tokens_seen": 289408025, + "step": 13407, + "time_per_iteration": 2.5752923488616943 + }, + { + "auxiliary_loss_clip": 0.010656, + "auxiliary_loss_mlp": 0.00749402, + "balance_loss_clip": 1.03009725, + "balance_loss_mlp": 1.00021911, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.9858845288018285, + "language_loss": 0.73531574, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75346577, + "num_input_tokens_seen": 289426575, + "step": 13408, + "time_per_iteration": 2.63850474357605 + }, + { + "auxiliary_loss_clip": 0.01066039, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.03073132, + "balance_loss_mlp": 1.01817465, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 2.212908654678974, + "language_loss": 0.70823383, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72919667, + "num_input_tokens_seen": 289447760, + "step": 13409, + "time_per_iteration": 4.129429578781128 + }, + { + "auxiliary_loss_clip": 0.01097424, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.03330421, + "balance_loss_mlp": 1.01650894, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 2.5840869884377318, + "language_loss": 0.76989436, + "learning_rate": 3.809136293070545e-07, + "loss": 0.79114491, + "num_input_tokens_seen": 289463920, + "step": 13410, + "time_per_iteration": 2.4555583000183105 + }, + { + "auxiliary_loss_clip": 0.01086256, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.03420854, + "balance_loss_mlp": 1.02236903, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 2.0866605283879465, + "language_loss": 0.68493575, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70612961, + "num_input_tokens_seen": 289482635, + "step": 13411, + "time_per_iteration": 2.666391372680664 + }, + { + "auxiliary_loss_clip": 0.01061079, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.03131354, + "balance_loss_mlp": 1.01612234, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.9618216664784625, + "language_loss": 0.68044084, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70131874, + "num_input_tokens_seen": 289502040, + "step": 13412, + "time_per_iteration": 2.6797947883605957 + }, + { + "auxiliary_loss_clip": 0.01089845, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.03660929, + "balance_loss_mlp": 1.02087379, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.567290590673189, + "language_loss": 0.81654531, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.8377791, + "num_input_tokens_seen": 289520740, + "step": 13413, + "time_per_iteration": 2.607699155807495 + }, + { + "auxiliary_loss_clip": 0.01078914, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.03129435, + "balance_loss_mlp": 1.02194929, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.9357616860472766, + "language_loss": 0.85170203, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87283194, + "num_input_tokens_seen": 289535840, + "step": 13414, + "time_per_iteration": 2.6309194564819336 + }, + { + "auxiliary_loss_clip": 0.01070471, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.03244185, + "balance_loss_mlp": 1.01675153, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.769943620223012, + "language_loss": 0.66925251, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69023085, + "num_input_tokens_seen": 289555205, + "step": 13415, + "time_per_iteration": 2.6352055072784424 + }, + { + "auxiliary_loss_clip": 0.01060906, + "auxiliary_loss_mlp": 0.01022197, + "balance_loss_clip": 1.03119826, + "balance_loss_mlp": 1.01187348, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.6338247490056685, + "language_loss": 0.7644642, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78529525, + "num_input_tokens_seen": 289573000, + "step": 13416, + "time_per_iteration": 2.683793306350708 + }, + { + "auxiliary_loss_clip": 0.01080893, + "auxiliary_loss_mlp": 0.01029329, + "balance_loss_clip": 1.03143239, + "balance_loss_mlp": 1.0183202, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.7845308332025442, + "language_loss": 0.65594256, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67704475, + "num_input_tokens_seen": 289592625, + "step": 13417, + "time_per_iteration": 2.680764675140381 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.01049213, + "balance_loss_clip": 1.03021872, + "balance_loss_mlp": 1.03649366, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.5948408372679523, + "language_loss": 0.80658698, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82767534, + "num_input_tokens_seen": 289610780, + "step": 13418, + "time_per_iteration": 2.706465721130371 + }, + { + "auxiliary_loss_clip": 0.01078932, + "auxiliary_loss_mlp": 0.01027009, + "balance_loss_clip": 1.03505111, + "balance_loss_mlp": 1.01567793, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.620534357847039, + "language_loss": 0.8496691, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87072849, + "num_input_tokens_seen": 289628890, + "step": 13419, + "time_per_iteration": 2.6015818119049072 + }, + { + "auxiliary_loss_clip": 0.01057551, + "auxiliary_loss_mlp": 0.00749526, + "balance_loss_clip": 1.03173661, + "balance_loss_mlp": 1.00028908, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.6572213196139975, + "language_loss": 0.76023471, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.77830547, + "num_input_tokens_seen": 289647220, + "step": 13420, + "time_per_iteration": 2.7252087593078613 + }, + { + "auxiliary_loss_clip": 0.01075809, + "auxiliary_loss_mlp": 0.00749346, + "balance_loss_clip": 1.03011465, + "balance_loss_mlp": 1.00019443, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.71964321960547, + "language_loss": 0.78638434, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80463588, + "num_input_tokens_seen": 289665800, + "step": 13421, + "time_per_iteration": 2.627936840057373 + }, + { + "auxiliary_loss_clip": 0.01066475, + "auxiliary_loss_mlp": 0.01023123, + "balance_loss_clip": 1.0341574, + "balance_loss_mlp": 1.01224542, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 2.6387383913384697, + "language_loss": 0.79571778, + "learning_rate": 3.78174402269098e-07, + "loss": 0.81661379, + "num_input_tokens_seen": 289682705, + "step": 13422, + "time_per_iteration": 2.7014832496643066 + }, + { + "auxiliary_loss_clip": 0.01096418, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.03320992, + "balance_loss_mlp": 1.0192796, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 2.12468396334183, + "language_loss": 0.6818496, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70311505, + "num_input_tokens_seen": 289702920, + "step": 13423, + "time_per_iteration": 2.680246114730835 + }, + { + "auxiliary_loss_clip": 0.01073927, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.03425312, + "balance_loss_mlp": 1.02208328, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 2.1405639316149787, + "language_loss": 0.80091393, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82199728, + "num_input_tokens_seen": 289723280, + "step": 13424, + "time_per_iteration": 4.198224782943726 + }, + { + "auxiliary_loss_clip": 0.01087804, + "auxiliary_loss_mlp": 0.01025855, + "balance_loss_clip": 1.03243828, + "balance_loss_mlp": 1.01489401, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.484114604633589, + "language_loss": 0.78704906, + "learning_rate": 3.774909786710232e-07, + "loss": 0.8081857, + "num_input_tokens_seen": 289743475, + "step": 13425, + "time_per_iteration": 2.6084976196289062 + }, + { + "auxiliary_loss_clip": 0.01063131, + "auxiliary_loss_mlp": 0.0102902, + "balance_loss_clip": 1.03039789, + "balance_loss_mlp": 1.01796353, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.2645195073598217, + "language_loss": 0.75332797, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77424949, + "num_input_tokens_seen": 289761400, + "step": 13426, + "time_per_iteration": 2.6228668689727783 + }, + { + "auxiliary_loss_clip": 0.01086017, + "auxiliary_loss_mlp": 0.01025038, + "balance_loss_clip": 1.03237224, + "balance_loss_mlp": 1.01442266, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.630924658858975, + "language_loss": 0.72700137, + "learning_rate": 3.770356705530997e-07, + "loss": 0.74811196, + "num_input_tokens_seen": 289781025, + "step": 13427, + "time_per_iteration": 2.6386117935180664 + }, + { + "auxiliary_loss_clip": 0.01046931, + "auxiliary_loss_mlp": 0.01036419, + "balance_loss_clip": 1.03293943, + "balance_loss_mlp": 1.02408695, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 2.0009040253583787, + "language_loss": 0.70022136, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72105485, + "num_input_tokens_seen": 289798380, + "step": 13428, + "time_per_iteration": 2.700838088989258 + }, + { + "auxiliary_loss_clip": 0.01074024, + "auxiliary_loss_mlp": 0.01028759, + "balance_loss_clip": 1.03214359, + "balance_loss_mlp": 1.01830411, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 2.5641386111463706, + "language_loss": 0.74268496, + "learning_rate": 3.765806086070544e-07, + "loss": 0.76371282, + "num_input_tokens_seen": 289814515, + "step": 13429, + "time_per_iteration": 2.640028238296509 + }, + { + "auxiliary_loss_clip": 0.01084911, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.03376317, + "balance_loss_mlp": 1.01795697, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 1.81355455278889, + "language_loss": 0.66771793, + "learning_rate": 3.763531699700568e-07, + "loss": 0.68885386, + "num_input_tokens_seen": 289834315, + "step": 13430, + "time_per_iteration": 4.041198492050171 + }, + { + "auxiliary_loss_clip": 0.01056799, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.03120363, + "balance_loss_mlp": 1.01866603, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.7018893503093766, + "language_loss": 0.80189753, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82276213, + "num_input_tokens_seen": 289853770, + "step": 13431, + "time_per_iteration": 2.6522421836853027 + }, + { + "auxiliary_loss_clip": 0.0107441, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.03323174, + "balance_loss_mlp": 1.01585555, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 3.1812250585684514, + "language_loss": 0.8057403, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82676017, + "num_input_tokens_seen": 289870480, + "step": 13432, + "time_per_iteration": 2.5799622535705566 + }, + { + "auxiliary_loss_clip": 0.01071038, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.03541851, + "balance_loss_mlp": 1.02029538, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 2.1836272372365033, + "language_loss": 0.70361173, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72464329, + "num_input_tokens_seen": 289888275, + "step": 13433, + "time_per_iteration": 2.600029945373535 + }, + { + "auxiliary_loss_clip": 0.01072451, + "auxiliary_loss_mlp": 0.01024534, + "balance_loss_clip": 1.03339648, + "balance_loss_mlp": 1.01407337, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.4462619947088984, + "language_loss": 0.7242738, + "learning_rate": 3.754440311967828e-07, + "loss": 0.74524367, + "num_input_tokens_seen": 289911495, + "step": 13434, + "time_per_iteration": 2.8232696056365967 + }, + { + "auxiliary_loss_clip": 0.01056586, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.03318155, + "balance_loss_mlp": 1.01479495, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 2.5039462905760104, + "language_loss": 0.67875153, + "learning_rate": 3.752169004902361e-07, + "loss": 0.69957626, + "num_input_tokens_seen": 289930045, + "step": 13435, + "time_per_iteration": 2.6945767402648926 + }, + { + "auxiliary_loss_clip": 0.0105699, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.03463054, + "balance_loss_mlp": 1.02254558, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.6293265880957526, + "language_loss": 0.75204629, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77297246, + "num_input_tokens_seen": 289950815, + "step": 13436, + "time_per_iteration": 2.69102144241333 + }, + { + "auxiliary_loss_clip": 0.01093182, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.03180623, + "balance_loss_mlp": 1.0184828, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.704417733475709, + "language_loss": 0.70198178, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72321153, + "num_input_tokens_seen": 289971730, + "step": 13437, + "time_per_iteration": 2.560406446456909 + }, + { + "auxiliary_loss_clip": 0.01069634, + "auxiliary_loss_mlp": 0.01027065, + "balance_loss_clip": 1.03541565, + "balance_loss_mlp": 1.01695585, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.9830379825126732, + "language_loss": 0.72692281, + "learning_rate": 3.745358780766636e-07, + "loss": 0.74788976, + "num_input_tokens_seen": 289992995, + "step": 13438, + "time_per_iteration": 2.7113115787506104 + }, + { + "auxiliary_loss_clip": 0.01073271, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.03230882, + "balance_loss_mlp": 1.02204251, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 1.9236785383446355, + "language_loss": 0.77264273, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79370266, + "num_input_tokens_seen": 290009405, + "step": 13439, + "time_per_iteration": 2.637129068374634 + }, + { + "auxiliary_loss_clip": 0.01096817, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.03325379, + "balance_loss_mlp": 1.0168128, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.5566016990632918, + "language_loss": 0.78511465, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80636275, + "num_input_tokens_seen": 290031085, + "step": 13440, + "time_per_iteration": 4.090130805969238 + }, + { + "auxiliary_loss_clip": 0.01076723, + "auxiliary_loss_mlp": 0.00749618, + "balance_loss_clip": 1.03258502, + "balance_loss_mlp": 1.00028265, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 2.185601393038017, + "language_loss": 0.5912922, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.60955566, + "num_input_tokens_seen": 290048670, + "step": 13441, + "time_per_iteration": 2.6376664638519287 + }, + { + "auxiliary_loss_clip": 0.010847, + "auxiliary_loss_mlp": 0.01031917, + "balance_loss_clip": 1.03320539, + "balance_loss_mlp": 1.02073479, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 2.4146103303039825, + "language_loss": 0.75811458, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.77928078, + "num_input_tokens_seen": 290064085, + "step": 13442, + "time_per_iteration": 2.6538777351379395 + }, + { + "auxiliary_loss_clip": 0.01074813, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.032794, + "balance_loss_mlp": 1.01834714, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.441880837944711, + "language_loss": 0.70647156, + "learning_rate": 3.734020735906169e-07, + "loss": 0.7275095, + "num_input_tokens_seen": 290086255, + "step": 13443, + "time_per_iteration": 2.819411516189575 + }, + { + "auxiliary_loss_clip": 0.01053193, + "auxiliary_loss_mlp": 0.01034122, + "balance_loss_clip": 1.03286016, + "balance_loss_mlp": 1.02328014, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 2.5361700297575602, + "language_loss": 0.82433546, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84520859, + "num_input_tokens_seen": 290103995, + "step": 13444, + "time_per_iteration": 2.6292622089385986 + }, + { + "auxiliary_loss_clip": 0.0099151, + "auxiliary_loss_mlp": 0.0074666, + "balance_loss_clip": 1.01091397, + "balance_loss_mlp": 0.99980658, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8506896610414008, + "language_loss": 0.53587246, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55325425, + "num_input_tokens_seen": 290157245, + "step": 13445, + "time_per_iteration": 3.065211534500122 + }, + { + "auxiliary_loss_clip": 0.0105681, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.03085232, + "balance_loss_mlp": 1.01620877, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 2.1521734976231204, + "language_loss": 0.72424877, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.74509978, + "num_input_tokens_seen": 290174970, + "step": 13446, + "time_per_iteration": 2.6815364360809326 + }, + { + "auxiliary_loss_clip": 0.0107101, + "auxiliary_loss_mlp": 0.01030389, + "balance_loss_clip": 1.03309655, + "balance_loss_mlp": 1.01824141, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 2.0917176163354063, + "language_loss": 0.71248257, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73349655, + "num_input_tokens_seen": 290194395, + "step": 13447, + "time_per_iteration": 2.6357998847961426 + }, + { + "auxiliary_loss_clip": 0.0103291, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.03229213, + "balance_loss_mlp": 1.01896167, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.0460220763434673, + "language_loss": 0.74771172, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.76835883, + "num_input_tokens_seen": 290209200, + "step": 13448, + "time_per_iteration": 2.7104151248931885 + }, + { + "auxiliary_loss_clip": 0.01023552, + "auxiliary_loss_mlp": 0.00998174, + "balance_loss_clip": 1.00385284, + "balance_loss_mlp": 0.99720269, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 1.1672685522854087, + "language_loss": 0.63830239, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65851969, + "num_input_tokens_seen": 290274565, + "step": 13449, + "time_per_iteration": 3.231750011444092 + }, + { + "auxiliary_loss_clip": 0.010894, + "auxiliary_loss_mlp": 0.01025878, + "balance_loss_clip": 1.0347358, + "balance_loss_mlp": 1.01433873, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.700085329962796, + "language_loss": 0.73947525, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76062799, + "num_input_tokens_seen": 290293630, + "step": 13450, + "time_per_iteration": 4.171797275543213 + }, + { + "auxiliary_loss_clip": 0.01074092, + "auxiliary_loss_mlp": 0.00749402, + "balance_loss_clip": 1.03191257, + "balance_loss_mlp": 1.00030637, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.7824535367554861, + "language_loss": 0.74224967, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.7604847, + "num_input_tokens_seen": 290311450, + "step": 13451, + "time_per_iteration": 2.59619402885437 + }, + { + "auxiliary_loss_clip": 0.01071369, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.03112984, + "balance_loss_mlp": 1.01630855, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.7577202674283448, + "language_loss": 0.80119997, + "learning_rate": 3.713651121244543e-07, + "loss": 0.822209, + "num_input_tokens_seen": 290330165, + "step": 13452, + "time_per_iteration": 2.6089422702789307 + }, + { + "auxiliary_loss_clip": 0.01089534, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.03495467, + "balance_loss_mlp": 1.02328539, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 2.0254907456367137, + "language_loss": 0.786421, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80765688, + "num_input_tokens_seen": 290350815, + "step": 13453, + "time_per_iteration": 2.5664918422698975 + }, + { + "auxiliary_loss_clip": 0.01044389, + "auxiliary_loss_mlp": 0.01032356, + "balance_loss_clip": 1.02884102, + "balance_loss_mlp": 1.02052474, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 27.109371187949435, + "language_loss": 0.77782273, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79859018, + "num_input_tokens_seen": 290367380, + "step": 13454, + "time_per_iteration": 2.835139274597168 + }, + { + "auxiliary_loss_clip": 0.0106331, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.03110743, + "balance_loss_mlp": 1.02299976, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 1.7271742471365132, + "language_loss": 0.76637918, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78735822, + "num_input_tokens_seen": 290387965, + "step": 13455, + "time_per_iteration": 2.783329963684082 + }, + { + "auxiliary_loss_clip": 0.01068171, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.03004837, + "balance_loss_mlp": 1.02134132, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.8260330808899186, + "language_loss": 0.78457689, + "learning_rate": 3.70461401253471e-07, + "loss": 0.80559325, + "num_input_tokens_seen": 290404150, + "step": 13456, + "time_per_iteration": 2.667520761489868 + }, + { + "auxiliary_loss_clip": 0.01097332, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.0353483, + "balance_loss_mlp": 1.02495217, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 1.8404621594900699, + "language_loss": 0.7179209, + "learning_rate": 3.702356279949801e-07, + "loss": 0.73925138, + "num_input_tokens_seen": 290422370, + "step": 13457, + "time_per_iteration": 2.5685808658599854 + }, + { + "auxiliary_loss_clip": 0.01075068, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.03245497, + "balance_loss_mlp": 1.01857364, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 1.8384143922330216, + "language_loss": 0.72827888, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74932086, + "num_input_tokens_seen": 290442645, + "step": 13458, + "time_per_iteration": 2.660276174545288 + }, + { + "auxiliary_loss_clip": 0.01085642, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.03278899, + "balance_loss_mlp": 1.02152777, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 3.5864400730484056, + "language_loss": 0.79166311, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.81284916, + "num_input_tokens_seen": 290458520, + "step": 13459, + "time_per_iteration": 2.4780025482177734 + }, + { + "auxiliary_loss_clip": 0.01073084, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.03457212, + "balance_loss_mlp": 1.01707518, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 2.084512718758406, + "language_loss": 0.7978065, + "learning_rate": 3.695586790587113e-07, + "loss": 0.81882429, + "num_input_tokens_seen": 290474465, + "step": 13460, + "time_per_iteration": 2.581963539123535 + }, + { + "auxiliary_loss_clip": 0.01073722, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.03131115, + "balance_loss_mlp": 1.02053165, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.7171467451689477, + "language_loss": 0.84670293, + "learning_rate": 3.693331530548789e-07, + "loss": 0.8677696, + "num_input_tokens_seen": 290492060, + "step": 13461, + "time_per_iteration": 2.6148505210876465 + }, + { + "auxiliary_loss_clip": 0.01087911, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.033849, + "balance_loss_mlp": 1.02138972, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.8605375911340252, + "language_loss": 0.76418865, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78539431, + "num_input_tokens_seen": 290511510, + "step": 13462, + "time_per_iteration": 2.580796241760254 + }, + { + "auxiliary_loss_clip": 0.01077684, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.03531933, + "balance_loss_mlp": 1.01778364, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 1.699769864255784, + "language_loss": 0.83305395, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85412943, + "num_input_tokens_seen": 290530035, + "step": 13463, + "time_per_iteration": 2.6356289386749268 + }, + { + "auxiliary_loss_clip": 0.01096557, + "auxiliary_loss_mlp": 0.01033812, + "balance_loss_clip": 1.03429389, + "balance_loss_mlp": 1.02374458, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 1.88709476317238, + "language_loss": 0.62523985, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64654356, + "num_input_tokens_seen": 290548245, + "step": 13464, + "time_per_iteration": 3.975297451019287 + }, + { + "auxiliary_loss_clip": 0.01095266, + "auxiliary_loss_mlp": 0.01023628, + "balance_loss_clip": 1.03336906, + "balance_loss_mlp": 1.01370406, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.4884194156890875, + "language_loss": 0.6165024, + "learning_rate": 3.684316674755341e-07, + "loss": 0.63769132, + "num_input_tokens_seen": 290568625, + "step": 13465, + "time_per_iteration": 2.5369603633880615 + }, + { + "auxiliary_loss_clip": 0.01087284, + "auxiliary_loss_mlp": 0.01034736, + "balance_loss_clip": 1.03512788, + "balance_loss_mlp": 1.02393579, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 1.826794556628704, + "language_loss": 0.82020634, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84142655, + "num_input_tokens_seen": 290586575, + "step": 13466, + "time_per_iteration": 2.5561959743499756 + }, + { + "auxiliary_loss_clip": 0.01081748, + "auxiliary_loss_mlp": 0.00749454, + "balance_loss_clip": 1.03569973, + "balance_loss_mlp": 1.00024045, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 2.214990215572089, + "language_loss": 0.7586453, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.77695739, + "num_input_tokens_seen": 290606790, + "step": 13467, + "time_per_iteration": 2.716783285140991 + }, + { + "auxiliary_loss_clip": 0.01059505, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.0276109, + "balance_loss_mlp": 1.01647246, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 2.5445176518816575, + "language_loss": 0.79354537, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81441569, + "num_input_tokens_seen": 290625525, + "step": 13468, + "time_per_iteration": 2.6849164962768555 + }, + { + "auxiliary_loss_clip": 0.0108293, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.03161287, + "balance_loss_mlp": 1.01578712, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 2.236301025440437, + "language_loss": 0.67616385, + "learning_rate": 3.675311718038978e-07, + "loss": 0.69725466, + "num_input_tokens_seen": 290644935, + "step": 13469, + "time_per_iteration": 2.5825397968292236 + }, + { + "auxiliary_loss_clip": 0.00996086, + "auxiliary_loss_mlp": 0.01001373, + "balance_loss_clip": 1.00602794, + "balance_loss_mlp": 1.00013912, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.6968552625030927, + "language_loss": 0.54655623, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56653082, + "num_input_tokens_seen": 290710735, + "step": 13470, + "time_per_iteration": 3.3166255950927734 + }, + { + "auxiliary_loss_clip": 0.0109648, + "auxiliary_loss_mlp": 0.01029097, + "balance_loss_clip": 1.03297281, + "balance_loss_mlp": 1.01891613, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 2.4654978913936567, + "language_loss": 0.69296789, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71422362, + "num_input_tokens_seen": 290729565, + "step": 13471, + "time_per_iteration": 4.241477727890015 + }, + { + "auxiliary_loss_clip": 0.01087656, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.0338738, + "balance_loss_mlp": 1.01624298, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.6854161103843412, + "language_loss": 0.79899192, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.82013577, + "num_input_tokens_seen": 290749360, + "step": 13472, + "time_per_iteration": 2.6819756031036377 + }, + { + "auxiliary_loss_clip": 0.01015995, + "auxiliary_loss_mlp": 0.00999204, + "balance_loss_clip": 1.00604212, + "balance_loss_mlp": 0.9982565, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7481524380613808, + "language_loss": 0.57805586, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59820795, + "num_input_tokens_seen": 290812145, + "step": 13473, + "time_per_iteration": 3.0641448497772217 + }, + { + "auxiliary_loss_clip": 0.01045398, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.03274465, + "balance_loss_mlp": 1.01784515, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 1.7693734877069633, + "language_loss": 0.73684561, + "learning_rate": 3.664069451043399e-07, + "loss": 0.75759208, + "num_input_tokens_seen": 290829845, + "step": 13474, + "time_per_iteration": 2.685502052307129 + }, + { + "auxiliary_loss_clip": 0.01088463, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.03611016, + "balance_loss_mlp": 1.02363813, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.6600595459041432, + "language_loss": 0.78802466, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80925632, + "num_input_tokens_seen": 290848815, + "step": 13475, + "time_per_iteration": 2.5221848487854004 + }, + { + "auxiliary_loss_clip": 0.01085531, + "auxiliary_loss_mlp": 0.01029465, + "balance_loss_clip": 1.03293037, + "balance_loss_mlp": 1.01886129, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.6074220681944071, + "language_loss": 0.75254351, + "learning_rate": 3.659576879869364e-07, + "loss": 0.7736935, + "num_input_tokens_seen": 290868580, + "step": 13476, + "time_per_iteration": 2.6202547550201416 + }, + { + "auxiliary_loss_clip": 0.01076756, + "auxiliary_loss_mlp": 0.01034105, + "balance_loss_clip": 1.03176451, + "balance_loss_mlp": 1.02224934, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 2.8983613179357017, + "language_loss": 0.74107432, + "learning_rate": 3.657331523685485e-07, + "loss": 0.76218289, + "num_input_tokens_seen": 290883540, + "step": 13477, + "time_per_iteration": 2.633667469024658 + }, + { + "auxiliary_loss_clip": 0.01070961, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.03389752, + "balance_loss_mlp": 1.01979387, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 1.776546682550784, + "language_loss": 0.69301462, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.71403432, + "num_input_tokens_seen": 290901560, + "step": 13478, + "time_per_iteration": 2.6777477264404297 + }, + { + "auxiliary_loss_clip": 0.01023481, + "auxiliary_loss_mlp": 0.00998085, + "balance_loss_clip": 1.00351989, + "balance_loss_mlp": 0.99701846, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6951177840036883, + "language_loss": 0.52162158, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54183733, + "num_input_tokens_seen": 290959185, + "step": 13479, + "time_per_iteration": 4.521913051605225 + }, + { + "auxiliary_loss_clip": 0.01039874, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.0287056, + "balance_loss_mlp": 1.01995862, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.6152180888058425, + "language_loss": 0.71918756, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73989826, + "num_input_tokens_seen": 290979585, + "step": 13480, + "time_per_iteration": 2.6696527004241943 + }, + { + "auxiliary_loss_clip": 0.01096701, + "auxiliary_loss_mlp": 0.01026812, + "balance_loss_clip": 1.0332551, + "balance_loss_mlp": 1.016536, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 2.1740671885646328, + "language_loss": 0.79800546, + "learning_rate": 3.648356296957327e-07, + "loss": 0.81924057, + "num_input_tokens_seen": 291000865, + "step": 13481, + "time_per_iteration": 2.549821376800537 + }, + { + "auxiliary_loss_clip": 0.01072715, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.03129363, + "balance_loss_mlp": 1.01805079, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.8130354660672099, + "language_loss": 0.72615647, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74717391, + "num_input_tokens_seen": 291018285, + "step": 13482, + "time_per_iteration": 2.58217716217041 + }, + { + "auxiliary_loss_clip": 0.01033336, + "auxiliary_loss_mlp": 0.01025418, + "balance_loss_clip": 1.02978516, + "balance_loss_mlp": 1.01427174, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.3720406060702075, + "language_loss": 0.65422308, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67481065, + "num_input_tokens_seen": 291035745, + "step": 13483, + "time_per_iteration": 2.688680410385132 + }, + { + "auxiliary_loss_clip": 0.0106763, + "auxiliary_loss_mlp": 0.01025648, + "balance_loss_clip": 1.03136396, + "balance_loss_mlp": 1.01399553, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.7892189410450245, + "language_loss": 0.75996792, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78090078, + "num_input_tokens_seen": 291053280, + "step": 13484, + "time_per_iteration": 2.646131753921509 + }, + { + "auxiliary_loss_clip": 0.0107832, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.03369594, + "balance_loss_mlp": 1.02023196, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.5611533393382644, + "language_loss": 0.72248721, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74359488, + "num_input_tokens_seen": 291072855, + "step": 13485, + "time_per_iteration": 2.5936660766601562 + }, + { + "auxiliary_loss_clip": 0.01042045, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.02662373, + "balance_loss_mlp": 1.02085447, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.9876678554993825, + "language_loss": 0.76185077, + "learning_rate": 3.637151215443308e-07, + "loss": 0.78260434, + "num_input_tokens_seen": 291090285, + "step": 13486, + "time_per_iteration": 2.6475672721862793 + }, + { + "auxiliary_loss_clip": 0.01077719, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03298807, + "balance_loss_mlp": 1.01945233, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 1.9374712784183739, + "language_loss": 0.72201014, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74309176, + "num_input_tokens_seen": 291107675, + "step": 13487, + "time_per_iteration": 2.577749013900757 + }, + { + "auxiliary_loss_clip": 0.01033286, + "auxiliary_loss_mlp": 0.01032158, + "balance_loss_clip": 1.0326339, + "balance_loss_mlp": 1.02189398, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.5787792104045766, + "language_loss": 0.84267157, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86332601, + "num_input_tokens_seen": 291126900, + "step": 13488, + "time_per_iteration": 2.753434419631958 + }, + { + "auxiliary_loss_clip": 0.01099774, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.03534889, + "balance_loss_mlp": 1.01915669, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 2.006611525078567, + "language_loss": 0.74140429, + "learning_rate": 3.630435611625502e-07, + "loss": 0.76270723, + "num_input_tokens_seen": 291145285, + "step": 13489, + "time_per_iteration": 2.59306001663208 + }, + { + "auxiliary_loss_clip": 0.01053925, + "auxiliary_loss_mlp": 0.00749353, + "balance_loss_clip": 1.03307521, + "balance_loss_mlp": 1.00029778, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.696282047548993, + "language_loss": 0.71589828, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73393106, + "num_input_tokens_seen": 291163485, + "step": 13490, + "time_per_iteration": 4.169475317001343 + }, + { + "auxiliary_loss_clip": 0.01062896, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.03306448, + "balance_loss_mlp": 1.02556324, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 2.656558851956208, + "language_loss": 0.71876204, + "learning_rate": 3.625961645949762e-07, + "loss": 0.73978543, + "num_input_tokens_seen": 291182215, + "step": 13491, + "time_per_iteration": 2.6797807216644287 + }, + { + "auxiliary_loss_clip": 0.01097123, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.03279185, + "balance_loss_mlp": 1.01756084, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.6146188335145044, + "language_loss": 0.6816712, + "learning_rate": 3.623725594427245e-07, + "loss": 0.70292687, + "num_input_tokens_seen": 291203145, + "step": 13492, + "time_per_iteration": 2.5469141006469727 + }, + { + "auxiliary_loss_clip": 0.01054533, + "auxiliary_loss_mlp": 0.01030704, + "balance_loss_clip": 1.03304517, + "balance_loss_mlp": 1.01931381, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 2.157719699638554, + "language_loss": 0.71996635, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74081874, + "num_input_tokens_seen": 291220600, + "step": 13493, + "time_per_iteration": 2.7198519706726074 + }, + { + "auxiliary_loss_clip": 0.01077088, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.03048229, + "balance_loss_mlp": 1.02411437, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.7564040835328372, + "language_loss": 0.7066288, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72776634, + "num_input_tokens_seen": 291241195, + "step": 13494, + "time_per_iteration": 2.6720101833343506 + }, + { + "auxiliary_loss_clip": 0.01090045, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.03521299, + "balance_loss_mlp": 1.02025676, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 2.1178317184189868, + "language_loss": 0.7645123, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78573895, + "num_input_tokens_seen": 291258715, + "step": 13495, + "time_per_iteration": 2.577118158340454 + }, + { + "auxiliary_loss_clip": 0.01077306, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.03282773, + "balance_loss_mlp": 1.02223718, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 2.311801535333784, + "language_loss": 0.80561209, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82672143, + "num_input_tokens_seen": 291278030, + "step": 13496, + "time_per_iteration": 2.651017665863037 + }, + { + "auxiliary_loss_clip": 0.01086173, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.03224754, + "balance_loss_mlp": 1.0202949, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.614931329257403, + "language_loss": 0.70982486, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73100907, + "num_input_tokens_seen": 291296740, + "step": 13497, + "time_per_iteration": 2.544966459274292 + }, + { + "auxiliary_loss_clip": 0.01059461, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.03077388, + "balance_loss_mlp": 1.02007186, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.9095207722057936, + "language_loss": 0.7685141, + "learning_rate": 3.610322329047508e-07, + "loss": 0.78941584, + "num_input_tokens_seen": 291318730, + "step": 13498, + "time_per_iteration": 2.696096897125244 + }, + { + "auxiliary_loss_clip": 0.01098002, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.02137327, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 1.9629795564563548, + "language_loss": 0.83879024, + "learning_rate": 3.608090626234055e-07, + "loss": 0.86009312, + "num_input_tokens_seen": 291336755, + "step": 13499, + "time_per_iteration": 2.5768699645996094 + }, + { + "auxiliary_loss_clip": 0.01063287, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.03293955, + "balance_loss_mlp": 1.01715529, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.5321968178788978, + "language_loss": 0.76259899, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78352618, + "num_input_tokens_seen": 291356795, + "step": 13500, + "time_per_iteration": 2.7509164810180664 + }, + { + "auxiliary_loss_clip": 0.01005084, + "auxiliary_loss_mlp": 0.01002559, + "balance_loss_clip": 1.00481224, + "balance_loss_mlp": 1.00157571, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8023582334317666, + "language_loss": 0.59945995, + "learning_rate": 3.603629085440303e-07, + "loss": 0.6195364, + "num_input_tokens_seen": 291416005, + "step": 13501, + "time_per_iteration": 3.295264482498169 + }, + { + "auxiliary_loss_clip": 0.01082852, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.03237045, + "balance_loss_mlp": 1.01857114, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.4852899901223344, + "language_loss": 0.78890467, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81002218, + "num_input_tokens_seen": 291434870, + "step": 13502, + "time_per_iteration": 2.695035934448242 + }, + { + "auxiliary_loss_clip": 0.01066986, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.0293237, + "balance_loss_mlp": 1.02223659, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.7612737589959815, + "language_loss": 0.71738207, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73840004, + "num_input_tokens_seen": 291452230, + "step": 13503, + "time_per_iteration": 2.6130805015563965 + }, + { + "auxiliary_loss_clip": 0.01065073, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.03081739, + "balance_loss_mlp": 1.01486766, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.4881171420280337, + "language_loss": 0.67717791, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.69810283, + "num_input_tokens_seen": 291477425, + "step": 13504, + "time_per_iteration": 4.333235263824463 + }, + { + "auxiliary_loss_clip": 0.01071651, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.03320909, + "balance_loss_mlp": 1.01978731, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 2.5365566299968134, + "language_loss": 0.7455157, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76655388, + "num_input_tokens_seen": 291501070, + "step": 13505, + "time_per_iteration": 2.9168550968170166 + }, + { + "auxiliary_loss_clip": 0.01075863, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.03315306, + "balance_loss_mlp": 1.0158093, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 2.070113576761675, + "language_loss": 0.72483683, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.7458756, + "num_input_tokens_seen": 291524945, + "step": 13506, + "time_per_iteration": 2.715977907180786 + }, + { + "auxiliary_loss_clip": 0.01103132, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.03483319, + "balance_loss_mlp": 1.02005625, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 3.6256548580700914, + "language_loss": 0.75752521, + "learning_rate": 3.590259387812593e-07, + "loss": 0.77887034, + "num_input_tokens_seen": 291544605, + "step": 13507, + "time_per_iteration": 2.557852029800415 + }, + { + "auxiliary_loss_clip": 0.01098236, + "auxiliary_loss_mlp": 0.01026656, + "balance_loss_clip": 1.03250229, + "balance_loss_mlp": 1.01548588, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 5.848654631491006, + "language_loss": 0.70320272, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.7244516, + "num_input_tokens_seen": 291563850, + "step": 13508, + "time_per_iteration": 2.536414384841919 + }, + { + "auxiliary_loss_clip": 0.01086414, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.03379798, + "balance_loss_mlp": 1.0193572, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 1.5636056052017486, + "language_loss": 0.75824022, + "learning_rate": 3.585807799107785e-07, + "loss": 0.7794072, + "num_input_tokens_seen": 291581730, + "step": 13509, + "time_per_iteration": 2.576570749282837 + }, + { + "auxiliary_loss_clip": 0.01099358, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03443003, + "balance_loss_mlp": 1.02008295, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 3.11495920301033, + "language_loss": 0.76881969, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79012764, + "num_input_tokens_seen": 291601225, + "step": 13510, + "time_per_iteration": 2.5751781463623047 + }, + { + "auxiliary_loss_clip": 0.01086968, + "auxiliary_loss_mlp": 0.01037346, + "balance_loss_clip": 1.03240418, + "balance_loss_mlp": 1.02525842, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 2.2886025685036047, + "language_loss": 0.69409513, + "learning_rate": 3.581358700114212e-07, + "loss": 0.71533823, + "num_input_tokens_seen": 291616995, + "step": 13511, + "time_per_iteration": 4.063548564910889 + }, + { + "auxiliary_loss_clip": 0.01078334, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.03364146, + "balance_loss_mlp": 1.02102482, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.6367011320296587, + "language_loss": 0.79622114, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81732619, + "num_input_tokens_seen": 291636145, + "step": 13512, + "time_per_iteration": 2.576241970062256 + }, + { + "auxiliary_loss_clip": 0.0108355, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.03158975, + "balance_loss_mlp": 1.02009952, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.6577241220000518, + "language_loss": 0.63223112, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65337563, + "num_input_tokens_seen": 291662440, + "step": 13513, + "time_per_iteration": 2.9026100635528564 + }, + { + "auxiliary_loss_clip": 0.01047153, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.0299716, + "balance_loss_mlp": 1.02056706, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.7331304867550188, + "language_loss": 0.7114023, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73219609, + "num_input_tokens_seen": 291680950, + "step": 13514, + "time_per_iteration": 2.697603702545166 + }, + { + "auxiliary_loss_clip": 0.0106193, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.0326345, + "balance_loss_mlp": 1.01734352, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.5494346461071964, + "language_loss": 0.63231951, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.65322161, + "num_input_tokens_seen": 291702395, + "step": 13515, + "time_per_iteration": 2.6527326107025146 + }, + { + "auxiliary_loss_clip": 0.01091403, + "auxiliary_loss_mlp": 0.0074926, + "balance_loss_clip": 1.03213263, + "balance_loss_mlp": 1.00023174, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 1.546630368310801, + "language_loss": 0.7527492, + "learning_rate": 3.570246849544616e-07, + "loss": 0.77115577, + "num_input_tokens_seen": 291721135, + "step": 13516, + "time_per_iteration": 2.508146047592163 + }, + { + "auxiliary_loss_clip": 0.01046945, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.031636, + "balance_loss_mlp": 1.01814234, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.8367908194268954, + "language_loss": 0.91514909, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93590522, + "num_input_tokens_seen": 291741235, + "step": 13517, + "time_per_iteration": 2.708935499191284 + }, + { + "auxiliary_loss_clip": 0.01089224, + "auxiliary_loss_mlp": 0.00749336, + "balance_loss_clip": 1.03568518, + "balance_loss_mlp": 1.00027776, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.5000530814687902, + "language_loss": 0.79046744, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80885303, + "num_input_tokens_seen": 291761430, + "step": 13518, + "time_per_iteration": 2.6137120723724365 + }, + { + "auxiliary_loss_clip": 0.01083965, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.03369343, + "balance_loss_mlp": 1.01921046, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.6365829786049328, + "language_loss": 0.79112422, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81225669, + "num_input_tokens_seen": 291781755, + "step": 13519, + "time_per_iteration": 4.075148344039917 + }, + { + "auxiliary_loss_clip": 0.01097547, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.03359318, + "balance_loss_mlp": 1.0224762, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.5518361080476555, + "language_loss": 0.70210201, + "learning_rate": 3.561368582904905e-07, + "loss": 0.7234121, + "num_input_tokens_seen": 291804410, + "step": 13520, + "time_per_iteration": 2.587343215942383 + }, + { + "auxiliary_loss_clip": 0.01080705, + "auxiliary_loss_mlp": 0.01028387, + "balance_loss_clip": 1.03484702, + "balance_loss_mlp": 1.01719928, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.509014093726639, + "language_loss": 0.72572601, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74681693, + "num_input_tokens_seen": 291823285, + "step": 13521, + "time_per_iteration": 2.6352040767669678 + }, + { + "auxiliary_loss_clip": 0.01087695, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.03244567, + "balance_loss_mlp": 1.01837397, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.7435503798575311, + "language_loss": 0.70062459, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.72179759, + "num_input_tokens_seen": 291845305, + "step": 13522, + "time_per_iteration": 2.684410333633423 + }, + { + "auxiliary_loss_clip": 0.01081813, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.03325188, + "balance_loss_mlp": 1.0212481, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.6306423161479269, + "language_loss": 0.70638108, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72751004, + "num_input_tokens_seen": 291863715, + "step": 13523, + "time_per_iteration": 2.559196710586548 + }, + { + "auxiliary_loss_clip": 0.01084077, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.03144681, + "balance_loss_mlp": 1.01887751, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 1.928767425387882, + "language_loss": 0.70538759, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.72653151, + "num_input_tokens_seen": 291880735, + "step": 13524, + "time_per_iteration": 2.4969053268432617 + }, + { + "auxiliary_loss_clip": 0.01084171, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.03172374, + "balance_loss_mlp": 1.01759946, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.7611008500679584, + "language_loss": 0.62603903, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64715934, + "num_input_tokens_seen": 291900535, + "step": 13525, + "time_per_iteration": 2.6023058891296387 + }, + { + "auxiliary_loss_clip": 0.01067594, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.03281462, + "balance_loss_mlp": 1.02067041, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.5060721683284164, + "language_loss": 0.65553832, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67652345, + "num_input_tokens_seen": 291919760, + "step": 13526, + "time_per_iteration": 2.7235617637634277 + }, + { + "auxiliary_loss_clip": 0.0107245, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.03147602, + "balance_loss_mlp": 1.01699126, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.621802997972747, + "language_loss": 0.75276518, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77376378, + "num_input_tokens_seen": 291938915, + "step": 13527, + "time_per_iteration": 2.698453187942505 + }, + { + "auxiliary_loss_clip": 0.0109659, + "auxiliary_loss_mlp": 0.01026358, + "balance_loss_clip": 1.03363705, + "balance_loss_mlp": 1.0157249, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 1.5763376612701243, + "language_loss": 0.71106184, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.73229134, + "num_input_tokens_seen": 291958145, + "step": 13528, + "time_per_iteration": 2.5690174102783203 + }, + { + "auxiliary_loss_clip": 0.01098223, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.03350282, + "balance_loss_mlp": 1.01749063, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 2.2823980797620265, + "language_loss": 0.68391496, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.70517671, + "num_input_tokens_seen": 291976860, + "step": 13529, + "time_per_iteration": 2.499581813812256 + }, + { + "auxiliary_loss_clip": 0.01076852, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.03250444, + "balance_loss_mlp": 1.01702642, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.3509480922943153, + "language_loss": 0.77809513, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79913682, + "num_input_tokens_seen": 291998085, + "step": 13530, + "time_per_iteration": 2.6428213119506836 + }, + { + "auxiliary_loss_clip": 0.01085438, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.03305399, + "balance_loss_mlp": 1.01992059, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 2.8000820581536447, + "language_loss": 0.8217153, + "learning_rate": 3.537004792574052e-07, + "loss": 0.84288549, + "num_input_tokens_seen": 292016585, + "step": 13531, + "time_per_iteration": 4.053797245025635 + }, + { + "auxiliary_loss_clip": 0.01071122, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.03037465, + "balance_loss_mlp": 1.0183847, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 2.293987684198012, + "language_loss": 0.71753263, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73856354, + "num_input_tokens_seen": 292033255, + "step": 13532, + "time_per_iteration": 2.707310438156128 + }, + { + "auxiliary_loss_clip": 0.01062624, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.03183377, + "balance_loss_mlp": 1.01601553, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 1.738028244594421, + "language_loss": 0.76265049, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78354412, + "num_input_tokens_seen": 292051800, + "step": 13533, + "time_per_iteration": 2.6901168823242188 + }, + { + "auxiliary_loss_clip": 0.01101613, + "auxiliary_loss_mlp": 0.00749436, + "balance_loss_clip": 1.03419793, + "balance_loss_mlp": 1.00020134, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 2.1119347910016053, + "language_loss": 0.76311028, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78162074, + "num_input_tokens_seen": 292072215, + "step": 13534, + "time_per_iteration": 2.549053430557251 + }, + { + "auxiliary_loss_clip": 0.01078597, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.03334737, + "balance_loss_mlp": 1.01858568, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.0918026342034635, + "language_loss": 0.92923921, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95030892, + "num_input_tokens_seen": 292088830, + "step": 13535, + "time_per_iteration": 2.4846603870391846 + }, + { + "auxiliary_loss_clip": 0.01058209, + "auxiliary_loss_mlp": 0.01025846, + "balance_loss_clip": 1.03397393, + "balance_loss_mlp": 1.01523602, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.689045696418099, + "language_loss": 0.70423406, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72507465, + "num_input_tokens_seen": 292109225, + "step": 13536, + "time_per_iteration": 2.607536554336548 + }, + { + "auxiliary_loss_clip": 0.01059152, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.03248823, + "balance_loss_mlp": 1.01908827, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 2.5062735581402507, + "language_loss": 0.75416446, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77505732, + "num_input_tokens_seen": 292129660, + "step": 13537, + "time_per_iteration": 2.610081911087036 + }, + { + "auxiliary_loss_clip": 0.01069873, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.03274846, + "balance_loss_mlp": 1.01979852, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.8931086537547372, + "language_loss": 0.76258504, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.7836079, + "num_input_tokens_seen": 292149090, + "step": 13538, + "time_per_iteration": 2.645601272583008 + }, + { + "auxiliary_loss_clip": 0.01080597, + "auxiliary_loss_mlp": 0.0102728, + "balance_loss_clip": 1.03125882, + "balance_loss_mlp": 1.01655674, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.6224041677604744, + "language_loss": 0.77946043, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80053926, + "num_input_tokens_seen": 292169260, + "step": 13539, + "time_per_iteration": 2.537677764892578 + }, + { + "auxiliary_loss_clip": 0.01060909, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.04182172, + "balance_loss_mlp": 1.01967454, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 2.4379172470882717, + "language_loss": 0.65554976, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.6764611, + "num_input_tokens_seen": 292188145, + "step": 13540, + "time_per_iteration": 2.8072831630706787 + }, + { + "auxiliary_loss_clip": 0.01089009, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.03495264, + "balance_loss_mlp": 1.02234948, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.7507147019082876, + "language_loss": 0.67600453, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69722044, + "num_input_tokens_seen": 292212135, + "step": 13541, + "time_per_iteration": 2.6177666187286377 + }, + { + "auxiliary_loss_clip": 0.01095309, + "auxiliary_loss_mlp": 0.01033926, + "balance_loss_clip": 1.03295565, + "balance_loss_mlp": 1.02278602, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 1.7946713050795342, + "language_loss": 0.68765867, + "learning_rate": 3.512716539904355e-07, + "loss": 0.708951, + "num_input_tokens_seen": 292230645, + "step": 13542, + "time_per_iteration": 2.5461373329162598 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.03341413, + "balance_loss_mlp": 1.02268577, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 2.6870108747265697, + "language_loss": 0.79363108, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.814973, + "num_input_tokens_seen": 292243540, + "step": 13543, + "time_per_iteration": 2.4487545490264893 + }, + { + "auxiliary_loss_clip": 0.0106887, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.03372037, + "balance_loss_mlp": 1.02648592, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 2.451557478313559, + "language_loss": 0.77591264, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79698128, + "num_input_tokens_seen": 292261715, + "step": 13544, + "time_per_iteration": 2.593214511871338 + }, + { + "auxiliary_loss_clip": 0.01103743, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0350759, + "balance_loss_mlp": 1.01764822, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 2.9244370322735076, + "language_loss": 0.73639023, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75772917, + "num_input_tokens_seen": 292275080, + "step": 13545, + "time_per_iteration": 4.000625848770142 + }, + { + "auxiliary_loss_clip": 0.01084965, + "auxiliary_loss_mlp": 0.01027891, + "balance_loss_clip": 1.03311217, + "balance_loss_mlp": 1.01740062, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 1.7027499005461413, + "language_loss": 0.76697838, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78810692, + "num_input_tokens_seen": 292294635, + "step": 13546, + "time_per_iteration": 2.539142370223999 + }, + { + "auxiliary_loss_clip": 0.01090176, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.03600919, + "balance_loss_mlp": 1.01746082, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 2.1497207129568134, + "language_loss": 0.70541728, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72659618, + "num_input_tokens_seen": 292312695, + "step": 13547, + "time_per_iteration": 2.5142781734466553 + }, + { + "auxiliary_loss_clip": 0.0110113, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.03511643, + "balance_loss_mlp": 1.02065289, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 1.8511713191108818, + "language_loss": 0.70550501, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72685033, + "num_input_tokens_seen": 292332005, + "step": 13548, + "time_per_iteration": 2.6263232231140137 + }, + { + "auxiliary_loss_clip": 0.01087804, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.03530836, + "balance_loss_mlp": 1.02141976, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.046603846435358, + "language_loss": 0.77148819, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.79269898, + "num_input_tokens_seen": 292348365, + "step": 13549, + "time_per_iteration": 2.5676772594451904 + }, + { + "auxiliary_loss_clip": 0.01098175, + "auxiliary_loss_mlp": 0.01025638, + "balance_loss_clip": 1.03499806, + "balance_loss_mlp": 1.01443803, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 4.716891115005829, + "language_loss": 0.7092309, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73046911, + "num_input_tokens_seen": 292368050, + "step": 13550, + "time_per_iteration": 3.9495534896850586 + }, + { + "auxiliary_loss_clip": 0.01081057, + "auxiliary_loss_mlp": 0.0102436, + "balance_loss_clip": 1.03225207, + "balance_loss_mlp": 1.01419771, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.8427434825430655, + "language_loss": 0.72256482, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74361897, + "num_input_tokens_seen": 292385315, + "step": 13551, + "time_per_iteration": 2.5382399559020996 + }, + { + "auxiliary_loss_clip": 0.01059372, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.03407335, + "balance_loss_mlp": 1.0183084, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 2.0005594648415763, + "language_loss": 0.68606585, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70696515, + "num_input_tokens_seen": 292403375, + "step": 13552, + "time_per_iteration": 2.5766851902008057 + }, + { + "auxiliary_loss_clip": 0.01096566, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.03303981, + "balance_loss_mlp": 1.03021932, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 1.8030635330976492, + "language_loss": 0.82367897, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84505427, + "num_input_tokens_seen": 292419260, + "step": 13553, + "time_per_iteration": 2.4941277503967285 + }, + { + "auxiliary_loss_clip": 0.0108349, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.03116107, + "balance_loss_mlp": 1.01806188, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 2.7516110696608562, + "language_loss": 0.68056452, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70169026, + "num_input_tokens_seen": 292436095, + "step": 13554, + "time_per_iteration": 2.497912645339966 + }, + { + "auxiliary_loss_clip": 0.01065697, + "auxiliary_loss_mlp": 0.01037221, + "balance_loss_clip": 1.03227425, + "balance_loss_mlp": 1.02423882, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.6485039377232222, + "language_loss": 0.66173267, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68276179, + "num_input_tokens_seen": 292457190, + "step": 13555, + "time_per_iteration": 2.6504898071289062 + }, + { + "auxiliary_loss_clip": 0.0108876, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.03235316, + "balance_loss_mlp": 1.0214324, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 4.103515543150653, + "language_loss": 0.73247182, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75369346, + "num_input_tokens_seen": 292474300, + "step": 13556, + "time_per_iteration": 2.5459697246551514 + }, + { + "auxiliary_loss_clip": 0.01086863, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.03534842, + "balance_loss_mlp": 1.01783395, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 1.7366931809405703, + "language_loss": 0.80896091, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.83011049, + "num_input_tokens_seen": 292492420, + "step": 13557, + "time_per_iteration": 2.664745807647705 + }, + { + "auxiliary_loss_clip": 0.01077809, + "auxiliary_loss_mlp": 0.01034788, + "balance_loss_clip": 1.03323007, + "balance_loss_mlp": 1.0235641, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.7919954511282645, + "language_loss": 0.65601146, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.67713749, + "num_input_tokens_seen": 292512895, + "step": 13558, + "time_per_iteration": 2.6940996646881104 + }, + { + "auxiliary_loss_clip": 0.0102357, + "auxiliary_loss_mlp": 0.01010444, + "balance_loss_clip": 1.00360966, + "balance_loss_mlp": 1.00948429, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.7950630761480532, + "language_loss": 0.56944054, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.58978075, + "num_input_tokens_seen": 292566580, + "step": 13559, + "time_per_iteration": 4.591164827346802 + }, + { + "auxiliary_loss_clip": 0.01004206, + "auxiliary_loss_mlp": 0.01001502, + "balance_loss_clip": 1.00380731, + "balance_loss_mlp": 1.00021505, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6790419680181253, + "language_loss": 0.55288446, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57294154, + "num_input_tokens_seen": 292621490, + "step": 13560, + "time_per_iteration": 3.0266833305358887 + }, + { + "auxiliary_loss_clip": 0.01073071, + "auxiliary_loss_mlp": 0.01026578, + "balance_loss_clip": 1.03151727, + "balance_loss_mlp": 1.01596236, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.6508610987746568, + "language_loss": 0.6724273, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69342375, + "num_input_tokens_seen": 292638660, + "step": 13561, + "time_per_iteration": 2.5728814601898193 + }, + { + "auxiliary_loss_clip": 0.01091006, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.03491664, + "balance_loss_mlp": 1.02121949, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.5835952533292472, + "language_loss": 0.81574297, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83697498, + "num_input_tokens_seen": 292658545, + "step": 13562, + "time_per_iteration": 2.5608766078948975 + }, + { + "auxiliary_loss_clip": 0.01072996, + "auxiliary_loss_mlp": 0.01030397, + "balance_loss_clip": 1.03462934, + "balance_loss_mlp": 1.0193522, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.5977882897463023, + "language_loss": 0.7176016, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.73863554, + "num_input_tokens_seen": 292678460, + "step": 13563, + "time_per_iteration": 2.708317279815674 + }, + { + "auxiliary_loss_clip": 0.01011218, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.03186154, + "balance_loss_mlp": 1.01529145, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.48601572827096, + "language_loss": 0.6984396, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.71881986, + "num_input_tokens_seen": 292699815, + "step": 13564, + "time_per_iteration": 2.9313113689422607 + }, + { + "auxiliary_loss_clip": 0.01072988, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.03302073, + "balance_loss_mlp": 1.01624727, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 1.8097226927410361, + "language_loss": 0.69800133, + "learning_rate": 3.462176595017854e-07, + "loss": 0.71900487, + "num_input_tokens_seen": 292717370, + "step": 13565, + "time_per_iteration": 2.782510995864868 + }, + { + "auxiliary_loss_clip": 0.01086684, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.03363347, + "balance_loss_mlp": 1.01932871, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 2.123194559120468, + "language_loss": 0.79024374, + "learning_rate": 3.459986724180188e-07, + "loss": 0.8114115, + "num_input_tokens_seen": 292737110, + "step": 13566, + "time_per_iteration": 2.5738766193389893 + }, + { + "auxiliary_loss_clip": 0.01075557, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.0340333, + "balance_loss_mlp": 1.01925707, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.6626420017804986, + "language_loss": 0.82487273, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84591967, + "num_input_tokens_seen": 292756510, + "step": 13567, + "time_per_iteration": 2.6971399784088135 + }, + { + "auxiliary_loss_clip": 0.01095794, + "auxiliary_loss_mlp": 0.01028942, + "balance_loss_clip": 1.03424048, + "balance_loss_mlp": 1.01953673, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.047150985925312, + "language_loss": 0.79566061, + "learning_rate": 3.455608864184771e-07, + "loss": 0.816908, + "num_input_tokens_seen": 292776710, + "step": 13568, + "time_per_iteration": 2.5448203086853027 + }, + { + "auxiliary_loss_clip": 0.01076896, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.035653, + "balance_loss_mlp": 1.01613021, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.859329012832393, + "language_loss": 0.77158499, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79262042, + "num_input_tokens_seen": 292794350, + "step": 13569, + "time_per_iteration": 2.6273672580718994 + }, + { + "auxiliary_loss_clip": 0.01095352, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03339934, + "balance_loss_mlp": 1.02033734, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.3720434129873533, + "language_loss": 0.5808903, + "learning_rate": 3.451233513649199e-07, + "loss": 0.60214746, + "num_input_tokens_seen": 292814005, + "step": 13570, + "time_per_iteration": 2.5492358207702637 + }, + { + "auxiliary_loss_clip": 0.01086226, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.032511, + "balance_loss_mlp": 1.02504301, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 2.5397878232998012, + "language_loss": 0.82260621, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84383893, + "num_input_tokens_seen": 292833485, + "step": 13571, + "time_per_iteration": 4.2150397300720215 + }, + { + "auxiliary_loss_clip": 0.01067569, + "auxiliary_loss_mlp": 0.01041166, + "balance_loss_clip": 1.03176689, + "balance_loss_mlp": 1.02944207, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.1866000075131917, + "language_loss": 0.78315538, + "learning_rate": 3.446860673237142e-07, + "loss": 0.80424273, + "num_input_tokens_seen": 292848045, + "step": 13572, + "time_per_iteration": 2.577907085418701 + }, + { + "auxiliary_loss_clip": 0.01099058, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.03403735, + "balance_loss_mlp": 1.0215137, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.5608386013047408, + "language_loss": 0.65130854, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.67262095, + "num_input_tokens_seen": 292869965, + "step": 13573, + "time_per_iteration": 2.502681255340576 + }, + { + "auxiliary_loss_clip": 0.0106368, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.03425407, + "balance_loss_mlp": 1.01980925, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.4698851356723988, + "language_loss": 0.75197923, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77291751, + "num_input_tokens_seen": 292889680, + "step": 13574, + "time_per_iteration": 2.658029556274414 + }, + { + "auxiliary_loss_clip": 0.01089887, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.03447604, + "balance_loss_mlp": 1.01988494, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 1.6117238648539882, + "language_loss": 0.59244579, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61365694, + "num_input_tokens_seen": 292912360, + "step": 13575, + "time_per_iteration": 2.6408519744873047 + }, + { + "auxiliary_loss_clip": 0.01013994, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.02782416, + "balance_loss_mlp": 1.02285504, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 2.1274455906316674, + "language_loss": 0.74284697, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76334643, + "num_input_tokens_seen": 292928325, + "step": 13576, + "time_per_iteration": 2.741075277328491 + }, + { + "auxiliary_loss_clip": 0.01007628, + "auxiliary_loss_mlp": 0.00997889, + "balance_loss_clip": 1.00714064, + "balance_loss_mlp": 0.99691105, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8366328192823993, + "language_loss": 0.58620447, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60625964, + "num_input_tokens_seen": 292992795, + "step": 13577, + "time_per_iteration": 3.162802219390869 + }, + { + "auxiliary_loss_clip": 0.01031675, + "auxiliary_loss_mlp": 0.01028775, + "balance_loss_clip": 1.02805686, + "balance_loss_mlp": 1.01788497, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 2.0453995045108253, + "language_loss": 0.7094875, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73009199, + "num_input_tokens_seen": 293011950, + "step": 13578, + "time_per_iteration": 2.6645026206970215 + }, + { + "auxiliary_loss_clip": 0.01054555, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.03111315, + "balance_loss_mlp": 1.01944435, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 2.0085030577015965, + "language_loss": 0.73431283, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75515795, + "num_input_tokens_seen": 293030175, + "step": 13579, + "time_per_iteration": 2.6754069328308105 + }, + { + "auxiliary_loss_clip": 0.01097622, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.03259778, + "balance_loss_mlp": 1.01798439, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 1.9122945256960526, + "language_loss": 0.7920469, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81331253, + "num_input_tokens_seen": 293047980, + "step": 13580, + "time_per_iteration": 2.5340914726257324 + }, + { + "auxiliary_loss_clip": 0.01058503, + "auxiliary_loss_mlp": 0.01034035, + "balance_loss_clip": 1.03031206, + "balance_loss_mlp": 1.02288938, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.6731672906791095, + "language_loss": 0.68845981, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.70938516, + "num_input_tokens_seen": 293067030, + "step": 13581, + "time_per_iteration": 2.636247396469116 + }, + { + "auxiliary_loss_clip": 0.01095707, + "auxiliary_loss_mlp": 0.01025596, + "balance_loss_clip": 1.03296804, + "balance_loss_mlp": 1.01483142, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.6918991381651582, + "language_loss": 0.59813452, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61934757, + "num_input_tokens_seen": 293085575, + "step": 13582, + "time_per_iteration": 2.5259835720062256 + }, + { + "auxiliary_loss_clip": 0.0106213, + "auxiliary_loss_mlp": 0.00749219, + "balance_loss_clip": 1.03206563, + "balance_loss_mlp": 1.00024557, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.374678008703701, + "language_loss": 0.8242619, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.8423754, + "num_input_tokens_seen": 293108200, + "step": 13583, + "time_per_iteration": 2.6840333938598633 + }, + { + "auxiliary_loss_clip": 0.01073173, + "auxiliary_loss_mlp": 0.01026224, + "balance_loss_clip": 1.03164244, + "balance_loss_mlp": 1.01585245, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.748147281837402, + "language_loss": 0.74437433, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76536834, + "num_input_tokens_seen": 293126020, + "step": 13584, + "time_per_iteration": 2.583674430847168 + }, + { + "auxiliary_loss_clip": 0.01087673, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.03521705, + "balance_loss_mlp": 1.01505041, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.7292965630469643, + "language_loss": 0.74253106, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76366931, + "num_input_tokens_seen": 293144620, + "step": 13585, + "time_per_iteration": 4.207075834274292 + }, + { + "auxiliary_loss_clip": 0.01070553, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.03342032, + "balance_loss_mlp": 1.01812541, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.5469740351182906, + "language_loss": 0.69589472, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71689677, + "num_input_tokens_seen": 293162850, + "step": 13586, + "time_per_iteration": 2.579552173614502 + }, + { + "auxiliary_loss_clip": 0.01034734, + "auxiliary_loss_mlp": 0.01038431, + "balance_loss_clip": 1.02995682, + "balance_loss_mlp": 1.02743959, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.9673548964352323, + "language_loss": 0.60972029, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63045192, + "num_input_tokens_seen": 293181620, + "step": 13587, + "time_per_iteration": 2.7628719806671143 + }, + { + "auxiliary_loss_clip": 0.01088612, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.03259063, + "balance_loss_mlp": 1.02284968, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.832456118138699, + "language_loss": 0.69993395, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.72116029, + "num_input_tokens_seen": 293200270, + "step": 13588, + "time_per_iteration": 2.676166296005249 + }, + { + "auxiliary_loss_clip": 0.01072434, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.03318548, + "balance_loss_mlp": 1.027421, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.675234953020032, + "language_loss": 0.72869885, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.74983817, + "num_input_tokens_seen": 293218960, + "step": 13589, + "time_per_iteration": 2.576112985610962 + }, + { + "auxiliary_loss_clip": 0.01082642, + "auxiliary_loss_mlp": 0.01028335, + "balance_loss_clip": 1.033705, + "balance_loss_mlp": 1.0171591, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.7184091498399552, + "language_loss": 0.7375415, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75865126, + "num_input_tokens_seen": 293236450, + "step": 13590, + "time_per_iteration": 4.14914345741272 + }, + { + "auxiliary_loss_clip": 0.01100983, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.03472996, + "balance_loss_mlp": 1.01945305, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 1.897666390010847, + "language_loss": 0.65090299, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67222583, + "num_input_tokens_seen": 293256480, + "step": 13591, + "time_per_iteration": 2.5866799354553223 + }, + { + "auxiliary_loss_clip": 0.01098878, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.03285372, + "balance_loss_mlp": 1.01982093, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 2.072223169975596, + "language_loss": 0.68304813, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70434815, + "num_input_tokens_seen": 293274960, + "step": 13592, + "time_per_iteration": 2.4975569248199463 + }, + { + "auxiliary_loss_clip": 0.01070339, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.03068876, + "balance_loss_mlp": 1.01736665, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 1.8176702577202108, + "language_loss": 0.66493261, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68592358, + "num_input_tokens_seen": 293295945, + "step": 13593, + "time_per_iteration": 2.637274742126465 + }, + { + "auxiliary_loss_clip": 0.01083368, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03029931, + "balance_loss_mlp": 1.02172053, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 3.3532644264217, + "language_loss": 0.69675159, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71790808, + "num_input_tokens_seen": 293313300, + "step": 13594, + "time_per_iteration": 2.540679931640625 + }, + { + "auxiliary_loss_clip": 0.01097311, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.03228319, + "balance_loss_mlp": 1.01801753, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.722156249432153, + "language_loss": 0.66019726, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.68146092, + "num_input_tokens_seen": 293333085, + "step": 13595, + "time_per_iteration": 2.516568422317505 + }, + { + "auxiliary_loss_clip": 0.01056801, + "auxiliary_loss_mlp": 0.01028055, + "balance_loss_clip": 1.03270805, + "balance_loss_mlp": 1.01662278, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.4938211124596081, + "language_loss": 0.78729004, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80813861, + "num_input_tokens_seen": 293351895, + "step": 13596, + "time_per_iteration": 2.616440534591675 + }, + { + "auxiliary_loss_clip": 0.01069036, + "auxiliary_loss_mlp": 0.01025407, + "balance_loss_clip": 1.03015494, + "balance_loss_mlp": 1.01443374, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 2.2508959122004137, + "language_loss": 0.57917035, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60011476, + "num_input_tokens_seen": 293371165, + "step": 13597, + "time_per_iteration": 2.573882818222046 + }, + { + "auxiliary_loss_clip": 0.01056811, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.03566432, + "balance_loss_mlp": 1.01770413, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.9702051378856158, + "language_loss": 0.82327092, + "learning_rate": 3.390242470389462e-07, + "loss": 0.8441205, + "num_input_tokens_seen": 293391150, + "step": 13598, + "time_per_iteration": 2.6391079425811768 + }, + { + "auxiliary_loss_clip": 0.01038157, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.03439736, + "balance_loss_mlp": 1.01881826, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 1.7080011499393863, + "language_loss": 0.82680011, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.84747469, + "num_input_tokens_seen": 293409440, + "step": 13599, + "time_per_iteration": 2.8165676593780518 + }, + { + "auxiliary_loss_clip": 0.01042046, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.02809298, + "balance_loss_mlp": 1.02341759, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.726222569915703, + "language_loss": 0.840334, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.86110896, + "num_input_tokens_seen": 293428995, + "step": 13600, + "time_per_iteration": 4.222832202911377 + }, + { + "auxiliary_loss_clip": 0.01062962, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.03125441, + "balance_loss_mlp": 1.01890457, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.941765313313321, + "language_loss": 0.74157691, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76250601, + "num_input_tokens_seen": 293449155, + "step": 13601, + "time_per_iteration": 2.6560521125793457 + }, + { + "auxiliary_loss_clip": 0.01067583, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.03429794, + "balance_loss_mlp": 1.01939487, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.406861444181366, + "language_loss": 0.68146098, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.702443, + "num_input_tokens_seen": 293466125, + "step": 13602, + "time_per_iteration": 2.5873982906341553 + }, + { + "auxiliary_loss_clip": 0.01055398, + "auxiliary_loss_mlp": 0.01030486, + "balance_loss_clip": 1.03106225, + "balance_loss_mlp": 1.01883888, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.312529510719308, + "language_loss": 0.83631587, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85717475, + "num_input_tokens_seen": 293481345, + "step": 13603, + "time_per_iteration": 2.6164047718048096 + }, + { + "auxiliary_loss_clip": 0.01050508, + "auxiliary_loss_mlp": 0.01025179, + "balance_loss_clip": 1.03388727, + "balance_loss_mlp": 1.01474237, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.7542663174553326, + "language_loss": 0.691401, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71215785, + "num_input_tokens_seen": 293502330, + "step": 13604, + "time_per_iteration": 2.7111949920654297 + }, + { + "auxiliary_loss_clip": 0.01068959, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.03498936, + "balance_loss_mlp": 1.01934206, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 2.02724544317168, + "language_loss": 0.74217451, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76316488, + "num_input_tokens_seen": 293521415, + "step": 13605, + "time_per_iteration": 2.5951120853424072 + }, + { + "auxiliary_loss_clip": 0.0106083, + "auxiliary_loss_mlp": 0.01040803, + "balance_loss_clip": 1.03555834, + "balance_loss_mlp": 1.02931094, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.7327580347534066, + "language_loss": 0.73933578, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76035202, + "num_input_tokens_seen": 293539245, + "step": 13606, + "time_per_iteration": 2.6081275939941406 + }, + { + "auxiliary_loss_clip": 0.01097435, + "auxiliary_loss_mlp": 0.01027218, + "balance_loss_clip": 1.03498983, + "balance_loss_mlp": 1.0165664, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.6397518933737965, + "language_loss": 0.65555024, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67679673, + "num_input_tokens_seen": 293560640, + "step": 13607, + "time_per_iteration": 2.6097218990325928 + }, + { + "auxiliary_loss_clip": 0.01075796, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.03362846, + "balance_loss_mlp": 1.01727998, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.7736237729673723, + "language_loss": 0.7051295, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72616988, + "num_input_tokens_seen": 293579465, + "step": 13608, + "time_per_iteration": 2.666520118713379 + }, + { + "auxiliary_loss_clip": 0.01080098, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.03069353, + "balance_loss_mlp": 1.02180636, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 1.7543258254749705, + "language_loss": 0.79712892, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81825405, + "num_input_tokens_seen": 293600540, + "step": 13609, + "time_per_iteration": 2.606492042541504 + }, + { + "auxiliary_loss_clip": 0.0099819, + "auxiliary_loss_mlp": 0.01000733, + "balance_loss_clip": 1.00715411, + "balance_loss_mlp": 0.99979174, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.7666250738879911, + "language_loss": 0.55961037, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.57959962, + "num_input_tokens_seen": 293665160, + "step": 13610, + "time_per_iteration": 3.2888386249542236 + }, + { + "auxiliary_loss_clip": 0.01029036, + "auxiliary_loss_mlp": 0.00749207, + "balance_loss_clip": 1.02777815, + "balance_loss_mlp": 1.00026953, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 2.3448662666869886, + "language_loss": 0.78070796, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79849041, + "num_input_tokens_seen": 293683995, + "step": 13611, + "time_per_iteration": 4.224119663238525 + }, + { + "auxiliary_loss_clip": 0.01062856, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.03065276, + "balance_loss_mlp": 1.01817322, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 1.9948363905561344, + "language_loss": 0.77135968, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.7922945, + "num_input_tokens_seen": 293704115, + "step": 13612, + "time_per_iteration": 2.704139232635498 + }, + { + "auxiliary_loss_clip": 0.01057696, + "auxiliary_loss_mlp": 0.01026527, + "balance_loss_clip": 1.02863014, + "balance_loss_mlp": 1.01587594, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 2.083735813764073, + "language_loss": 0.86162007, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88246238, + "num_input_tokens_seen": 293722225, + "step": 13613, + "time_per_iteration": 2.6758246421813965 + }, + { + "auxiliary_loss_clip": 0.01086905, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.03510594, + "balance_loss_mlp": 1.02689564, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 1.4321114338774974, + "language_loss": 0.72595298, + "learning_rate": 3.355612034397746e-07, + "loss": 0.74719262, + "num_input_tokens_seen": 293743995, + "step": 13614, + "time_per_iteration": 2.6404666900634766 + }, + { + "auxiliary_loss_clip": 0.01070156, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.03057086, + "balance_loss_mlp": 1.02441561, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.941990077707753, + "language_loss": 0.80860603, + "learning_rate": 3.353452993497479e-07, + "loss": 0.82966197, + "num_input_tokens_seen": 293764935, + "step": 13615, + "time_per_iteration": 2.6414387226104736 + }, + { + "auxiliary_loss_clip": 0.01080094, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.03020477, + "balance_loss_mlp": 1.0193553, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.9519401679041994, + "language_loss": 0.75616539, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77727145, + "num_input_tokens_seen": 293784035, + "step": 13616, + "time_per_iteration": 2.594846487045288 + }, + { + "auxiliary_loss_clip": 0.01051106, + "auxiliary_loss_mlp": 0.01037446, + "balance_loss_clip": 1.02707362, + "balance_loss_mlp": 1.02447581, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.8211141375415383, + "language_loss": 0.74863946, + "learning_rate": 3.349136805494979e-07, + "loss": 0.76952493, + "num_input_tokens_seen": 293803360, + "step": 13617, + "time_per_iteration": 2.616999387741089 + }, + { + "auxiliary_loss_clip": 0.0106735, + "auxiliary_loss_mlp": 0.01024287, + "balance_loss_clip": 1.02998626, + "balance_loss_mlp": 1.01462483, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 2.393579205996123, + "language_loss": 0.68411326, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70502967, + "num_input_tokens_seen": 293821325, + "step": 13618, + "time_per_iteration": 2.5765233039855957 + }, + { + "auxiliary_loss_clip": 0.01074804, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.03322744, + "balance_loss_mlp": 1.01983094, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 2.0628467030706426, + "language_loss": 0.70276046, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72382206, + "num_input_tokens_seen": 293840315, + "step": 13619, + "time_per_iteration": 2.6638734340667725 + }, + { + "auxiliary_loss_clip": 0.01038078, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.03406274, + "balance_loss_mlp": 1.01648843, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 2.0039144165216074, + "language_loss": 0.73861098, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.75927031, + "num_input_tokens_seen": 293855685, + "step": 13620, + "time_per_iteration": 2.6820952892303467 + }, + { + "auxiliary_loss_clip": 0.01065977, + "auxiliary_loss_mlp": 0.00749581, + "balance_loss_clip": 1.02986646, + "balance_loss_mlp": 1.00033236, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.5915108936488958, + "language_loss": 0.76102126, + "learning_rate": 3.340512006973011e-07, + "loss": 0.77917683, + "num_input_tokens_seen": 293875540, + "step": 13621, + "time_per_iteration": 2.611258029937744 + }, + { + "auxiliary_loss_clip": 0.01066839, + "auxiliary_loss_mlp": 0.01026999, + "balance_loss_clip": 1.02881086, + "balance_loss_mlp": 1.01571631, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.9555413063252725, + "language_loss": 0.65477574, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67571414, + "num_input_tokens_seen": 293896570, + "step": 13622, + "time_per_iteration": 2.628209352493286 + }, + { + "auxiliary_loss_clip": 0.01100597, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.03603923, + "balance_loss_mlp": 1.01649904, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.914913337693934, + "language_loss": 0.74988139, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.7711724, + "num_input_tokens_seen": 293914680, + "step": 13623, + "time_per_iteration": 2.5445566177368164 + }, + { + "auxiliary_loss_clip": 0.01073371, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.03135955, + "balance_loss_mlp": 1.02080452, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 1.7967241044598357, + "language_loss": 0.63250726, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.65356207, + "num_input_tokens_seen": 293936480, + "step": 13624, + "time_per_iteration": 2.8402397632598877 + }, + { + "auxiliary_loss_clip": 0.01094751, + "auxiliary_loss_mlp": 0.01031762, + "balance_loss_clip": 1.03291988, + "balance_loss_mlp": 1.02114069, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.4430080989775533, + "language_loss": 0.78101468, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80227977, + "num_input_tokens_seen": 293957815, + "step": 13625, + "time_per_iteration": 4.18574333190918 + }, + { + "auxiliary_loss_clip": 0.01088115, + "auxiliary_loss_mlp": 0.00749631, + "balance_loss_clip": 1.03128505, + "balance_loss_mlp": 1.00029731, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 3.7402799747040434, + "language_loss": 0.75869417, + "learning_rate": 3.329745223345244e-07, + "loss": 0.77707165, + "num_input_tokens_seen": 293975440, + "step": 13626, + "time_per_iteration": 2.584418296813965 + }, + { + "auxiliary_loss_clip": 0.01084762, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.03311241, + "balance_loss_mlp": 1.02160048, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.5472868047234403, + "language_loss": 0.73544693, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75661027, + "num_input_tokens_seen": 293997540, + "step": 13627, + "time_per_iteration": 2.6209089756011963 + }, + { + "auxiliary_loss_clip": 0.01097378, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.03386188, + "balance_loss_mlp": 1.01814628, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 1.8285945723334294, + "language_loss": 0.68832421, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.70959556, + "num_input_tokens_seen": 294017030, + "step": 13628, + "time_per_iteration": 2.5887653827667236 + }, + { + "auxiliary_loss_clip": 0.01072369, + "auxiliary_loss_mlp": 0.01036074, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.02320504, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.7822152179779007, + "language_loss": 0.8553077, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87639207, + "num_input_tokens_seen": 294035700, + "step": 13629, + "time_per_iteration": 2.5805859565734863 + }, + { + "auxiliary_loss_clip": 0.01095633, + "auxiliary_loss_mlp": 0.01025277, + "balance_loss_clip": 1.03337264, + "balance_loss_mlp": 1.01481676, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 4.613033778867546, + "language_loss": 0.73902237, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.76023149, + "num_input_tokens_seen": 294049730, + "step": 13630, + "time_per_iteration": 4.002075672149658 + }, + { + "auxiliary_loss_clip": 0.01081652, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.03223133, + "balance_loss_mlp": 1.0222187, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 2.138329048205458, + "language_loss": 0.72107053, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74223471, + "num_input_tokens_seen": 294066545, + "step": 13631, + "time_per_iteration": 2.491497278213501 + }, + { + "auxiliary_loss_clip": 0.01096225, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.03251529, + "balance_loss_mlp": 1.02099586, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.5984941871574783, + "language_loss": 0.76380211, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78507805, + "num_input_tokens_seen": 294087455, + "step": 13632, + "time_per_iteration": 2.567303419113159 + }, + { + "auxiliary_loss_clip": 0.01066822, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.02924967, + "balance_loss_mlp": 1.01875877, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 2.278585133266142, + "language_loss": 0.66022378, + "learning_rate": 3.314698278332588e-07, + "loss": 0.68119073, + "num_input_tokens_seen": 294107480, + "step": 13633, + "time_per_iteration": 2.6794588565826416 + }, + { + "auxiliary_loss_clip": 0.01077729, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.03187895, + "balance_loss_mlp": 1.021806, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.4569122906219256, + "language_loss": 0.7569207, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77801585, + "num_input_tokens_seen": 294130115, + "step": 13634, + "time_per_iteration": 2.617262363433838 + }, + { + "auxiliary_loss_clip": 0.01035474, + "auxiliary_loss_mlp": 0.00749195, + "balance_loss_clip": 1.0322299, + "balance_loss_mlp": 1.00025344, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 3.614107670938325, + "language_loss": 0.81838167, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83622837, + "num_input_tokens_seen": 294148495, + "step": 13635, + "time_per_iteration": 2.8049004077911377 + }, + { + "auxiliary_loss_clip": 0.0108165, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.03084457, + "balance_loss_mlp": 1.01863241, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.7036739313194191, + "language_loss": 0.75811702, + "learning_rate": 3.308259076607949e-07, + "loss": 0.77923441, + "num_input_tokens_seen": 294169595, + "step": 13636, + "time_per_iteration": 2.626983404159546 + }, + { + "auxiliary_loss_clip": 0.01070404, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.03617215, + "balance_loss_mlp": 1.02233887, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 2.484251133089055, + "language_loss": 0.80984426, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83088523, + "num_input_tokens_seen": 294183885, + "step": 13637, + "time_per_iteration": 2.714693307876587 + }, + { + "auxiliary_loss_clip": 0.01085214, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.03393817, + "balance_loss_mlp": 1.02068496, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.2924833713090407, + "language_loss": 0.71162963, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73279297, + "num_input_tokens_seen": 294200150, + "step": 13638, + "time_per_iteration": 2.6538984775543213 + }, + { + "auxiliary_loss_clip": 0.01047848, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.02869976, + "balance_loss_mlp": 1.01705742, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 2.6782314579107087, + "language_loss": 0.79082417, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81160146, + "num_input_tokens_seen": 294220385, + "step": 13639, + "time_per_iteration": 2.76300311088562 + }, + { + "auxiliary_loss_clip": 0.01046193, + "auxiliary_loss_mlp": 0.01026521, + "balance_loss_clip": 1.02949166, + "balance_loss_mlp": 1.01520157, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.6737498788949754, + "language_loss": 0.789096, + "learning_rate": 3.299682336022589e-07, + "loss": 0.80982316, + "num_input_tokens_seen": 294239355, + "step": 13640, + "time_per_iteration": 4.306700229644775 + }, + { + "auxiliary_loss_clip": 0.01061543, + "auxiliary_loss_mlp": 0.01028982, + "balance_loss_clip": 1.03103149, + "balance_loss_mlp": 1.01785946, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 3.745204003378534, + "language_loss": 0.63566029, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65656549, + "num_input_tokens_seen": 294259395, + "step": 13641, + "time_per_iteration": 2.9218668937683105 + }, + { + "auxiliary_loss_clip": 0.01041041, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.03070116, + "balance_loss_mlp": 1.01532483, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.8145567631177524, + "language_loss": 0.73065782, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75133574, + "num_input_tokens_seen": 294277365, + "step": 13642, + "time_per_iteration": 2.6642913818359375 + }, + { + "auxiliary_loss_clip": 0.01074325, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.03458476, + "balance_loss_mlp": 1.02010846, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.744976449560913, + "language_loss": 0.703686, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72473735, + "num_input_tokens_seen": 294297555, + "step": 13643, + "time_per_iteration": 2.697103261947632 + }, + { + "auxiliary_loss_clip": 0.01086393, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.03437591, + "balance_loss_mlp": 1.01990294, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 2.0101817714064887, + "language_loss": 0.65885955, + "learning_rate": 3.291115727880256e-07, + "loss": 0.68003196, + "num_input_tokens_seen": 294317600, + "step": 13644, + "time_per_iteration": 2.608766555786133 + }, + { + "auxiliary_loss_clip": 0.01057198, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.03141725, + "balance_loss_mlp": 1.02288032, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.4196644488699897, + "language_loss": 0.70560431, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72651422, + "num_input_tokens_seen": 294340215, + "step": 13645, + "time_per_iteration": 2.6398520469665527 + }, + { + "auxiliary_loss_clip": 0.01073338, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.03380048, + "balance_loss_mlp": 1.01724863, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 2.0321729366676604, + "language_loss": 0.71850073, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73950934, + "num_input_tokens_seen": 294358590, + "step": 13646, + "time_per_iteration": 2.61179518699646 + }, + { + "auxiliary_loss_clip": 0.01075911, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.0330385, + "balance_loss_mlp": 1.02097273, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.9210415871379003, + "language_loss": 0.78646111, + "learning_rate": 3.284697424316132e-07, + "loss": 0.80754167, + "num_input_tokens_seen": 294375825, + "step": 13647, + "time_per_iteration": 2.645812749862671 + }, + { + "auxiliary_loss_clip": 0.01093956, + "auxiliary_loss_mlp": 0.01028502, + "balance_loss_clip": 1.03436828, + "balance_loss_mlp": 1.01828575, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 2.3287174685605634, + "language_loss": 0.68113363, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.70235825, + "num_input_tokens_seen": 294398500, + "step": 13648, + "time_per_iteration": 2.6185288429260254 + }, + { + "auxiliary_loss_clip": 0.01067145, + "auxiliary_loss_mlp": 0.01028204, + "balance_loss_clip": 1.02915072, + "balance_loss_mlp": 1.01697445, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.7792841186482202, + "language_loss": 0.80243194, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82338542, + "num_input_tokens_seen": 294418840, + "step": 13649, + "time_per_iteration": 2.6404449939727783 + }, + { + "auxiliary_loss_clip": 0.01081337, + "auxiliary_loss_mlp": 0.01033347, + "balance_loss_clip": 1.03581154, + "balance_loss_mlp": 1.02106261, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.636735557693569, + "language_loss": 0.69080389, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71195078, + "num_input_tokens_seen": 294438215, + "step": 13650, + "time_per_iteration": 2.5925986766815186 + }, + { + "auxiliary_loss_clip": 0.01077839, + "auxiliary_loss_mlp": 0.01028704, + "balance_loss_clip": 1.03363323, + "balance_loss_mlp": 1.01693225, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 2.27168958180545, + "language_loss": 0.60881686, + "learning_rate": 3.276148560452001e-07, + "loss": 0.62988228, + "num_input_tokens_seen": 294455260, + "step": 13651, + "time_per_iteration": 4.051574230194092 + }, + { + "auxiliary_loss_clip": 0.01054714, + "auxiliary_loss_mlp": 0.00749635, + "balance_loss_clip": 1.0314517, + "balance_loss_mlp": 1.0002358, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 1.9548153277750926, + "language_loss": 0.72160017, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.73964357, + "num_input_tokens_seen": 294473205, + "step": 13652, + "time_per_iteration": 2.5937297344207764 + }, + { + "auxiliary_loss_clip": 0.01069553, + "auxiliary_loss_mlp": 0.01025638, + "balance_loss_clip": 1.03175628, + "balance_loss_mlp": 1.01622081, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 2.046069861385378, + "language_loss": 0.7305032, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75145507, + "num_input_tokens_seen": 294490645, + "step": 13653, + "time_per_iteration": 2.5400137901306152 + }, + { + "auxiliary_loss_clip": 0.01056392, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.03110743, + "balance_loss_mlp": 1.01962543, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 1.8635464664971324, + "language_loss": 0.62915242, + "learning_rate": 3.269743571056451e-07, + "loss": 0.65003282, + "num_input_tokens_seen": 294513500, + "step": 13654, + "time_per_iteration": 2.7662746906280518 + }, + { + "auxiliary_loss_clip": 0.010693, + "auxiliary_loss_mlp": 0.01024679, + "balance_loss_clip": 1.03219271, + "balance_loss_mlp": 1.01390219, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.5093276693208968, + "language_loss": 0.69959486, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72053462, + "num_input_tokens_seen": 294535710, + "step": 13655, + "time_per_iteration": 2.6527624130249023 + }, + { + "auxiliary_loss_clip": 0.01073525, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.03372407, + "balance_loss_mlp": 1.0221076, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.6298046001131876, + "language_loss": 0.8248958, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84595788, + "num_input_tokens_seen": 294554055, + "step": 13656, + "time_per_iteration": 2.705594778060913 + }, + { + "auxiliary_loss_clip": 0.01069247, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.03279781, + "balance_loss_mlp": 1.01960421, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.191431420362914, + "language_loss": 0.74066132, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.7616514, + "num_input_tokens_seen": 294570390, + "step": 13657, + "time_per_iteration": 2.5476744174957275 + }, + { + "auxiliary_loss_clip": 0.01062425, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.03022075, + "balance_loss_mlp": 1.02107716, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.7186983663022901, + "language_loss": 0.55572999, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57667291, + "num_input_tokens_seen": 294593050, + "step": 13658, + "time_per_iteration": 2.8065192699432373 + }, + { + "auxiliary_loss_clip": 0.01033312, + "auxiliary_loss_mlp": 0.01027554, + "balance_loss_clip": 1.03090596, + "balance_loss_mlp": 1.01693261, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.072039048345895, + "language_loss": 0.78945148, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81006014, + "num_input_tokens_seen": 294608550, + "step": 13659, + "time_per_iteration": 2.6329092979431152 + }, + { + "auxiliary_loss_clip": 0.01078889, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.03023458, + "balance_loss_mlp": 1.0167346, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.915752663636047, + "language_loss": 0.59924138, + "learning_rate": 3.256950723599887e-07, + "loss": 0.62029421, + "num_input_tokens_seen": 294630380, + "step": 13660, + "time_per_iteration": 2.676100730895996 + }, + { + "auxiliary_loss_clip": 0.010815, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.03230631, + "balance_loss_mlp": 1.02020514, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 1.8137126895703528, + "language_loss": 0.72813213, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74926949, + "num_input_tokens_seen": 294648655, + "step": 13661, + "time_per_iteration": 2.493560552597046 + }, + { + "auxiliary_loss_clip": 0.0107956, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.03165317, + "balance_loss_mlp": 1.01928163, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 2.389365178348146, + "language_loss": 0.74809229, + "learning_rate": 3.252691519437143e-07, + "loss": 0.76919615, + "num_input_tokens_seen": 294666915, + "step": 13662, + "time_per_iteration": 2.51727032661438 + }, + { + "auxiliary_loss_clip": 0.01024309, + "auxiliary_loss_mlp": 0.01000812, + "balance_loss_clip": 1.00435615, + "balance_loss_mlp": 0.99981099, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7409260815648161, + "language_loss": 0.54023534, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56048656, + "num_input_tokens_seen": 294731545, + "step": 13663, + "time_per_iteration": 3.1880383491516113 + }, + { + "auxiliary_loss_clip": 0.01046667, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.02830958, + "balance_loss_mlp": 1.02125454, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.0598497306267047, + "language_loss": 0.65520078, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67599797, + "num_input_tokens_seen": 294748745, + "step": 13664, + "time_per_iteration": 2.663520097732544 + }, + { + "auxiliary_loss_clip": 0.0106966, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.03145254, + "balance_loss_mlp": 1.0193069, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.4542822713683217, + "language_loss": 0.75113434, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77212489, + "num_input_tokens_seen": 294768955, + "step": 13665, + "time_per_iteration": 4.024882793426514 + }, + { + "auxiliary_loss_clip": 0.01082557, + "auxiliary_loss_mlp": 0.00749424, + "balance_loss_clip": 1.03373468, + "balance_loss_mlp": 1.00029922, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 2.3868120962938826, + "language_loss": 0.65907311, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.6773929, + "num_input_tokens_seen": 294789250, + "step": 13666, + "time_per_iteration": 2.6554784774780273 + }, + { + "auxiliary_loss_clip": 0.01049099, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.03280687, + "balance_loss_mlp": 1.01820898, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.7230878970136125, + "language_loss": 0.77219641, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.7929703, + "num_input_tokens_seen": 294809760, + "step": 13667, + "time_per_iteration": 2.6770644187927246 + }, + { + "auxiliary_loss_clip": 0.0106613, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.03467274, + "balance_loss_mlp": 1.01934552, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 2.021680205351002, + "language_loss": 0.77078319, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79174614, + "num_input_tokens_seen": 294826495, + "step": 13668, + "time_per_iteration": 2.5824062824249268 + }, + { + "auxiliary_loss_clip": 0.01049394, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.02945852, + "balance_loss_mlp": 1.01993585, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 1.9379954830266988, + "language_loss": 0.73655438, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75735641, + "num_input_tokens_seen": 294845370, + "step": 13669, + "time_per_iteration": 2.7342987060546875 + }, + { + "auxiliary_loss_clip": 0.01084119, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.0324688, + "balance_loss_mlp": 1.017699, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.6578198399989716, + "language_loss": 0.78331864, + "learning_rate": 3.235680111625161e-07, + "loss": 0.80444181, + "num_input_tokens_seen": 294863740, + "step": 13670, + "time_per_iteration": 4.009946584701538 + }, + { + "auxiliary_loss_clip": 0.01090446, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.03491318, + "balance_loss_mlp": 1.02424347, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 1.822148245464418, + "language_loss": 0.74815774, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.7694236, + "num_input_tokens_seen": 294882815, + "step": 13671, + "time_per_iteration": 2.6102211475372314 + }, + { + "auxiliary_loss_clip": 0.01089957, + "auxiliary_loss_mlp": 0.01028218, + "balance_loss_clip": 1.03327465, + "balance_loss_mlp": 1.01636863, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 1.8860555748530945, + "language_loss": 0.76044017, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78162193, + "num_input_tokens_seen": 294901985, + "step": 13672, + "time_per_iteration": 2.637118339538574 + }, + { + "auxiliary_loss_clip": 0.0104482, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.02966726, + "balance_loss_mlp": 1.01884413, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.363110994147712, + "language_loss": 0.74250549, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76326191, + "num_input_tokens_seen": 294919705, + "step": 13673, + "time_per_iteration": 2.5633232593536377 + }, + { + "auxiliary_loss_clip": 0.01077739, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.03379631, + "balance_loss_mlp": 1.01990879, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.729845023102322, + "language_loss": 0.79576325, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81685275, + "num_input_tokens_seen": 294939900, + "step": 13674, + "time_per_iteration": 2.5676865577697754 + }, + { + "auxiliary_loss_clip": 0.01070525, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.03050613, + "balance_loss_mlp": 1.01972902, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 1.985481725231511, + "language_loss": 0.70280159, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72381604, + "num_input_tokens_seen": 294959110, + "step": 13675, + "time_per_iteration": 2.534327507019043 + }, + { + "auxiliary_loss_clip": 0.01076643, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.03198516, + "balance_loss_mlp": 1.02136946, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.534587116412033, + "language_loss": 0.74463284, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76572734, + "num_input_tokens_seen": 294978660, + "step": 13676, + "time_per_iteration": 2.5360677242279053 + }, + { + "auxiliary_loss_clip": 0.01068762, + "auxiliary_loss_mlp": 0.01027648, + "balance_loss_clip": 1.03212738, + "balance_loss_mlp": 1.01749098, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.8315280738585555, + "language_loss": 0.80754715, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82851124, + "num_input_tokens_seen": 294998075, + "step": 13677, + "time_per_iteration": 2.5267834663391113 + }, + { + "auxiliary_loss_clip": 0.01079407, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.03166938, + "balance_loss_mlp": 1.02101445, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 1.708312724659321, + "language_loss": 0.70012021, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72123909, + "num_input_tokens_seen": 295015950, + "step": 13678, + "time_per_iteration": 2.4830822944641113 + }, + { + "auxiliary_loss_clip": 0.01095062, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.03238153, + "balance_loss_mlp": 1.01950741, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.5200306395513297, + "language_loss": 0.71213913, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73339355, + "num_input_tokens_seen": 295036800, + "step": 13679, + "time_per_iteration": 4.127137184143066 + }, + { + "auxiliary_loss_clip": 0.01065904, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.03058243, + "balance_loss_mlp": 1.01760662, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 7.390245033932654, + "language_loss": 0.7024374, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72339344, + "num_input_tokens_seen": 295055300, + "step": 13680, + "time_per_iteration": 2.5788965225219727 + }, + { + "auxiliary_loss_clip": 0.01063493, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.03204942, + "balance_loss_mlp": 1.01662242, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.8066522926792177, + "language_loss": 0.60644484, + "learning_rate": 3.21235586541986e-07, + "loss": 0.62734848, + "num_input_tokens_seen": 295076420, + "step": 13681, + "time_per_iteration": 2.6367990970611572 + }, + { + "auxiliary_loss_clip": 0.01068982, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.03074694, + "balance_loss_mlp": 1.0218153, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.4947782883392808, + "language_loss": 0.69987202, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.72089744, + "num_input_tokens_seen": 295100540, + "step": 13682, + "time_per_iteration": 2.735421657562256 + }, + { + "auxiliary_loss_clip": 0.01098699, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.03402686, + "balance_loss_mlp": 1.02086568, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 2.605581876224074, + "language_loss": 0.79412037, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81543756, + "num_input_tokens_seen": 295120180, + "step": 13683, + "time_per_iteration": 2.511345863342285 + }, + { + "auxiliary_loss_clip": 0.01093218, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.03310108, + "balance_loss_mlp": 1.01938152, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 1.964853531538072, + "language_loss": 0.86311084, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88433653, + "num_input_tokens_seen": 295138530, + "step": 13684, + "time_per_iteration": 2.539267063140869 + }, + { + "auxiliary_loss_clip": 0.01093847, + "auxiliary_loss_mlp": 0.01025508, + "balance_loss_clip": 1.03290105, + "balance_loss_mlp": 1.01545906, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.683674949269296, + "language_loss": 0.7979812, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.81917477, + "num_input_tokens_seen": 295160260, + "step": 13685, + "time_per_iteration": 2.5971875190734863 + }, + { + "auxiliary_loss_clip": 0.01059281, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.03114831, + "balance_loss_mlp": 1.0187453, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.6866330833719618, + "language_loss": 0.68812883, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.70901555, + "num_input_tokens_seen": 295177055, + "step": 13686, + "time_per_iteration": 2.6326942443847656 + }, + { + "auxiliary_loss_clip": 0.01063447, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03112602, + "balance_loss_mlp": 1.019063, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 1.8538391978143693, + "language_loss": 0.78020644, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80114353, + "num_input_tokens_seen": 295193870, + "step": 13687, + "time_per_iteration": 2.6181976795196533 + }, + { + "auxiliary_loss_clip": 0.01086604, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.03260243, + "balance_loss_mlp": 1.01815307, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 1.673756258300757, + "language_loss": 0.72486866, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.74602675, + "num_input_tokens_seen": 295211040, + "step": 13688, + "time_per_iteration": 2.5568313598632812 + }, + { + "auxiliary_loss_clip": 0.01096692, + "auxiliary_loss_mlp": 0.00749335, + "balance_loss_clip": 1.0334332, + "balance_loss_mlp": 1.00026274, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.8722922530396386, + "language_loss": 0.73722368, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75568396, + "num_input_tokens_seen": 295231300, + "step": 13689, + "time_per_iteration": 2.5646653175354004 + }, + { + "auxiliary_loss_clip": 0.01086474, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.03185964, + "balance_loss_mlp": 1.01895607, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 2.1378178074408765, + "language_loss": 0.69069302, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71185905, + "num_input_tokens_seen": 295251045, + "step": 13690, + "time_per_iteration": 2.516794443130493 + }, + { + "auxiliary_loss_clip": 0.01045951, + "auxiliary_loss_mlp": 0.01040525, + "balance_loss_clip": 1.02942133, + "balance_loss_mlp": 1.02766263, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 1.7687117943301187, + "language_loss": 0.8573283, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87819302, + "num_input_tokens_seen": 295270225, + "step": 13691, + "time_per_iteration": 2.648470878601074 + }, + { + "auxiliary_loss_clip": 0.01089401, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.03486741, + "balance_loss_mlp": 1.01997685, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 2.024787237634613, + "language_loss": 0.7687276, + "learning_rate": 3.189108646472252e-07, + "loss": 0.78993201, + "num_input_tokens_seen": 295288950, + "step": 13692, + "time_per_iteration": 4.056003570556641 + }, + { + "auxiliary_loss_clip": 0.01084531, + "auxiliary_loss_mlp": 0.01026413, + "balance_loss_clip": 1.03318107, + "balance_loss_mlp": 1.01580918, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.514737935624153, + "language_loss": 0.7132709, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73438036, + "num_input_tokens_seen": 295309405, + "step": 13693, + "time_per_iteration": 2.5251846313476562 + }, + { + "auxiliary_loss_clip": 0.01054946, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.02820468, + "balance_loss_mlp": 1.02018404, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.3530354103014477, + "language_loss": 0.83766311, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85851252, + "num_input_tokens_seen": 295331115, + "step": 13694, + "time_per_iteration": 2.6848196983337402 + }, + { + "auxiliary_loss_clip": 0.01037212, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.032812, + "balance_loss_mlp": 1.01910532, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 3.774658873441407, + "language_loss": 0.77520001, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79587257, + "num_input_tokens_seen": 295350495, + "step": 13695, + "time_per_iteration": 2.733187198638916 + }, + { + "auxiliary_loss_clip": 0.01074928, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.03310776, + "balance_loss_mlp": 1.02100086, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.8991479132950553, + "language_loss": 0.80685997, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.82792312, + "num_input_tokens_seen": 295368225, + "step": 13696, + "time_per_iteration": 2.610769033432007 + }, + { + "auxiliary_loss_clip": 0.01013806, + "auxiliary_loss_mlp": 0.01007004, + "balance_loss_clip": 1.00377011, + "balance_loss_mlp": 1.00605607, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.741036579475205, + "language_loss": 0.63870728, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65891534, + "num_input_tokens_seen": 295430035, + "step": 13697, + "time_per_iteration": 3.1978518962860107 + }, + { + "auxiliary_loss_clip": 0.01056862, + "auxiliary_loss_mlp": 0.01024836, + "balance_loss_clip": 1.02944016, + "balance_loss_mlp": 1.01525736, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.724626419067809, + "language_loss": 0.73122954, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75204659, + "num_input_tokens_seen": 295447765, + "step": 13698, + "time_per_iteration": 2.607572078704834 + }, + { + "auxiliary_loss_clip": 0.01043137, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.02600718, + "balance_loss_mlp": 1.01930237, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 2.5629479379187323, + "language_loss": 0.71755743, + "learning_rate": 3.174355115608305e-07, + "loss": 0.7383216, + "num_input_tokens_seen": 295464810, + "step": 13699, + "time_per_iteration": 2.585571050643921 + }, + { + "auxiliary_loss_clip": 0.01074021, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.03206968, + "balance_loss_mlp": 1.01676452, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 2.3665552556747005, + "language_loss": 0.81983006, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84084535, + "num_input_tokens_seen": 295482605, + "step": 13700, + "time_per_iteration": 2.6162662506103516 + }, + { + "auxiliary_loss_clip": 0.01073273, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.0308826, + "balance_loss_mlp": 1.02073908, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.7188426742832508, + "language_loss": 0.73103464, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75208092, + "num_input_tokens_seen": 295503780, + "step": 13701, + "time_per_iteration": 2.661184787750244 + }, + { + "auxiliary_loss_clip": 0.01085133, + "auxiliary_loss_mlp": 0.01030545, + "balance_loss_clip": 1.03113651, + "balance_loss_mlp": 1.01973867, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.805276172410837, + "language_loss": 0.68950677, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71066356, + "num_input_tokens_seen": 295522035, + "step": 13702, + "time_per_iteration": 2.59999942779541 + }, + { + "auxiliary_loss_clip": 0.01047691, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.03099096, + "balance_loss_mlp": 1.0186193, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.7513517841749475, + "language_loss": 0.74943751, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.77021205, + "num_input_tokens_seen": 295541190, + "step": 13703, + "time_per_iteration": 2.7543280124664307 + }, + { + "auxiliary_loss_clip": 0.0110045, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.03320885, + "balance_loss_mlp": 1.01902187, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.903258369038519, + "language_loss": 0.70021164, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.72152418, + "num_input_tokens_seen": 295558860, + "step": 13704, + "time_per_iteration": 2.5647988319396973 + }, + { + "auxiliary_loss_clip": 0.01093521, + "auxiliary_loss_mlp": 0.01025404, + "balance_loss_clip": 1.0316397, + "balance_loss_mlp": 1.01518798, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 2.8399595981421926, + "language_loss": 0.64293015, + "learning_rate": 3.161734114144916e-07, + "loss": 0.66411936, + "num_input_tokens_seen": 295578155, + "step": 13705, + "time_per_iteration": 4.150554180145264 + }, + { + "auxiliary_loss_clip": 0.01097777, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.03301096, + "balance_loss_mlp": 1.01690102, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 3.9759995245704585, + "language_loss": 0.69243956, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71370202, + "num_input_tokens_seen": 295599170, + "step": 13706, + "time_per_iteration": 2.5379600524902344 + }, + { + "auxiliary_loss_clip": 0.01068171, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.03248477, + "balance_loss_mlp": 1.02272213, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.735848537412288, + "language_loss": 0.69607925, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71710777, + "num_input_tokens_seen": 295617465, + "step": 13707, + "time_per_iteration": 2.5721638202667236 + }, + { + "auxiliary_loss_clip": 0.01058874, + "auxiliary_loss_mlp": 0.01034319, + "balance_loss_clip": 1.03045177, + "balance_loss_mlp": 1.02160549, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 3.6786752487598466, + "language_loss": 0.79120141, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81213331, + "num_input_tokens_seen": 295634960, + "step": 13708, + "time_per_iteration": 2.6044540405273438 + }, + { + "auxiliary_loss_clip": 0.01078568, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.03155613, + "balance_loss_mlp": 1.01665044, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 2.9585222822338473, + "language_loss": 0.68476868, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70583785, + "num_input_tokens_seen": 295652725, + "step": 13709, + "time_per_iteration": 2.5280187129974365 + }, + { + "auxiliary_loss_clip": 0.01049825, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.0288806, + "balance_loss_mlp": 1.02261877, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 2.7546049587789767, + "language_loss": 0.82275701, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84358627, + "num_input_tokens_seen": 295671195, + "step": 13710, + "time_per_iteration": 2.642961263656616 + }, + { + "auxiliary_loss_clip": 0.01085914, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.03296089, + "balance_loss_mlp": 1.01883638, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 1.9462121963678158, + "language_loss": 0.78244746, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80360973, + "num_input_tokens_seen": 295689130, + "step": 13711, + "time_per_iteration": 3.9642860889434814 + }, + { + "auxiliary_loss_clip": 0.01058632, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.02877414, + "balance_loss_mlp": 1.01699698, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 1.8234439155634374, + "language_loss": 0.65928602, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.68015605, + "num_input_tokens_seen": 295706385, + "step": 13712, + "time_per_iteration": 2.543175220489502 + }, + { + "auxiliary_loss_clip": 0.01078203, + "auxiliary_loss_mlp": 0.01029038, + "balance_loss_clip": 1.03322411, + "balance_loss_mlp": 1.01879191, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.6381701399229642, + "language_loss": 0.74592787, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76700026, + "num_input_tokens_seen": 295727925, + "step": 13713, + "time_per_iteration": 2.5646872520446777 + }, + { + "auxiliary_loss_clip": 0.01085306, + "auxiliary_loss_mlp": 0.01024018, + "balance_loss_clip": 1.03207731, + "balance_loss_mlp": 1.01306295, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 2.977112313020564, + "language_loss": 0.81601298, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83710623, + "num_input_tokens_seen": 295744420, + "step": 13714, + "time_per_iteration": 2.5506432056427 + }, + { + "auxiliary_loss_clip": 0.01087583, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.03530073, + "balance_loss_mlp": 1.02127683, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.871828687581249, + "language_loss": 0.66118741, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68239295, + "num_input_tokens_seen": 295765105, + "step": 13715, + "time_per_iteration": 2.5809152126312256 + }, + { + "auxiliary_loss_clip": 0.01066785, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.03287959, + "balance_loss_mlp": 1.01815534, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 1.9199230500740712, + "language_loss": 0.74680984, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.76777208, + "num_input_tokens_seen": 295784200, + "step": 13716, + "time_per_iteration": 2.647305727005005 + }, + { + "auxiliary_loss_clip": 0.00984163, + "auxiliary_loss_mlp": 0.00997124, + "balance_loss_clip": 1.00782728, + "balance_loss_mlp": 0.9960869, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7156044363281064, + "language_loss": 0.58955926, + "learning_rate": 3.136561087351175e-07, + "loss": 0.60937214, + "num_input_tokens_seen": 295846555, + "step": 13717, + "time_per_iteration": 3.272681713104248 + }, + { + "auxiliary_loss_clip": 0.01080867, + "auxiliary_loss_mlp": 0.00749257, + "balance_loss_clip": 1.03324771, + "balance_loss_mlp": 1.00022972, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 2.1718227396127023, + "language_loss": 0.79841566, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81671691, + "num_input_tokens_seen": 295863425, + "step": 13718, + "time_per_iteration": 2.554337501525879 + }, + { + "auxiliary_loss_clip": 0.01075558, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.03219366, + "balance_loss_mlp": 1.02128434, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.6129202128702675, + "language_loss": 0.6922183, + "learning_rate": 3.132374531662778e-07, + "loss": 0.7132926, + "num_input_tokens_seen": 295880925, + "step": 13719, + "time_per_iteration": 2.4856276512145996 + }, + { + "auxiliary_loss_clip": 0.0106572, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.02980232, + "balance_loss_mlp": 1.02220345, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.374844090127921, + "language_loss": 0.69637048, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71738076, + "num_input_tokens_seen": 295898205, + "step": 13720, + "time_per_iteration": 4.01209282875061 + }, + { + "auxiliary_loss_clip": 0.01022689, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.032897, + "balance_loss_mlp": 1.01654863, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 2.8308187041579105, + "language_loss": 0.75696254, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.7774663, + "num_input_tokens_seen": 295918130, + "step": 13721, + "time_per_iteration": 2.8082334995269775 + }, + { + "auxiliary_loss_clip": 0.01047643, + "auxiliary_loss_mlp": 0.01024735, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.01422715, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 2.232290238370837, + "language_loss": 0.77878207, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.79950583, + "num_input_tokens_seen": 295937760, + "step": 13722, + "time_per_iteration": 2.7359931468963623 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.01029555, + "balance_loss_clip": 1.0345583, + "balance_loss_mlp": 1.01896286, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.6940746438649192, + "language_loss": 0.62424922, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.64550674, + "num_input_tokens_seen": 295957585, + "step": 13723, + "time_per_iteration": 2.52163028717041 + }, + { + "auxiliary_loss_clip": 0.01096856, + "auxiliary_loss_mlp": 0.01028878, + "balance_loss_clip": 1.03297031, + "balance_loss_mlp": 1.01805413, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.5828607773591687, + "language_loss": 0.74350584, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76476312, + "num_input_tokens_seen": 295977135, + "step": 13724, + "time_per_iteration": 2.5203747749328613 + }, + { + "auxiliary_loss_clip": 0.01062516, + "auxiliary_loss_mlp": 0.01030531, + "balance_loss_clip": 1.03173745, + "balance_loss_mlp": 1.01889014, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 2.683890386246208, + "language_loss": 0.64271867, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.6636492, + "num_input_tokens_seen": 295996265, + "step": 13725, + "time_per_iteration": 2.704336643218994 + }, + { + "auxiliary_loss_clip": 0.0106748, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.03023481, + "balance_loss_mlp": 1.01795268, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.5414270910968302, + "language_loss": 0.82005304, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.84101677, + "num_input_tokens_seen": 296014745, + "step": 13726, + "time_per_iteration": 2.5726749897003174 + }, + { + "auxiliary_loss_clip": 0.01076025, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.02818203, + "balance_loss_mlp": 1.02075016, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.7044039572993883, + "language_loss": 0.70417082, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72524065, + "num_input_tokens_seen": 296036960, + "step": 13727, + "time_per_iteration": 2.6202235221862793 + }, + { + "auxiliary_loss_clip": 0.01089115, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03444302, + "balance_loss_mlp": 1.01882732, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 2.8319170303868537, + "language_loss": 0.62233162, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64353311, + "num_input_tokens_seen": 296056540, + "step": 13728, + "time_per_iteration": 2.604094982147217 + }, + { + "auxiliary_loss_clip": 0.01080314, + "auxiliary_loss_mlp": 0.01026023, + "balance_loss_clip": 1.03416836, + "balance_loss_mlp": 1.01463258, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.6800247187122455, + "language_loss": 0.71259177, + "learning_rate": 3.111480143230092e-07, + "loss": 0.7336551, + "num_input_tokens_seen": 296077950, + "step": 13729, + "time_per_iteration": 2.554732322692871 + }, + { + "auxiliary_loss_clip": 0.010052, + "auxiliary_loss_mlp": 0.01000026, + "balance_loss_clip": 1.00488329, + "balance_loss_mlp": 0.99912608, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8592065166697486, + "language_loss": 0.62743556, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64748788, + "num_input_tokens_seen": 296127060, + "step": 13730, + "time_per_iteration": 2.8924646377563477 + }, + { + "auxiliary_loss_clip": 0.01043873, + "auxiliary_loss_mlp": 0.01032619, + "balance_loss_clip": 1.03027678, + "balance_loss_mlp": 1.02173471, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 4.292708259291211, + "language_loss": 0.63136411, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65212899, + "num_input_tokens_seen": 296147775, + "step": 13731, + "time_per_iteration": 2.7587296962738037 + }, + { + "auxiliary_loss_clip": 0.01054089, + "auxiliary_loss_mlp": 0.00749819, + "balance_loss_clip": 1.0276227, + "balance_loss_mlp": 1.00028205, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 2.2318146930397713, + "language_loss": 0.69671965, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71475869, + "num_input_tokens_seen": 296163560, + "step": 13732, + "time_per_iteration": 3.9017059803009033 + }, + { + "auxiliary_loss_clip": 0.01090042, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.03285003, + "balance_loss_mlp": 1.02356279, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.2737071291632853, + "language_loss": 0.70934498, + "learning_rate": 3.103140315024817e-07, + "loss": 0.73059708, + "num_input_tokens_seen": 296178730, + "step": 13733, + "time_per_iteration": 2.6361312866210938 + }, + { + "auxiliary_loss_clip": 0.01093868, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.03195453, + "balance_loss_mlp": 1.01838386, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.4971563625408308, + "language_loss": 0.8253336, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.8465693, + "num_input_tokens_seen": 296200175, + "step": 13734, + "time_per_iteration": 2.596876621246338 + }, + { + "auxiliary_loss_clip": 0.01061816, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.02805758, + "balance_loss_mlp": 1.01898146, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.8283031142083292, + "language_loss": 0.82872343, + "learning_rate": 3.098974244989676e-07, + "loss": 0.84964335, + "num_input_tokens_seen": 296219305, + "step": 13735, + "time_per_iteration": 2.6049070358276367 + }, + { + "auxiliary_loss_clip": 0.01091411, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.03635335, + "balance_loss_mlp": 1.01880133, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.8950064212161635, + "language_loss": 0.7078644, + "learning_rate": 3.096892171265497e-07, + "loss": 0.72906995, + "num_input_tokens_seen": 296236945, + "step": 13736, + "time_per_iteration": 2.5855040550231934 + }, + { + "auxiliary_loss_clip": 0.0101404, + "auxiliary_loss_mlp": 0.010056, + "balance_loss_clip": 1.00401354, + "balance_loss_mlp": 1.00473011, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8514114819629647, + "language_loss": 0.67914248, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.69933891, + "num_input_tokens_seen": 296294685, + "step": 13737, + "time_per_iteration": 3.0739521980285645 + }, + { + "auxiliary_loss_clip": 0.01070491, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.03019381, + "balance_loss_mlp": 1.02206957, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 1.975081381527564, + "language_loss": 0.69626248, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71729279, + "num_input_tokens_seen": 296314790, + "step": 13738, + "time_per_iteration": 2.533937454223633 + }, + { + "auxiliary_loss_clip": 0.01079657, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.03483653, + "balance_loss_mlp": 1.01769328, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.1708546299338054, + "language_loss": 0.62956065, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65065944, + "num_input_tokens_seen": 296335355, + "step": 13739, + "time_per_iteration": 2.7296018600463867 + }, + { + "auxiliary_loss_clip": 0.01002422, + "auxiliary_loss_mlp": 0.00999177, + "balance_loss_clip": 1.00282431, + "balance_loss_mlp": 0.9982115, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8146024978265184, + "language_loss": 0.59294325, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61295933, + "num_input_tokens_seen": 296399885, + "step": 13740, + "time_per_iteration": 3.147331953048706 + }, + { + "auxiliary_loss_clip": 0.01101786, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.03470254, + "balance_loss_mlp": 1.0186677, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 1.7489045472797056, + "language_loss": 0.75365365, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77498049, + "num_input_tokens_seen": 296417660, + "step": 13741, + "time_per_iteration": 2.493720054626465 + }, + { + "auxiliary_loss_clip": 0.01083688, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.03170443, + "balance_loss_mlp": 1.01995921, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.8256964776534457, + "language_loss": 0.62495542, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64610219, + "num_input_tokens_seen": 296438255, + "step": 13742, + "time_per_iteration": 2.6941823959350586 + }, + { + "auxiliary_loss_clip": 0.01054486, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.03266048, + "balance_loss_mlp": 1.02205181, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 2.8274289835368043, + "language_loss": 0.6597622, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68066299, + "num_input_tokens_seen": 296454485, + "step": 13743, + "time_per_iteration": 2.676987886428833 + }, + { + "auxiliary_loss_clip": 0.01075375, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.03286588, + "balance_loss_mlp": 1.02275729, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.9315155003481337, + "language_loss": 0.66994691, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.69104272, + "num_input_tokens_seen": 296473740, + "step": 13744, + "time_per_iteration": 2.625419855117798 + }, + { + "auxiliary_loss_clip": 0.01064834, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.0336709, + "balance_loss_mlp": 1.01804233, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 3.485536311961592, + "language_loss": 0.75517881, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77611625, + "num_input_tokens_seen": 296493355, + "step": 13745, + "time_per_iteration": 4.071111440658569 + }, + { + "auxiliary_loss_clip": 0.0106352, + "auxiliary_loss_mlp": 0.00749546, + "balance_loss_clip": 1.03036022, + "balance_loss_mlp": 1.00024819, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.8766753610383022, + "language_loss": 0.79060537, + "learning_rate": 3.076106700253709e-07, + "loss": 0.80873597, + "num_input_tokens_seen": 296510520, + "step": 13746, + "time_per_iteration": 2.618361234664917 + }, + { + "auxiliary_loss_clip": 0.01091191, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.03529954, + "balance_loss_mlp": 1.02099097, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 1.9660084464443062, + "language_loss": 0.68180162, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70304239, + "num_input_tokens_seen": 296528265, + "step": 13747, + "time_per_iteration": 2.5271623134613037 + }, + { + "auxiliary_loss_clip": 0.01081247, + "auxiliary_loss_mlp": 0.01036928, + "balance_loss_clip": 1.03141665, + "balance_loss_mlp": 1.02344, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 2.063533908102293, + "language_loss": 0.7519967, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.7731784, + "num_input_tokens_seen": 296547810, + "step": 13748, + "time_per_iteration": 2.6054303646087646 + }, + { + "auxiliary_loss_clip": 0.0105816, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.03249955, + "balance_loss_mlp": 1.02478421, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 1.8513141100549766, + "language_loss": 0.63591397, + "learning_rate": 3.069883569603102e-07, + "loss": 0.65683949, + "num_input_tokens_seen": 296565940, + "step": 13749, + "time_per_iteration": 2.561910629272461 + }, + { + "auxiliary_loss_clip": 0.01070011, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.02955067, + "balance_loss_mlp": 1.01824987, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.5356107061923547, + "language_loss": 0.73399103, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75497812, + "num_input_tokens_seen": 296585090, + "step": 13750, + "time_per_iteration": 2.6219496726989746 + }, + { + "auxiliary_loss_clip": 0.01086379, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.03338087, + "balance_loss_mlp": 1.02356625, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.9487577066602462, + "language_loss": 0.65707654, + "learning_rate": 3.065738025663496e-07, + "loss": 0.67828858, + "num_input_tokens_seen": 296604950, + "step": 13751, + "time_per_iteration": 4.127739906311035 + }, + { + "auxiliary_loss_clip": 0.01066302, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_clip": 1.0281918, + "balance_loss_mlp": 1.0176326, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.4888449824097127, + "language_loss": 0.60521197, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.62615442, + "num_input_tokens_seen": 296627780, + "step": 13752, + "time_per_iteration": 2.7561800479888916 + }, + { + "auxiliary_loss_clip": 0.01012844, + "auxiliary_loss_mlp": 0.01003743, + "balance_loss_clip": 1.00316119, + "balance_loss_mlp": 1.00276566, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7881580537388818, + "language_loss": 0.57395267, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.5941186, + "num_input_tokens_seen": 296683850, + "step": 13753, + "time_per_iteration": 3.126899003982544 + }, + { + "auxiliary_loss_clip": 0.00977124, + "auxiliary_loss_mlp": 0.00746684, + "balance_loss_clip": 1.00735855, + "balance_loss_mlp": 0.99988216, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.6986403093017584, + "language_loss": 0.55034959, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56758767, + "num_input_tokens_seen": 296741420, + "step": 13754, + "time_per_iteration": 3.287734031677246 + }, + { + "auxiliary_loss_clip": 0.01051508, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.02902389, + "balance_loss_mlp": 1.01887584, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 2.3050692091637344, + "language_loss": 0.69259584, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71339989, + "num_input_tokens_seen": 296759620, + "step": 13755, + "time_per_iteration": 2.7127695083618164 + }, + { + "auxiliary_loss_clip": 0.01048614, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.0297569, + "balance_loss_mlp": 1.02028775, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 1.8559259043096537, + "language_loss": 0.69938034, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72016841, + "num_input_tokens_seen": 296777275, + "step": 13756, + "time_per_iteration": 2.9515719413757324 + }, + { + "auxiliary_loss_clip": 0.01089497, + "auxiliary_loss_mlp": 0.01031917, + "balance_loss_clip": 1.03610086, + "balance_loss_mlp": 1.02093196, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 2.3584421015280577, + "language_loss": 0.71952713, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74074125, + "num_input_tokens_seen": 296796655, + "step": 13757, + "time_per_iteration": 2.591219902038574 + }, + { + "auxiliary_loss_clip": 0.01088458, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.03336763, + "balance_loss_mlp": 1.01959825, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.475557076518384, + "language_loss": 0.69277376, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71398443, + "num_input_tokens_seen": 296813705, + "step": 13758, + "time_per_iteration": 2.586055040359497 + }, + { + "auxiliary_loss_clip": 0.01071157, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.03036046, + "balance_loss_mlp": 1.01796103, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.484548843032824, + "language_loss": 0.70094246, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.72194088, + "num_input_tokens_seen": 296833985, + "step": 13759, + "time_per_iteration": 4.173388957977295 + }, + { + "auxiliary_loss_clip": 0.0107531, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.03295159, + "balance_loss_mlp": 1.01771069, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.733557217472836, + "language_loss": 0.70875359, + "learning_rate": 3.047114873375161e-07, + "loss": 0.72979581, + "num_input_tokens_seen": 296850150, + "step": 13760, + "time_per_iteration": 2.6874160766601562 + }, + { + "auxiliary_loss_clip": 0.01047424, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.02863765, + "balance_loss_mlp": 1.01822472, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.91476425540521, + "language_loss": 0.77572882, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79649901, + "num_input_tokens_seen": 296869585, + "step": 13761, + "time_per_iteration": 2.7578320503234863 + }, + { + "auxiliary_loss_clip": 0.01062704, + "auxiliary_loss_mlp": 0.01029975, + "balance_loss_clip": 1.03316271, + "balance_loss_mlp": 1.02005112, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.988177895300774, + "language_loss": 0.69682646, + "learning_rate": 3.042983464482387e-07, + "loss": 0.71775329, + "num_input_tokens_seen": 296887710, + "step": 13762, + "time_per_iteration": 2.634575843811035 + }, + { + "auxiliary_loss_clip": 0.01041849, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.03045321, + "balance_loss_mlp": 1.01847744, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 1.8103951377492586, + "language_loss": 0.69796336, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.71867931, + "num_input_tokens_seen": 296906265, + "step": 13763, + "time_per_iteration": 2.6489667892456055 + }, + { + "auxiliary_loss_clip": 0.0100592, + "auxiliary_loss_mlp": 0.01007246, + "balance_loss_clip": 1.01188111, + "balance_loss_mlp": 1.00624466, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8360223950062616, + "language_loss": 0.65070188, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67083359, + "num_input_tokens_seen": 296971290, + "step": 13764, + "time_per_iteration": 3.269751787185669 + }, + { + "auxiliary_loss_clip": 0.01088509, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.03457201, + "balance_loss_mlp": 1.02104867, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 1.9856725161151354, + "language_loss": 0.77869886, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.79991066, + "num_input_tokens_seen": 296989060, + "step": 13765, + "time_per_iteration": 2.565812826156616 + }, + { + "auxiliary_loss_clip": 0.01041137, + "auxiliary_loss_mlp": 0.01031307, + "balance_loss_clip": 1.03075612, + "balance_loss_mlp": 1.01924324, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.7837064202077724, + "language_loss": 0.62602055, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64674497, + "num_input_tokens_seen": 297011300, + "step": 13766, + "time_per_iteration": 2.722628593444824 + }, + { + "auxiliary_loss_clip": 0.01061608, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.03064895, + "balance_loss_mlp": 1.01742053, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.6629099927310131, + "language_loss": 0.82613951, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84704328, + "num_input_tokens_seen": 297030350, + "step": 13767, + "time_per_iteration": 2.6056952476501465 + }, + { + "auxiliary_loss_clip": 0.01072241, + "auxiliary_loss_mlp": 0.01026391, + "balance_loss_clip": 1.03602433, + "balance_loss_mlp": 1.01534605, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 1.5535727994866504, + "language_loss": 0.69113725, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71212351, + "num_input_tokens_seen": 297049710, + "step": 13768, + "time_per_iteration": 2.684473991394043 + }, + { + "auxiliary_loss_clip": 0.01022838, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.02599728, + "balance_loss_mlp": 1.02313292, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 2.202601445648617, + "language_loss": 0.74304259, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76361954, + "num_input_tokens_seen": 297070510, + "step": 13769, + "time_per_iteration": 2.7573938369750977 + }, + { + "auxiliary_loss_clip": 0.0107359, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.03202558, + "balance_loss_mlp": 1.01633239, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 1.8275141964956894, + "language_loss": 0.74214846, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76315832, + "num_input_tokens_seen": 297092585, + "step": 13770, + "time_per_iteration": 2.8887603282928467 + }, + { + "auxiliary_loss_clip": 0.01066914, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.01994538, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.7023490512356791, + "language_loss": 0.75844812, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.77943057, + "num_input_tokens_seen": 297110055, + "step": 13771, + "time_per_iteration": 2.713627338409424 + }, + { + "auxiliary_loss_clip": 0.01096898, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.03294086, + "balance_loss_mlp": 1.01807332, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.5519578898333963, + "language_loss": 0.7273885, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74864733, + "num_input_tokens_seen": 297132170, + "step": 13772, + "time_per_iteration": 2.661996364593506 + }, + { + "auxiliary_loss_clip": 0.01075039, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.03312504, + "balance_loss_mlp": 1.02049541, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.182325419152705, + "language_loss": 0.74065095, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76172149, + "num_input_tokens_seen": 297149515, + "step": 13773, + "time_per_iteration": 4.041924476623535 + }, + { + "auxiliary_loss_clip": 0.01050121, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.03199196, + "balance_loss_mlp": 1.01915789, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 2.4747746625749207, + "language_loss": 0.7576378, + "learning_rate": 3.01824904601915e-07, + "loss": 0.77843446, + "num_input_tokens_seen": 297170320, + "step": 13774, + "time_per_iteration": 2.7036595344543457 + }, + { + "auxiliary_loss_clip": 0.01068023, + "auxiliary_loss_mlp": 0.00749328, + "balance_loss_clip": 1.03460908, + "balance_loss_mlp": 1.00019085, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.6677182857282886, + "language_loss": 0.74955285, + "learning_rate": 3.01619202829249e-07, + "loss": 0.76772642, + "num_input_tokens_seen": 297189935, + "step": 13775, + "time_per_iteration": 2.695044994354248 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.03297877, + "balance_loss_mlp": 1.01946664, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 2.1902826156529214, + "language_loss": 0.73863816, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75995457, + "num_input_tokens_seen": 297210885, + "step": 13776, + "time_per_iteration": 2.579317331314087 + }, + { + "auxiliary_loss_clip": 0.01035938, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.02589798, + "balance_loss_mlp": 1.01787996, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 2.6068429892972897, + "language_loss": 0.77198124, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79264736, + "num_input_tokens_seen": 297228500, + "step": 13777, + "time_per_iteration": 2.642648696899414 + }, + { + "auxiliary_loss_clip": 0.01083188, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.03337812, + "balance_loss_mlp": 1.02021003, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.7956552813631015, + "language_loss": 0.82457566, + "learning_rate": 3.010024839590604e-07, + "loss": 0.8457073, + "num_input_tokens_seen": 297249470, + "step": 13778, + "time_per_iteration": 2.6131558418273926 + }, + { + "auxiliary_loss_clip": 0.01076965, + "auxiliary_loss_mlp": 0.01022528, + "balance_loss_clip": 1.02988219, + "balance_loss_mlp": 1.01160824, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.946748628098324, + "language_loss": 0.74636447, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.76735944, + "num_input_tokens_seen": 297265970, + "step": 13779, + "time_per_iteration": 2.5664467811584473 + }, + { + "auxiliary_loss_clip": 0.00996628, + "auxiliary_loss_mlp": 0.00999309, + "balance_loss_clip": 1.00643659, + "balance_loss_mlp": 0.99833202, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.7695793856547393, + "language_loss": 0.56702805, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58698738, + "num_input_tokens_seen": 297325525, + "step": 13780, + "time_per_iteration": 3.1814589500427246 + }, + { + "auxiliary_loss_clip": 0.0105382, + "auxiliary_loss_mlp": 0.01025506, + "balance_loss_clip": 1.03157568, + "balance_loss_mlp": 1.01409781, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.7952395127350231, + "language_loss": 0.80167711, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.82247031, + "num_input_tokens_seen": 297345025, + "step": 13781, + "time_per_iteration": 2.8241043090820312 + }, + { + "auxiliary_loss_clip": 0.01053309, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.03325605, + "balance_loss_mlp": 1.01717627, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 2.1467906829596757, + "language_loss": 0.75613838, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77696633, + "num_input_tokens_seen": 297363570, + "step": 13782, + "time_per_iteration": 2.7810981273651123 + }, + { + "auxiliary_loss_clip": 0.01082552, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.0297966, + "balance_loss_mlp": 1.01684999, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.8474002532912586, + "language_loss": 0.75968742, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78079206, + "num_input_tokens_seen": 297385385, + "step": 13783, + "time_per_iteration": 2.6561317443847656 + }, + { + "auxiliary_loss_clip": 0.01095995, + "auxiliary_loss_mlp": 0.01023185, + "balance_loss_clip": 1.0324235, + "balance_loss_mlp": 1.0124321, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.7216399566658942, + "language_loss": 0.73927736, + "learning_rate": 2.997707859351304e-07, + "loss": 0.7604692, + "num_input_tokens_seen": 297403950, + "step": 13784, + "time_per_iteration": 4.062314987182617 + }, + { + "auxiliary_loss_clip": 0.01089143, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.03270102, + "balance_loss_mlp": 1.02071881, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.4774075315410353, + "language_loss": 0.70024371, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72146201, + "num_input_tokens_seen": 297424565, + "step": 13785, + "time_per_iteration": 2.6865599155426025 + }, + { + "auxiliary_loss_clip": 0.01074852, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.03521132, + "balance_loss_mlp": 1.0199523, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 2.7686858866230315, + "language_loss": 0.68612409, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70718473, + "num_input_tokens_seen": 297445180, + "step": 13786, + "time_per_iteration": 2.618992805480957 + }, + { + "auxiliary_loss_clip": 0.01056095, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.01991391, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.774228173571979, + "language_loss": 0.77133989, + "learning_rate": 2.991558072017426e-07, + "loss": 0.79221392, + "num_input_tokens_seen": 297463790, + "step": 13787, + "time_per_iteration": 2.7083048820495605 + }, + { + "auxiliary_loss_clip": 0.01078505, + "auxiliary_loss_mlp": 0.01031627, + "balance_loss_clip": 1.03334641, + "balance_loss_mlp": 1.02139878, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.729061514758919, + "language_loss": 0.80538213, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82648343, + "num_input_tokens_seen": 297480100, + "step": 13788, + "time_per_iteration": 2.6055314540863037 + }, + { + "auxiliary_loss_clip": 0.0107137, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.03260136, + "balance_loss_mlp": 1.01968741, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 3.2737201802873654, + "language_loss": 0.71061718, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73163587, + "num_input_tokens_seen": 297499890, + "step": 13789, + "time_per_iteration": 2.6857199668884277 + }, + { + "auxiliary_loss_clip": 0.01062215, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.02915335, + "balance_loss_mlp": 1.01694632, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 2.009222598466673, + "language_loss": 0.68297648, + "learning_rate": 2.985414089339813e-07, + "loss": 0.70388222, + "num_input_tokens_seen": 297521440, + "step": 13790, + "time_per_iteration": 4.290458679199219 + }, + { + "auxiliary_loss_clip": 0.01088328, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.03344321, + "balance_loss_mlp": 1.01585674, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 1.618159009755346, + "language_loss": 0.77539492, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79656041, + "num_input_tokens_seen": 297539920, + "step": 13791, + "time_per_iteration": 2.5522732734680176 + }, + { + "auxiliary_loss_clip": 0.01068914, + "auxiliary_loss_mlp": 0.01026698, + "balance_loss_clip": 1.03264141, + "balance_loss_mlp": 1.01470566, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.5023436686557716, + "language_loss": 0.6997593, + "learning_rate": 2.981321326732651e-07, + "loss": 0.7207154, + "num_input_tokens_seen": 297560000, + "step": 13792, + "time_per_iteration": 2.654836416244507 + }, + { + "auxiliary_loss_clip": 0.01075525, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.03214908, + "balance_loss_mlp": 1.01920152, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.4504529460533226, + "language_loss": 0.64809644, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.66915572, + "num_input_tokens_seen": 297579300, + "step": 13793, + "time_per_iteration": 2.6356465816497803 + }, + { + "auxiliary_loss_clip": 0.01036811, + "auxiliary_loss_mlp": 0.01037116, + "balance_loss_clip": 1.02827454, + "balance_loss_mlp": 1.02363992, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.7358156545650911, + "language_loss": 0.66430217, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68504143, + "num_input_tokens_seen": 297598095, + "step": 13794, + "time_per_iteration": 2.7020859718322754 + }, + { + "auxiliary_loss_clip": 0.01096125, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.03204811, + "balance_loss_mlp": 1.02159142, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 2.0311471647752843, + "language_loss": 0.6636368, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68492764, + "num_input_tokens_seen": 297615955, + "step": 13795, + "time_per_iteration": 2.6659460067749023 + }, + { + "auxiliary_loss_clip": 0.01001548, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.02863061, + "balance_loss_mlp": 1.02389717, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.9574680465796024, + "language_loss": 0.66247642, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68285668, + "num_input_tokens_seen": 297636285, + "step": 13796, + "time_per_iteration": 3.169597625732422 + }, + { + "auxiliary_loss_clip": 0.01043552, + "auxiliary_loss_mlp": 0.01037446, + "balance_loss_clip": 1.02871704, + "balance_loss_mlp": 1.0250361, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.6448023263002776, + "language_loss": 0.71911865, + "learning_rate": 2.971100715196666e-07, + "loss": 0.7399286, + "num_input_tokens_seen": 297653315, + "step": 13797, + "time_per_iteration": 2.7664527893066406 + }, + { + "auxiliary_loss_clip": 0.01026216, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.03048611, + "balance_loss_mlp": 1.01658237, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 1.8727016162849808, + "language_loss": 0.72069222, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74122745, + "num_input_tokens_seen": 297673480, + "step": 13798, + "time_per_iteration": 2.784897565841675 + }, + { + "auxiliary_loss_clip": 0.01054055, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.02911091, + "balance_loss_mlp": 1.02062273, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.6751918831710726, + "language_loss": 0.7624855, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78334475, + "num_input_tokens_seen": 297693250, + "step": 13799, + "time_per_iteration": 4.358710765838623 + }, + { + "auxiliary_loss_clip": 0.01097965, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.03534698, + "balance_loss_mlp": 1.01976299, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 1.8892217549897583, + "language_loss": 0.67508125, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69636863, + "num_input_tokens_seen": 297710975, + "step": 13800, + "time_per_iteration": 2.542665958404541 + }, + { + "auxiliary_loss_clip": 0.01063077, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.03506637, + "balance_loss_mlp": 1.01858449, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.9715524813999883, + "language_loss": 0.74432373, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76526684, + "num_input_tokens_seen": 297730860, + "step": 13801, + "time_per_iteration": 2.7019917964935303 + }, + { + "auxiliary_loss_clip": 0.01051574, + "auxiliary_loss_mlp": 0.01027207, + "balance_loss_clip": 1.03158021, + "balance_loss_mlp": 1.01693118, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 1.521739230798753, + "language_loss": 0.73703063, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75781846, + "num_input_tokens_seen": 297749765, + "step": 13802, + "time_per_iteration": 2.6482505798339844 + }, + { + "auxiliary_loss_clip": 0.01074812, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.0305512, + "balance_loss_mlp": 1.0197506, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.7818346250441803, + "language_loss": 0.74941003, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.77046478, + "num_input_tokens_seen": 297770380, + "step": 13803, + "time_per_iteration": 2.683321475982666 + }, + { + "auxiliary_loss_clip": 0.01086613, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.0344162, + "balance_loss_mlp": 1.02040386, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.896372149112621, + "language_loss": 0.7923072, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81348366, + "num_input_tokens_seen": 297789440, + "step": 13804, + "time_per_iteration": 2.611851453781128 + }, + { + "auxiliary_loss_clip": 0.01096729, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.03404462, + "balance_loss_mlp": 1.01676202, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.6342069163796789, + "language_loss": 0.73066318, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75189883, + "num_input_tokens_seen": 297810425, + "step": 13805, + "time_per_iteration": 2.6030778884887695 + }, + { + "auxiliary_loss_clip": 0.01089334, + "auxiliary_loss_mlp": 0.00749437, + "balance_loss_clip": 1.03462088, + "balance_loss_mlp": 1.00025356, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 2.745959805170004, + "language_loss": 0.77278399, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79117167, + "num_input_tokens_seen": 297827680, + "step": 13806, + "time_per_iteration": 2.5656893253326416 + }, + { + "auxiliary_loss_clip": 0.01090968, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.03512895, + "balance_loss_mlp": 1.0195924, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.9818539533198234, + "language_loss": 0.63367748, + "learning_rate": 2.950707932112444e-07, + "loss": 0.65489852, + "num_input_tokens_seen": 297848005, + "step": 13807, + "time_per_iteration": 2.5840437412261963 + }, + { + "auxiliary_loss_clip": 0.01085185, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.03556323, + "balance_loss_mlp": 1.01566148, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 2.082019374942113, + "language_loss": 0.7345674, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75568581, + "num_input_tokens_seen": 297866730, + "step": 13808, + "time_per_iteration": 2.527036190032959 + }, + { + "auxiliary_loss_clip": 0.01079177, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.03652775, + "balance_loss_mlp": 1.02581739, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.8041836494560242, + "language_loss": 0.66333717, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68451333, + "num_input_tokens_seen": 297886390, + "step": 13809, + "time_per_iteration": 2.669801950454712 + }, + { + "auxiliary_loss_clip": 0.01098796, + "auxiliary_loss_mlp": 0.01024302, + "balance_loss_clip": 1.03404546, + "balance_loss_mlp": 1.01373434, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 2.3423114314034876, + "language_loss": 0.73903024, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76026118, + "num_input_tokens_seen": 297905110, + "step": 13810, + "time_per_iteration": 2.483558416366577 + }, + { + "auxiliary_loss_clip": 0.0106374, + "auxiliary_loss_mlp": 0.01033624, + "balance_loss_clip": 1.03282034, + "balance_loss_mlp": 1.02380717, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.707989853091637, + "language_loss": 0.81204045, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83301413, + "num_input_tokens_seen": 297925460, + "step": 13811, + "time_per_iteration": 2.700542688369751 + }, + { + "auxiliary_loss_clip": 0.01062815, + "auxiliary_loss_mlp": 0.01040016, + "balance_loss_clip": 1.03291535, + "balance_loss_mlp": 1.02764237, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 2.234437288140599, + "language_loss": 0.7341181, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75514644, + "num_input_tokens_seen": 297941760, + "step": 13812, + "time_per_iteration": 4.064177513122559 + }, + { + "auxiliary_loss_clip": 0.01070377, + "auxiliary_loss_mlp": 0.01027397, + "balance_loss_clip": 1.03321922, + "balance_loss_mlp": 1.01674592, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.6839710052917207, + "language_loss": 0.78350621, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80448401, + "num_input_tokens_seen": 297959745, + "step": 13813, + "time_per_iteration": 2.629185199737549 + }, + { + "auxiliary_loss_clip": 0.01039629, + "auxiliary_loss_mlp": 0.00749539, + "balance_loss_clip": 1.02954674, + "balance_loss_mlp": 1.00029802, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 2.050491751193576, + "language_loss": 0.70998102, + "learning_rate": 2.93647144674658e-07, + "loss": 0.72787273, + "num_input_tokens_seen": 297977665, + "step": 13814, + "time_per_iteration": 2.7680246829986572 + }, + { + "auxiliary_loss_clip": 0.01105195, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.03518176, + "balance_loss_mlp": 1.02719498, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 1.9363804455132934, + "language_loss": 0.67853928, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.69999278, + "num_input_tokens_seen": 297993525, + "step": 13815, + "time_per_iteration": 2.5117835998535156 + }, + { + "auxiliary_loss_clip": 0.01086302, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.03518391, + "balance_loss_mlp": 1.01958203, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 1.8083244307106157, + "language_loss": 0.75371295, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.77488619, + "num_input_tokens_seen": 298012920, + "step": 13816, + "time_per_iteration": 2.5681464672088623 + }, + { + "auxiliary_loss_clip": 0.01060257, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.02964139, + "balance_loss_mlp": 1.01930571, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.624439349259857, + "language_loss": 0.81679869, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83769923, + "num_input_tokens_seen": 298033310, + "step": 13817, + "time_per_iteration": 2.6532936096191406 + }, + { + "auxiliary_loss_clip": 0.01089487, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.03513634, + "balance_loss_mlp": 1.02059197, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.7279027961886941, + "language_loss": 0.77974117, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80096012, + "num_input_tokens_seen": 298053530, + "step": 13818, + "time_per_iteration": 2.591826915740967 + }, + { + "auxiliary_loss_clip": 0.01081708, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.0359447, + "balance_loss_mlp": 1.01974046, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 2.1463321086580813, + "language_loss": 0.81819439, + "learning_rate": 2.926321938606453e-07, + "loss": 0.83932006, + "num_input_tokens_seen": 298069305, + "step": 13819, + "time_per_iteration": 2.544111490249634 + }, + { + "auxiliary_loss_clip": 0.01014451, + "auxiliary_loss_mlp": 0.01000396, + "balance_loss_clip": 1.00460923, + "balance_loss_mlp": 0.99938899, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7627560193049578, + "language_loss": 0.56176519, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58191371, + "num_input_tokens_seen": 298125830, + "step": 13820, + "time_per_iteration": 3.1328084468841553 + }, + { + "auxiliary_loss_clip": 0.01083177, + "auxiliary_loss_mlp": 0.01022151, + "balance_loss_clip": 1.03186643, + "balance_loss_mlp": 1.01133251, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.9292217428062768, + "language_loss": 0.68171036, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70276356, + "num_input_tokens_seen": 298142320, + "step": 13821, + "time_per_iteration": 2.51159405708313 + }, + { + "auxiliary_loss_clip": 0.01024988, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.026968, + "balance_loss_mlp": 1.02439582, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 1.9547196282410984, + "language_loss": 0.68980777, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71042621, + "num_input_tokens_seen": 298161845, + "step": 13822, + "time_per_iteration": 2.7222182750701904 + }, + { + "auxiliary_loss_clip": 0.01046305, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.03261745, + "balance_loss_mlp": 1.01946032, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.8552707943387288, + "language_loss": 0.62505871, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64581752, + "num_input_tokens_seen": 298184165, + "step": 13823, + "time_per_iteration": 2.751925230026245 + }, + { + "auxiliary_loss_clip": 0.01003679, + "auxiliary_loss_mlp": 0.00999394, + "balance_loss_clip": 1.00359154, + "balance_loss_mlp": 0.99858916, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.8794126254166813, + "language_loss": 0.61924565, + "learning_rate": 2.916188616354669e-07, + "loss": 0.63927633, + "num_input_tokens_seen": 298251720, + "step": 13824, + "time_per_iteration": 4.6713480949401855 + }, + { + "auxiliary_loss_clip": 0.01098212, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.03480697, + "balance_loss_mlp": 1.01972055, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.6789910950916453, + "language_loss": 0.74011844, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76140463, + "num_input_tokens_seen": 298271910, + "step": 13825, + "time_per_iteration": 2.567176103591919 + }, + { + "auxiliary_loss_clip": 0.01047994, + "auxiliary_loss_mlp": 0.00749412, + "balance_loss_clip": 1.03196621, + "balance_loss_mlp": 1.00021386, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 1.8523823284932794, + "language_loss": 0.80486333, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82283735, + "num_input_tokens_seen": 298288105, + "step": 13826, + "time_per_iteration": 2.7571563720703125 + }, + { + "auxiliary_loss_clip": 0.01097181, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.03316271, + "balance_loss_mlp": 1.01812387, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.8425562031092575, + "language_loss": 0.680372, + "learning_rate": 2.910116396226914e-07, + "loss": 0.70163524, + "num_input_tokens_seen": 298307600, + "step": 13827, + "time_per_iteration": 2.589905023574829 + }, + { + "auxiliary_loss_clip": 0.0108558, + "auxiliary_loss_mlp": 0.01026007, + "balance_loss_clip": 1.03198314, + "balance_loss_mlp": 1.01598144, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 2.0126342822223817, + "language_loss": 0.7406249, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.76174068, + "num_input_tokens_seen": 298323055, + "step": 13828, + "time_per_iteration": 2.5519134998321533 + }, + { + "auxiliary_loss_clip": 0.01059137, + "auxiliary_loss_mlp": 0.01033748, + "balance_loss_clip": 1.0305568, + "balance_loss_mlp": 1.02174997, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 1.6471609936600966, + "language_loss": 0.6709699, + "learning_rate": 2.906071489597657e-07, + "loss": 0.6918987, + "num_input_tokens_seen": 298346950, + "step": 13829, + "time_per_iteration": 2.84999942779541 + }, + { + "auxiliary_loss_clip": 0.0107452, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.03311038, + "balance_loss_mlp": 1.01732659, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.6794184378752068, + "language_loss": 0.82621288, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84724915, + "num_input_tokens_seen": 298366315, + "step": 13830, + "time_per_iteration": 4.02931022644043 + }, + { + "auxiliary_loss_clip": 0.0108511, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.03281283, + "balance_loss_mlp": 1.01830125, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 2.239529251961864, + "language_loss": 0.74250269, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76364625, + "num_input_tokens_seen": 298385185, + "step": 13831, + "time_per_iteration": 2.538252592086792 + }, + { + "auxiliary_loss_clip": 0.01098578, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.03458917, + "balance_loss_mlp": 1.01987958, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.9951624546631577, + "language_loss": 0.71429062, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73558986, + "num_input_tokens_seen": 298402335, + "step": 13832, + "time_per_iteration": 2.5198793411254883 + }, + { + "auxiliary_loss_clip": 0.01067815, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.0302, + "balance_loss_mlp": 1.02124262, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 2.7728327486580526, + "language_loss": 0.8417688, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86276937, + "num_input_tokens_seen": 298423370, + "step": 13833, + "time_per_iteration": 2.640312433242798 + }, + { + "auxiliary_loss_clip": 0.01079385, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.03435624, + "balance_loss_mlp": 1.02210259, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.5061416717470568, + "language_loss": 0.76186109, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78298867, + "num_input_tokens_seen": 298444835, + "step": 13834, + "time_per_iteration": 2.714355707168579 + }, + { + "auxiliary_loss_clip": 0.01093791, + "auxiliary_loss_mlp": 0.00749324, + "balance_loss_clip": 1.03221846, + "balance_loss_mlp": 1.00031376, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 2.0428602660037205, + "language_loss": 0.79726285, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81569397, + "num_input_tokens_seen": 298461845, + "step": 13835, + "time_per_iteration": 2.6454033851623535 + }, + { + "auxiliary_loss_clip": 0.01093002, + "auxiliary_loss_mlp": 0.0103488, + "balance_loss_clip": 1.03641295, + "balance_loss_mlp": 1.02236342, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 2.6777197648983333, + "language_loss": 0.8085838, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82986259, + "num_input_tokens_seen": 298479095, + "step": 13836, + "time_per_iteration": 2.5148861408233643 + }, + { + "auxiliary_loss_clip": 0.01074574, + "auxiliary_loss_mlp": 0.0102779, + "balance_loss_clip": 1.03315878, + "balance_loss_mlp": 1.01758027, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 1.9681065675767975, + "language_loss": 0.77925825, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.80028188, + "num_input_tokens_seen": 298494475, + "step": 13837, + "time_per_iteration": 2.6908605098724365 + }, + { + "auxiliary_loss_clip": 0.01101513, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.03387606, + "balance_loss_mlp": 1.01762617, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.9877568365485414, + "language_loss": 0.83428097, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85559654, + "num_input_tokens_seen": 298513185, + "step": 13838, + "time_per_iteration": 2.5933539867401123 + }, + { + "auxiliary_loss_clip": 0.01065326, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.03005493, + "balance_loss_mlp": 1.02028406, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 2.6692885069258363, + "language_loss": 0.74301088, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76398623, + "num_input_tokens_seen": 298531885, + "step": 13839, + "time_per_iteration": 4.121676206588745 + }, + { + "auxiliary_loss_clip": 0.01082082, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.03262866, + "balance_loss_mlp": 1.0168736, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.5127925139654939, + "language_loss": 0.67850596, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.69961166, + "num_input_tokens_seen": 298554905, + "step": 13840, + "time_per_iteration": 2.680182695388794 + }, + { + "auxiliary_loss_clip": 0.01038782, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.02923417, + "balance_loss_mlp": 1.01816463, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 1.9677802425542048, + "language_loss": 0.79520702, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81590426, + "num_input_tokens_seen": 298571185, + "step": 13841, + "time_per_iteration": 2.6113991737365723 + }, + { + "auxiliary_loss_clip": 0.01050822, + "auxiliary_loss_mlp": 0.01028532, + "balance_loss_clip": 1.03494513, + "balance_loss_mlp": 1.01749337, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.815340056113254, + "language_loss": 0.68617326, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70696676, + "num_input_tokens_seen": 298588505, + "step": 13842, + "time_per_iteration": 2.6572425365448 + }, + { + "auxiliary_loss_clip": 0.01078053, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.0348146, + "balance_loss_mlp": 1.01635075, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 1.6859368281573996, + "language_loss": 0.73052412, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75158668, + "num_input_tokens_seen": 298609295, + "step": 13843, + "time_per_iteration": 2.653588056564331 + }, + { + "auxiliary_loss_clip": 0.01065113, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.03307498, + "balance_loss_mlp": 1.01837635, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 2.4767953570924597, + "language_loss": 0.77291286, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79385829, + "num_input_tokens_seen": 298625765, + "step": 13844, + "time_per_iteration": 2.5868823528289795 + }, + { + "auxiliary_loss_clip": 0.01003913, + "auxiliary_loss_mlp": 0.01002291, + "balance_loss_clip": 1.00444245, + "balance_loss_mlp": 1.00131917, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.7754537922623433, + "language_loss": 0.55222541, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57228744, + "num_input_tokens_seen": 298683005, + "step": 13845, + "time_per_iteration": 3.0668680667877197 + }, + { + "auxiliary_loss_clip": 0.01089849, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.03388238, + "balance_loss_mlp": 1.03331065, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 1.5835619808517594, + "language_loss": 0.75570798, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77706009, + "num_input_tokens_seen": 298703060, + "step": 13846, + "time_per_iteration": 2.6283812522888184 + }, + { + "auxiliary_loss_clip": 0.01038255, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.02693737, + "balance_loss_mlp": 1.01796162, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.5503925968834682, + "language_loss": 0.78692102, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80762434, + "num_input_tokens_seen": 298721765, + "step": 13847, + "time_per_iteration": 2.6477913856506348 + }, + { + "auxiliary_loss_clip": 0.01042617, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.03461504, + "balance_loss_mlp": 1.01631868, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.6220528387920348, + "language_loss": 0.74707937, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76777357, + "num_input_tokens_seen": 298740825, + "step": 13848, + "time_per_iteration": 2.7757866382598877 + }, + { + "auxiliary_loss_clip": 0.01087311, + "auxiliary_loss_mlp": 0.01028512, + "balance_loss_clip": 1.03438687, + "balance_loss_mlp": 1.01802742, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 2.298934287101978, + "language_loss": 0.63533759, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65649581, + "num_input_tokens_seen": 298758515, + "step": 13849, + "time_per_iteration": 2.5335521697998047 + }, + { + "auxiliary_loss_clip": 0.01069239, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.0291822, + "balance_loss_mlp": 1.01856685, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 2.0530631206341963, + "language_loss": 0.79327047, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81426674, + "num_input_tokens_seen": 298776375, + "step": 13850, + "time_per_iteration": 2.5935277938842773 + }, + { + "auxiliary_loss_clip": 0.01055167, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.02965164, + "balance_loss_mlp": 1.0222578, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.6176956185074056, + "language_loss": 0.78282773, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.8037039, + "num_input_tokens_seen": 298795135, + "step": 13851, + "time_per_iteration": 2.6259448528289795 + }, + { + "auxiliary_loss_clip": 0.01015601, + "auxiliary_loss_mlp": 0.01008104, + "balance_loss_clip": 1.00540435, + "balance_loss_mlp": 1.00705457, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7641128003339122, + "language_loss": 0.55810487, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57834196, + "num_input_tokens_seen": 298855475, + "step": 13852, + "time_per_iteration": 4.607764482498169 + }, + { + "auxiliary_loss_clip": 0.010851, + "auxiliary_loss_mlp": 0.01028276, + "balance_loss_clip": 1.03302526, + "balance_loss_mlp": 1.017452, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 1.8060595231247292, + "language_loss": 0.67121458, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69234842, + "num_input_tokens_seen": 298875875, + "step": 13853, + "time_per_iteration": 2.6634674072265625 + }, + { + "auxiliary_loss_clip": 0.01069324, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.03358769, + "balance_loss_mlp": 1.01937616, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.5469754801793827, + "language_loss": 0.78620267, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80719686, + "num_input_tokens_seen": 298895950, + "step": 13854, + "time_per_iteration": 2.6303515434265137 + }, + { + "auxiliary_loss_clip": 0.01023891, + "auxiliary_loss_mlp": 0.01009258, + "balance_loss_clip": 1.00407767, + "balance_loss_mlp": 1.0083102, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7824009885136086, + "language_loss": 0.58680052, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60713196, + "num_input_tokens_seen": 298955770, + "step": 13855, + "time_per_iteration": 2.9775280952453613 + }, + { + "auxiliary_loss_clip": 0.01085194, + "auxiliary_loss_mlp": 0.0102472, + "balance_loss_clip": 1.03310418, + "balance_loss_mlp": 1.01343083, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.808928890203285, + "language_loss": 0.7144016, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73550081, + "num_input_tokens_seen": 298976545, + "step": 13856, + "time_per_iteration": 2.542405128479004 + }, + { + "auxiliary_loss_clip": 0.01087333, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.03342044, + "balance_loss_mlp": 1.01670825, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.5776133426206727, + "language_loss": 0.75367486, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77482599, + "num_input_tokens_seen": 298996750, + "step": 13857, + "time_per_iteration": 2.5918335914611816 + }, + { + "auxiliary_loss_clip": 0.01046838, + "auxiliary_loss_mlp": 0.01025762, + "balance_loss_clip": 1.03146124, + "balance_loss_mlp": 1.01588595, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.7198456115117202, + "language_loss": 0.73140001, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75212598, + "num_input_tokens_seen": 299014895, + "step": 13858, + "time_per_iteration": 2.7003533840179443 + }, + { + "auxiliary_loss_clip": 0.01100846, + "auxiliary_loss_mlp": 0.01037435, + "balance_loss_clip": 1.03374588, + "balance_loss_mlp": 1.02550173, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 2.0519163962151032, + "language_loss": 0.73250341, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75388622, + "num_input_tokens_seen": 299032855, + "step": 13859, + "time_per_iteration": 2.573686361312866 + }, + { + "auxiliary_loss_clip": 0.01081014, + "auxiliary_loss_mlp": 0.01023007, + "balance_loss_clip": 1.03096664, + "balance_loss_mlp": 1.01289201, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.6073235998882809, + "language_loss": 0.78955144, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81059164, + "num_input_tokens_seen": 299052055, + "step": 13860, + "time_per_iteration": 2.6261954307556152 + }, + { + "auxiliary_loss_clip": 0.01028956, + "auxiliary_loss_mlp": 0.01029746, + "balance_loss_clip": 1.03344774, + "balance_loss_mlp": 1.01893377, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.6140809863204721, + "language_loss": 0.82124865, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84183574, + "num_input_tokens_seen": 299075285, + "step": 13861, + "time_per_iteration": 2.7957189083099365 + }, + { + "auxiliary_loss_clip": 0.01099838, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.03576732, + "balance_loss_mlp": 1.02033854, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.8839923973761874, + "language_loss": 0.78683203, + "learning_rate": 2.839705324021806e-07, + "loss": 0.80814368, + "num_input_tokens_seen": 299092520, + "step": 13862, + "time_per_iteration": 2.475869655609131 + }, + { + "auxiliary_loss_clip": 0.01087412, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.03276777, + "balance_loss_mlp": 1.02006888, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 1.811365692814177, + "language_loss": 0.75098252, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77217078, + "num_input_tokens_seen": 299109450, + "step": 13863, + "time_per_iteration": 2.539959669113159 + }, + { + "auxiliary_loss_clip": 0.01032401, + "auxiliary_loss_mlp": 0.00749518, + "balance_loss_clip": 1.02819657, + "balance_loss_mlp": 1.00030446, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.1254292448964853, + "language_loss": 0.7531057, + "learning_rate": 2.835705879864232e-07, + "loss": 0.77092493, + "num_input_tokens_seen": 299129540, + "step": 13864, + "time_per_iteration": 4.11944580078125 + }, + { + "auxiliary_loss_clip": 0.01074611, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.03205955, + "balance_loss_mlp": 1.0206548, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.8468895285295732, + "language_loss": 0.69437671, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71544427, + "num_input_tokens_seen": 299148670, + "step": 13865, + "time_per_iteration": 2.612671136856079 + }, + { + "auxiliary_loss_clip": 0.01087763, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.03432119, + "balance_loss_mlp": 1.01815569, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.6611549973598043, + "language_loss": 0.75195026, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77311957, + "num_input_tokens_seen": 299169330, + "step": 13866, + "time_per_iteration": 2.707860231399536 + }, + { + "auxiliary_loss_clip": 0.01003899, + "auxiliary_loss_mlp": 0.01002285, + "balance_loss_clip": 1.00652242, + "balance_loss_mlp": 1.00090778, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8732510356877855, + "language_loss": 0.63232851, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65239036, + "num_input_tokens_seen": 299220980, + "step": 13867, + "time_per_iteration": 3.0507450103759766 + }, + { + "auxiliary_loss_clip": 0.0107206, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.03195357, + "balance_loss_mlp": 1.02124, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 3.9126025518983982, + "language_loss": 0.72204679, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74308348, + "num_input_tokens_seen": 299240130, + "step": 13868, + "time_per_iteration": 2.6063528060913086 + }, + { + "auxiliary_loss_clip": 0.01075874, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.03666711, + "balance_loss_mlp": 1.01709318, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.4796942962913846, + "language_loss": 0.80375224, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82479268, + "num_input_tokens_seen": 299260705, + "step": 13869, + "time_per_iteration": 2.631488800048828 + }, + { + "auxiliary_loss_clip": 0.01090564, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.0361526, + "balance_loss_mlp": 1.02323484, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.8658604755091952, + "language_loss": 0.82509077, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84634173, + "num_input_tokens_seen": 299278925, + "step": 13870, + "time_per_iteration": 4.103020906448364 + }, + { + "auxiliary_loss_clip": 0.01064826, + "auxiliary_loss_mlp": 0.01028333, + "balance_loss_clip": 1.03268838, + "balance_loss_mlp": 1.01608467, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.9664891204967745, + "language_loss": 0.69993502, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72086656, + "num_input_tokens_seen": 299291580, + "step": 13871, + "time_per_iteration": 2.508110284805298 + }, + { + "auxiliary_loss_clip": 0.01085882, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.03479934, + "balance_loss_mlp": 1.01602578, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 1.7526381906055466, + "language_loss": 0.69210541, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.71322995, + "num_input_tokens_seen": 299310385, + "step": 13872, + "time_per_iteration": 2.5418386459350586 + }, + { + "auxiliary_loss_clip": 0.01074324, + "auxiliary_loss_mlp": 0.0102631, + "balance_loss_clip": 1.03263259, + "balance_loss_mlp": 1.01553965, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 1.9463175523785605, + "language_loss": 0.73211145, + "learning_rate": 2.817740608055712e-07, + "loss": 0.7531178, + "num_input_tokens_seen": 299327660, + "step": 13873, + "time_per_iteration": 2.624868869781494 + }, + { + "auxiliary_loss_clip": 0.01077153, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.01908803, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 2.0709414914746205, + "language_loss": 0.75340623, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77449739, + "num_input_tokens_seen": 299343685, + "step": 13874, + "time_per_iteration": 2.7609732151031494 + }, + { + "auxiliary_loss_clip": 0.0106577, + "auxiliary_loss_mlp": 0.01028035, + "balance_loss_clip": 1.03122151, + "balance_loss_mlp": 1.01743746, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 2.0263170536339548, + "language_loss": 0.66250366, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68344176, + "num_input_tokens_seen": 299363305, + "step": 13875, + "time_per_iteration": 2.588953971862793 + }, + { + "auxiliary_loss_clip": 0.01047887, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.02988839, + "balance_loss_mlp": 1.02159131, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.9330036154167016, + "language_loss": 0.79650748, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81731331, + "num_input_tokens_seen": 299382630, + "step": 13876, + "time_per_iteration": 2.6424033641815186 + }, + { + "auxiliary_loss_clip": 0.01078581, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.03156805, + "balance_loss_mlp": 1.02357697, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 1.9425259095009435, + "language_loss": 0.87743616, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89858127, + "num_input_tokens_seen": 299402385, + "step": 13877, + "time_per_iteration": 2.5413544178009033 + }, + { + "auxiliary_loss_clip": 0.01049232, + "auxiliary_loss_mlp": 0.0102504, + "balance_loss_clip": 1.03006291, + "balance_loss_mlp": 1.01440001, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 1.8613570400720694, + "language_loss": 0.69136864, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71211135, + "num_input_tokens_seen": 299419820, + "step": 13878, + "time_per_iteration": 4.068578720092773 + }, + { + "auxiliary_loss_clip": 0.01074687, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.03433919, + "balance_loss_mlp": 1.01636636, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 1.9539042937072175, + "language_loss": 0.79533911, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81635422, + "num_input_tokens_seen": 299436265, + "step": 13879, + "time_per_iteration": 2.548938751220703 + }, + { + "auxiliary_loss_clip": 0.01049479, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.03363681, + "balance_loss_mlp": 1.01786232, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 1.9498825424407018, + "language_loss": 0.82894289, + "learning_rate": 2.803804103009828e-07, + "loss": 0.84971678, + "num_input_tokens_seen": 299451660, + "step": 13880, + "time_per_iteration": 2.6112313270568848 + }, + { + "auxiliary_loss_clip": 0.01077596, + "auxiliary_loss_mlp": 0.01026809, + "balance_loss_clip": 1.03290606, + "balance_loss_mlp": 1.01616371, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.6044114552820892, + "language_loss": 0.78169513, + "learning_rate": 2.80181578143982e-07, + "loss": 0.8027392, + "num_input_tokens_seen": 299472070, + "step": 13881, + "time_per_iteration": 2.624857187271118 + }, + { + "auxiliary_loss_clip": 0.01052131, + "auxiliary_loss_mlp": 0.01021597, + "balance_loss_clip": 1.03148472, + "balance_loss_mlp": 1.012007, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.3114848197207163, + "language_loss": 0.78132069, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80205798, + "num_input_tokens_seen": 299486725, + "step": 13882, + "time_per_iteration": 2.69296932220459 + }, + { + "auxiliary_loss_clip": 0.010558, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_clip": 1.03033948, + "balance_loss_mlp": 1.03199172, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.7895438102829964, + "language_loss": 0.80798334, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82898587, + "num_input_tokens_seen": 299505435, + "step": 13883, + "time_per_iteration": 2.6068496704101562 + }, + { + "auxiliary_loss_clip": 0.01084846, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.0324899, + "balance_loss_mlp": 1.01756585, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 1.8969056573048622, + "language_loss": 0.74105221, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76219261, + "num_input_tokens_seen": 299523555, + "step": 13884, + "time_per_iteration": 2.5212273597717285 + }, + { + "auxiliary_loss_clip": 0.01084498, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.03565788, + "balance_loss_mlp": 1.02099085, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 2.0693844766668197, + "language_loss": 0.70075428, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72193718, + "num_input_tokens_seen": 299541660, + "step": 13885, + "time_per_iteration": 2.6328718662261963 + }, + { + "auxiliary_loss_clip": 0.0106598, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.03251672, + "balance_loss_mlp": 1.01778615, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 2.4372384694677276, + "language_loss": 0.70272577, + "learning_rate": 2.791883957449912e-07, + "loss": 0.72367907, + "num_input_tokens_seen": 299562465, + "step": 13886, + "time_per_iteration": 2.6997904777526855 + }, + { + "auxiliary_loss_clip": 0.01058008, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.03030562, + "balance_loss_mlp": 1.01853085, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.5841874863986087, + "language_loss": 0.7915163, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81240678, + "num_input_tokens_seen": 299582700, + "step": 13887, + "time_per_iteration": 2.586942672729492 + }, + { + "auxiliary_loss_clip": 0.01083007, + "auxiliary_loss_mlp": 0.00749465, + "balance_loss_clip": 1.03689337, + "balance_loss_mlp": 1.00020468, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.129559616731613, + "language_loss": 0.63798642, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.65631115, + "num_input_tokens_seen": 299600310, + "step": 13888, + "time_per_iteration": 2.583487033843994 + }, + { + "auxiliary_loss_clip": 0.01076487, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.03254068, + "balance_loss_mlp": 1.01490068, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 2.3224205153153887, + "language_loss": 0.66444403, + "learning_rate": 2.785932692855244e-07, + "loss": 0.68547046, + "num_input_tokens_seen": 299617025, + "step": 13889, + "time_per_iteration": 2.6657180786132812 + }, + { + "auxiliary_loss_clip": 0.01075343, + "auxiliary_loss_mlp": 0.01024888, + "balance_loss_clip": 1.0300169, + "balance_loss_mlp": 1.01403999, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 2.0262169144584448, + "language_loss": 0.68572772, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70673001, + "num_input_tokens_seen": 299633050, + "step": 13890, + "time_per_iteration": 2.526827335357666 + }, + { + "auxiliary_loss_clip": 0.01076273, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.03360224, + "balance_loss_mlp": 1.02152574, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.434807851150909, + "language_loss": 0.59268469, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61377835, + "num_input_tokens_seen": 299646445, + "step": 13891, + "time_per_iteration": 2.5342447757720947 + }, + { + "auxiliary_loss_clip": 0.0108751, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.03398657, + "balance_loss_mlp": 1.01896012, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.6486837382488149, + "language_loss": 0.71768165, + "learning_rate": 2.779987303092846e-07, + "loss": 0.738855, + "num_input_tokens_seen": 299662665, + "step": 13892, + "time_per_iteration": 2.540461778640747 + }, + { + "auxiliary_loss_clip": 0.01094748, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.03323412, + "balance_loss_mlp": 1.016711, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.6226894617420673, + "language_loss": 0.66052675, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68175262, + "num_input_tokens_seen": 299683585, + "step": 13893, + "time_per_iteration": 4.0962536334991455 + }, + { + "auxiliary_loss_clip": 0.01065496, + "auxiliary_loss_mlp": 0.01024815, + "balance_loss_clip": 1.02997804, + "balance_loss_mlp": 1.01391971, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 1.9994329523236252, + "language_loss": 0.78037614, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80127931, + "num_input_tokens_seen": 299702680, + "step": 13894, + "time_per_iteration": 2.5760626792907715 + }, + { + "auxiliary_loss_clip": 0.01076591, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.03308153, + "balance_loss_mlp": 1.01510978, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.693530949550113, + "language_loss": 0.72653949, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74756604, + "num_input_tokens_seen": 299721050, + "step": 13895, + "time_per_iteration": 2.544260263442993 + }, + { + "auxiliary_loss_clip": 0.01085683, + "auxiliary_loss_mlp": 0.01039834, + "balance_loss_clip": 1.03386569, + "balance_loss_mlp": 1.02751994, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 1.8947374356197828, + "language_loss": 0.72186428, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74311942, + "num_input_tokens_seen": 299738255, + "step": 13896, + "time_per_iteration": 2.565751075744629 + }, + { + "auxiliary_loss_clip": 0.01083177, + "auxiliary_loss_mlp": 0.0102614, + "balance_loss_clip": 1.03170633, + "balance_loss_mlp": 1.01560175, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.2335574239543337, + "language_loss": 0.59084433, + "learning_rate": 2.770091380848423e-07, + "loss": 0.61193752, + "num_input_tokens_seen": 299761315, + "step": 13897, + "time_per_iteration": 2.8039824962615967 + }, + { + "auxiliary_loss_clip": 0.01023498, + "auxiliary_loss_mlp": 0.0074655, + "balance_loss_clip": 1.00352407, + "balance_loss_mlp": 0.99976462, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.7009321526003008, + "language_loss": 0.57651985, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59422028, + "num_input_tokens_seen": 299828735, + "step": 13898, + "time_per_iteration": 3.132310390472412 + }, + { + "auxiliary_loss_clip": 0.01082924, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.03545046, + "balance_loss_mlp": 1.02116895, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.7047649585180888, + "language_loss": 0.79829836, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.81945634, + "num_input_tokens_seen": 299848395, + "step": 13899, + "time_per_iteration": 2.5068511962890625 + }, + { + "auxiliary_loss_clip": 0.01098366, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.03383684, + "balance_loss_mlp": 1.02035308, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.6693508596903224, + "language_loss": 0.68829066, + "learning_rate": 2.764161667219749e-07, + "loss": 0.70958185, + "num_input_tokens_seen": 299871665, + "step": 13900, + "time_per_iteration": 2.722072124481201 + }, + { + "auxiliary_loss_clip": 0.01073926, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.03352177, + "balance_loss_mlp": 1.01783466, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.537318721779136, + "language_loss": 0.71356142, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73458588, + "num_input_tokens_seen": 299891960, + "step": 13901, + "time_per_iteration": 2.604052782058716 + }, + { + "auxiliary_loss_clip": 0.01050206, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.02979136, + "balance_loss_mlp": 1.02333283, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.147889448543305, + "language_loss": 0.80140376, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82225913, + "num_input_tokens_seen": 299905070, + "step": 13902, + "time_per_iteration": 2.6442694664001465 + }, + { + "auxiliary_loss_clip": 0.01083639, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.0326184, + "balance_loss_mlp": 1.02020121, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.6160875594347013, + "language_loss": 0.62812233, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64926791, + "num_input_tokens_seen": 299925130, + "step": 13903, + "time_per_iteration": 2.569260597229004 + }, + { + "auxiliary_loss_clip": 0.0106938, + "auxiliary_loss_mlp": 0.01034642, + "balance_loss_clip": 1.03092635, + "balance_loss_mlp": 1.02237582, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 6.280099832922468, + "language_loss": 0.74187618, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76291645, + "num_input_tokens_seen": 299943845, + "step": 13904, + "time_per_iteration": 4.1492393016815186 + }, + { + "auxiliary_loss_clip": 0.01068557, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.03016984, + "balance_loss_mlp": 1.01765633, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.748742144334997, + "language_loss": 0.72568363, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74666029, + "num_input_tokens_seen": 299961620, + "step": 13905, + "time_per_iteration": 2.634282350540161 + }, + { + "auxiliary_loss_clip": 0.01083566, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.03388524, + "balance_loss_mlp": 1.0248127, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.7692946048327791, + "language_loss": 0.66444963, + "learning_rate": 2.752319888771e-07, + "loss": 0.6856299, + "num_input_tokens_seen": 299982170, + "step": 13906, + "time_per_iteration": 2.574720859527588 + }, + { + "auxiliary_loss_clip": 0.01084307, + "auxiliary_loss_mlp": 0.01024617, + "balance_loss_clip": 1.03234196, + "balance_loss_mlp": 1.01374531, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.40293166449436, + "language_loss": 0.74172008, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.7628094, + "num_input_tokens_seen": 300001330, + "step": 13907, + "time_per_iteration": 2.491175651550293 + }, + { + "auxiliary_loss_clip": 0.01065214, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.03212214, + "balance_loss_mlp": 1.02243233, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 7.746810540698093, + "language_loss": 0.75299042, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77397764, + "num_input_tokens_seen": 300020645, + "step": 13908, + "time_per_iteration": 2.620112657546997 + }, + { + "auxiliary_loss_clip": 0.01087492, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03356278, + "balance_loss_mlp": 1.01680195, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 2.1066470140025886, + "language_loss": 0.71445465, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73562306, + "num_input_tokens_seen": 300039945, + "step": 13909, + "time_per_iteration": 2.540555000305176 + }, + { + "auxiliary_loss_clip": 0.01100252, + "auxiliary_loss_mlp": 0.00749522, + "balance_loss_clip": 1.03417349, + "balance_loss_mlp": 1.00030577, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 2.4523209362708687, + "language_loss": 0.73290825, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75140601, + "num_input_tokens_seen": 300058260, + "step": 13910, + "time_per_iteration": 4.012665510177612 + }, + { + "auxiliary_loss_clip": 0.01079507, + "auxiliary_loss_mlp": 0.00749725, + "balance_loss_clip": 1.03149343, + "balance_loss_mlp": 1.00028181, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 2.0267785568232632, + "language_loss": 0.73281097, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75110328, + "num_input_tokens_seen": 300076720, + "step": 13911, + "time_per_iteration": 2.5429787635803223 + }, + { + "auxiliary_loss_clip": 0.01079917, + "auxiliary_loss_mlp": 0.01037516, + "balance_loss_clip": 1.03470111, + "balance_loss_mlp": 1.02658415, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 4.597939860563702, + "language_loss": 0.79112035, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81229466, + "num_input_tokens_seen": 300092950, + "step": 13912, + "time_per_iteration": 2.5693211555480957 + }, + { + "auxiliary_loss_clip": 0.01088758, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.0342952, + "balance_loss_mlp": 1.0187149, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.7727957105812386, + "language_loss": 0.78744721, + "learning_rate": 2.738534240246797e-07, + "loss": 0.80862677, + "num_input_tokens_seen": 300110950, + "step": 13913, + "time_per_iteration": 2.5203745365142822 + }, + { + "auxiliary_loss_clip": 0.01084353, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.03138876, + "balance_loss_mlp": 1.02158475, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 2.0532820346148335, + "language_loss": 0.7361142, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75728947, + "num_input_tokens_seen": 300128705, + "step": 13914, + "time_per_iteration": 2.5250895023345947 + }, + { + "auxiliary_loss_clip": 0.01050484, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.03329992, + "balance_loss_mlp": 1.02366996, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.5715505633048859, + "language_loss": 0.70998156, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73083735, + "num_input_tokens_seen": 300148635, + "step": 13915, + "time_per_iteration": 2.655047655105591 + }, + { + "auxiliary_loss_clip": 0.01064966, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.03191614, + "balance_loss_mlp": 1.01775062, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.7995338590950745, + "language_loss": 0.72655112, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74748141, + "num_input_tokens_seen": 300165490, + "step": 13916, + "time_per_iteration": 2.6347837448120117 + }, + { + "auxiliary_loss_clip": 0.01065155, + "auxiliary_loss_mlp": 0.00749343, + "balance_loss_clip": 1.03314257, + "balance_loss_mlp": 1.00020266, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 1.9792591183839896, + "language_loss": 0.7463423, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76448727, + "num_input_tokens_seen": 300182130, + "step": 13917, + "time_per_iteration": 2.550678253173828 + }, + { + "auxiliary_loss_clip": 0.01094183, + "auxiliary_loss_mlp": 0.01028233, + "balance_loss_clip": 1.0343821, + "balance_loss_mlp": 1.0177958, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.8066259926074977, + "language_loss": 0.79321444, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81443858, + "num_input_tokens_seen": 300203050, + "step": 13918, + "time_per_iteration": 2.529543876647949 + }, + { + "auxiliary_loss_clip": 0.01056272, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.03397548, + "balance_loss_mlp": 1.0221349, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.591581893200503, + "language_loss": 0.68049908, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70139265, + "num_input_tokens_seen": 300224380, + "step": 13919, + "time_per_iteration": 4.388173580169678 + }, + { + "auxiliary_loss_clip": 0.01076749, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.02974379, + "balance_loss_mlp": 1.02234793, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1.6553607317248673, + "language_loss": 0.73853374, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75964284, + "num_input_tokens_seen": 300242915, + "step": 13920, + "time_per_iteration": 2.5956709384918213 + }, + { + "auxiliary_loss_clip": 0.01074795, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.03129768, + "balance_loss_mlp": 1.02158213, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 34.269627405840005, + "language_loss": 0.69130808, + "learning_rate": 2.722818488237566e-07, + "loss": 0.71238399, + "num_input_tokens_seen": 300261905, + "step": 13921, + "time_per_iteration": 2.7462892532348633 + }, + { + "auxiliary_loss_clip": 0.01090509, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03456998, + "balance_loss_mlp": 1.02035236, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.9197422290664874, + "language_loss": 0.84991276, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87113208, + "num_input_tokens_seen": 300281145, + "step": 13922, + "time_per_iteration": 2.5306973457336426 + }, + { + "auxiliary_loss_clip": 0.01055948, + "auxiliary_loss_mlp": 0.00749212, + "balance_loss_clip": 1.03168964, + "balance_loss_mlp": 1.00028586, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.7776930334912495, + "language_loss": 0.71705091, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73510247, + "num_input_tokens_seen": 300301610, + "step": 13923, + "time_per_iteration": 2.622004270553589 + }, + { + "auxiliary_loss_clip": 0.01073313, + "auxiliary_loss_mlp": 0.01026752, + "balance_loss_clip": 1.03455126, + "balance_loss_mlp": 1.01536131, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 2.035973885203376, + "language_loss": 0.76075387, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78175449, + "num_input_tokens_seen": 300319420, + "step": 13924, + "time_per_iteration": 2.6638007164001465 + }, + { + "auxiliary_loss_clip": 0.01073077, + "auxiliary_loss_mlp": 0.01025836, + "balance_loss_clip": 1.03100657, + "balance_loss_mlp": 1.01507163, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.5074909717000844, + "language_loss": 0.6451689, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66615802, + "num_input_tokens_seen": 300341325, + "step": 13925, + "time_per_iteration": 2.640748977661133 + }, + { + "auxiliary_loss_clip": 0.01076482, + "auxiliary_loss_mlp": 0.01028721, + "balance_loss_clip": 1.03465343, + "balance_loss_mlp": 1.01787329, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 2.1140910048419848, + "language_loss": 0.74531806, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76637012, + "num_input_tokens_seen": 300361620, + "step": 13926, + "time_per_iteration": 2.6328723430633545 + }, + { + "auxiliary_loss_clip": 0.01087822, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.03504133, + "balance_loss_mlp": 1.02213526, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.6794684697100617, + "language_loss": 0.71417892, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73539394, + "num_input_tokens_seen": 300378675, + "step": 13927, + "time_per_iteration": 2.6418933868408203 + }, + { + "auxiliary_loss_clip": 0.00993226, + "auxiliary_loss_mlp": 0.01000104, + "balance_loss_clip": 1.00451732, + "balance_loss_mlp": 0.99903661, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.6977254231619691, + "language_loss": 0.58789402, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60782737, + "num_input_tokens_seen": 300449740, + "step": 13928, + "time_per_iteration": 3.3334591388702393 + }, + { + "auxiliary_loss_clip": 0.01059401, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.03419518, + "balance_loss_mlp": 1.02322805, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.740761653809868, + "language_loss": 0.69530207, + "learning_rate": 2.707144665977068e-07, + "loss": 0.7162534, + "num_input_tokens_seen": 300470000, + "step": 13929, + "time_per_iteration": 2.6252243518829346 + }, + { + "auxiliary_loss_clip": 0.01088003, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.03382123, + "balance_loss_mlp": 1.01416397, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.7693496483974394, + "language_loss": 0.66865647, + "learning_rate": 2.705188388275574e-07, + "loss": 0.689798, + "num_input_tokens_seen": 300494975, + "step": 13930, + "time_per_iteration": 2.7490131855010986 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01024787, + "balance_loss_clip": 1.03549576, + "balance_loss_mlp": 1.01435065, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.6530665098318726, + "language_loss": 0.7101301, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73094052, + "num_input_tokens_seen": 300513175, + "step": 13931, + "time_per_iteration": 2.702207326889038 + }, + { + "auxiliary_loss_clip": 0.01058642, + "auxiliary_loss_mlp": 0.01027764, + "balance_loss_clip": 1.02962041, + "balance_loss_mlp": 1.01653409, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.7014793008887066, + "language_loss": 0.71845293, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73931706, + "num_input_tokens_seen": 300533770, + "step": 13932, + "time_per_iteration": 2.6641716957092285 + }, + { + "auxiliary_loss_clip": 0.01032417, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.03151405, + "balance_loss_mlp": 1.02310395, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 1.9778726211977198, + "language_loss": 0.66594386, + "learning_rate": 2.699323490393628e-07, + "loss": 0.68659914, + "num_input_tokens_seen": 300552995, + "step": 13933, + "time_per_iteration": 4.179757595062256 + }, + { + "auxiliary_loss_clip": 0.01066586, + "auxiliary_loss_mlp": 0.01038492, + "balance_loss_clip": 1.03348851, + "balance_loss_mlp": 1.02756, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 1.975751285432502, + "language_loss": 0.76510835, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78615916, + "num_input_tokens_seen": 300570275, + "step": 13934, + "time_per_iteration": 2.550687313079834 + }, + { + "auxiliary_loss_clip": 0.01088834, + "auxiliary_loss_mlp": 0.01028068, + "balance_loss_clip": 1.03740478, + "balance_loss_mlp": 1.01720226, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 1.6276625952235964, + "language_loss": 0.77425885, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79542792, + "num_input_tokens_seen": 300590875, + "step": 13935, + "time_per_iteration": 2.5622715950012207 + }, + { + "auxiliary_loss_clip": 0.01055757, + "auxiliary_loss_mlp": 0.01028387, + "balance_loss_clip": 1.03168273, + "balance_loss_mlp": 1.01726532, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 2.9012001713902387, + "language_loss": 0.56088877, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.58173025, + "num_input_tokens_seen": 300607490, + "step": 13936, + "time_per_iteration": 2.6068484783172607 + }, + { + "auxiliary_loss_clip": 0.01076458, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.0295856, + "balance_loss_mlp": 1.02071428, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.9556341566711932, + "language_loss": 0.89283216, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91392207, + "num_input_tokens_seen": 300623635, + "step": 13937, + "time_per_iteration": 2.516710042953491 + }, + { + "auxiliary_loss_clip": 0.01088202, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.03406048, + "balance_loss_mlp": 1.01774299, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 10.58337561782411, + "language_loss": 0.81645477, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83762318, + "num_input_tokens_seen": 300643835, + "step": 13938, + "time_per_iteration": 2.576718807220459 + }, + { + "auxiliary_loss_clip": 0.01090373, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.03438282, + "balance_loss_mlp": 1.01833725, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.795531471337577, + "language_loss": 0.7070294, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72823155, + "num_input_tokens_seen": 300662500, + "step": 13939, + "time_per_iteration": 2.6056814193725586 + }, + { + "auxiliary_loss_clip": 0.0106493, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.03195453, + "balance_loss_mlp": 1.02400136, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 1.6166765029714536, + "language_loss": 0.75848722, + "learning_rate": 2.6856616936428e-07, + "loss": 0.77949589, + "num_input_tokens_seen": 300681480, + "step": 13940, + "time_per_iteration": 2.6520471572875977 + }, + { + "auxiliary_loss_clip": 0.01080376, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.03240585, + "balance_loss_mlp": 1.01932764, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.693243033944574, + "language_loss": 0.76515663, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78626657, + "num_input_tokens_seen": 300699165, + "step": 13941, + "time_per_iteration": 2.582524061203003 + }, + { + "auxiliary_loss_clip": 0.01057136, + "auxiliary_loss_mlp": 0.01027042, + "balance_loss_clip": 1.032534, + "balance_loss_mlp": 1.0155381, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.5396892873261607, + "language_loss": 0.73720872, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.7580505, + "num_input_tokens_seen": 300714615, + "step": 13942, + "time_per_iteration": 2.768702983856201 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.03230143, + "balance_loss_mlp": 1.02669716, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.74315687156021, + "language_loss": 0.79680949, + "learning_rate": 2.679816484834554e-07, + "loss": 0.8177188, + "num_input_tokens_seen": 300734860, + "step": 13943, + "time_per_iteration": 2.7922983169555664 + }, + { + "auxiliary_loss_clip": 0.01044084, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.02966404, + "balance_loss_mlp": 1.01894975, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 3.3473357588426444, + "language_loss": 0.85071057, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87145603, + "num_input_tokens_seen": 300752735, + "step": 13944, + "time_per_iteration": 4.097015857696533 + }, + { + "auxiliary_loss_clip": 0.01002703, + "auxiliary_loss_mlp": 0.00746568, + "balance_loss_clip": 1.0038805, + "balance_loss_mlp": 0.99982387, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.6167929874601862, + "language_loss": 0.50244504, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.51993781, + "num_input_tokens_seen": 300820760, + "step": 13945, + "time_per_iteration": 3.1851532459259033 + }, + { + "auxiliary_loss_clip": 0.01051444, + "auxiliary_loss_mlp": 0.01025862, + "balance_loss_clip": 1.03398395, + "balance_loss_mlp": 1.01470935, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 1.7937013827298, + "language_loss": 0.65082455, + "learning_rate": 2.673977187074017e-07, + "loss": 0.6715976, + "num_input_tokens_seen": 300840025, + "step": 13946, + "time_per_iteration": 2.74002742767334 + }, + { + "auxiliary_loss_clip": 0.01045545, + "auxiliary_loss_mlp": 0.01027082, + "balance_loss_clip": 1.02995729, + "balance_loss_mlp": 1.01504183, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.8137284503047721, + "language_loss": 0.67350924, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69423544, + "num_input_tokens_seen": 300860380, + "step": 13947, + "time_per_iteration": 2.748927354812622 + }, + { + "auxiliary_loss_clip": 0.01069429, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.03192151, + "balance_loss_mlp": 1.01934624, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.5118165352002952, + "language_loss": 0.69835961, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.71937168, + "num_input_tokens_seen": 300881895, + "step": 13948, + "time_per_iteration": 2.7249033451080322 + }, + { + "auxiliary_loss_clip": 0.01071648, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.0318799, + "balance_loss_mlp": 1.01897073, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 2.004668173429373, + "language_loss": 0.84976745, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87076801, + "num_input_tokens_seen": 300901575, + "step": 13949, + "time_per_iteration": 4.144365549087524 + }, + { + "auxiliary_loss_clip": 0.01069182, + "auxiliary_loss_mlp": 0.01024569, + "balance_loss_clip": 1.03328753, + "balance_loss_mlp": 1.01421547, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 1.8266130864026144, + "language_loss": 0.70322025, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72415775, + "num_input_tokens_seen": 300919735, + "step": 13950, + "time_per_iteration": 2.6438305377960205 + }, + { + "auxiliary_loss_clip": 0.01079787, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.03392148, + "balance_loss_mlp": 1.01535034, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.7704239440648908, + "language_loss": 0.64669788, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66775852, + "num_input_tokens_seen": 300939150, + "step": 13951, + "time_per_iteration": 2.5090250968933105 + }, + { + "auxiliary_loss_clip": 0.01088946, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.03569639, + "balance_loss_mlp": 1.01660371, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.5113273822970175, + "language_loss": 0.69569683, + "learning_rate": 2.662316332665393e-07, + "loss": 0.71685833, + "num_input_tokens_seen": 300959730, + "step": 13952, + "time_per_iteration": 2.5979273319244385 + }, + { + "auxiliary_loss_clip": 0.01083316, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.03289843, + "balance_loss_mlp": 1.01705146, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 1.837191244666216, + "language_loss": 0.72902858, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.75013804, + "num_input_tokens_seen": 300976120, + "step": 13953, + "time_per_iteration": 2.601301431655884 + }, + { + "auxiliary_loss_clip": 0.01019445, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.02776361, + "balance_loss_mlp": 1.01974046, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 1.8438388166809665, + "language_loss": 0.68399823, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70451319, + "num_input_tokens_seen": 300995080, + "step": 13954, + "time_per_iteration": 2.711216688156128 + }, + { + "auxiliary_loss_clip": 0.01076733, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.03464246, + "balance_loss_mlp": 1.02118874, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 2.7563293524207424, + "language_loss": 0.73333716, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75441575, + "num_input_tokens_seen": 301012920, + "step": 13955, + "time_per_iteration": 2.646911144256592 + }, + { + "auxiliary_loss_clip": 0.01030218, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.02856684, + "balance_loss_mlp": 1.01906002, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.548953166419763, + "language_loss": 0.66599667, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.6866048, + "num_input_tokens_seen": 301028875, + "step": 13956, + "time_per_iteration": 2.7249863147735596 + }, + { + "auxiliary_loss_clip": 0.01086944, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.03351843, + "balance_loss_mlp": 1.02020073, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.676491918628401, + "language_loss": 0.79654145, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81772816, + "num_input_tokens_seen": 301050115, + "step": 13957, + "time_per_iteration": 2.6015522480010986 + }, + { + "auxiliary_loss_clip": 0.00977167, + "auxiliary_loss_mlp": 0.01001736, + "balance_loss_clip": 1.01286352, + "balance_loss_mlp": 1.00052571, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7541560236706135, + "language_loss": 0.53295398, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55274296, + "num_input_tokens_seen": 301114155, + "step": 13958, + "time_per_iteration": 3.3039376735687256 + }, + { + "auxiliary_loss_clip": 0.01084359, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.03228927, + "balance_loss_mlp": 1.02098107, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.89655451429257, + "language_loss": 0.73283738, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75400841, + "num_input_tokens_seen": 301133150, + "step": 13959, + "time_per_iteration": 2.5650880336761475 + }, + { + "auxiliary_loss_clip": 0.01062368, + "auxiliary_loss_mlp": 0.01026737, + "balance_loss_clip": 1.0339663, + "balance_loss_mlp": 1.01643729, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 2.2961585657378834, + "language_loss": 0.55636477, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57725585, + "num_input_tokens_seen": 301153600, + "step": 13960, + "time_per_iteration": 4.280573129653931 + }, + { + "auxiliary_loss_clip": 0.00995614, + "auxiliary_loss_mlp": 0.01002769, + "balance_loss_clip": 1.00480866, + "balance_loss_mlp": 1.00187492, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7676358962702482, + "language_loss": 0.60719085, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62717462, + "num_input_tokens_seen": 301214335, + "step": 13961, + "time_per_iteration": 3.257822275161743 + }, + { + "auxiliary_loss_clip": 0.01035903, + "auxiliary_loss_mlp": 0.01039095, + "balance_loss_clip": 1.02573073, + "balance_loss_mlp": 1.02735329, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.2427345308895297, + "language_loss": 0.68033546, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70108545, + "num_input_tokens_seen": 301228960, + "step": 13962, + "time_per_iteration": 2.669635772705078 + }, + { + "auxiliary_loss_clip": 0.01065851, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.03132772, + "balance_loss_mlp": 1.01879001, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 3.2311163238188443, + "language_loss": 0.7329433, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75389719, + "num_input_tokens_seen": 301245875, + "step": 13963, + "time_per_iteration": 2.622432231903076 + }, + { + "auxiliary_loss_clip": 0.01072181, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.03068519, + "balance_loss_mlp": 1.02210045, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 2.174886907606595, + "language_loss": 0.76382375, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.7848773, + "num_input_tokens_seen": 301265550, + "step": 13964, + "time_per_iteration": 2.683701753616333 + }, + { + "auxiliary_loss_clip": 0.0107906, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.03390467, + "balance_loss_mlp": 1.02169931, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 1.8610919612171517, + "language_loss": 0.77892828, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80005658, + "num_input_tokens_seen": 301282035, + "step": 13965, + "time_per_iteration": 2.5968198776245117 + }, + { + "auxiliary_loss_clip": 0.01080444, + "auxiliary_loss_mlp": 0.01027395, + "balance_loss_clip": 1.03201222, + "balance_loss_mlp": 1.01706517, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 1.7531738966952122, + "language_loss": 0.65424466, + "learning_rate": 2.635199742359684e-07, + "loss": 0.67532301, + "num_input_tokens_seen": 301305210, + "step": 13966, + "time_per_iteration": 2.670165538787842 + }, + { + "auxiliary_loss_clip": 0.010741, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.03194618, + "balance_loss_mlp": 1.02103114, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.8686131050313954, + "language_loss": 0.74328387, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76434195, + "num_input_tokens_seen": 301324885, + "step": 13967, + "time_per_iteration": 2.711028814315796 + }, + { + "auxiliary_loss_clip": 0.01071014, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.03293991, + "balance_loss_mlp": 1.01997256, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 3.576688423104221, + "language_loss": 0.83143544, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85245669, + "num_input_tokens_seen": 301343070, + "step": 13968, + "time_per_iteration": 2.687129020690918 + }, + { + "auxiliary_loss_clip": 0.01070536, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.03433049, + "balance_loss_mlp": 1.02117729, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 1.8025319324397455, + "language_loss": 0.77670562, + "learning_rate": 2.629405828689075e-07, + "loss": 0.79772955, + "num_input_tokens_seen": 301359280, + "step": 13969, + "time_per_iteration": 2.636645555496216 + }, + { + "auxiliary_loss_clip": 0.01078882, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.03300834, + "balance_loss_mlp": 1.01496577, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.07363657744771, + "language_loss": 0.77427018, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79532743, + "num_input_tokens_seen": 301376465, + "step": 13970, + "time_per_iteration": 2.56522274017334 + }, + { + "auxiliary_loss_clip": 0.01075764, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.0325942, + "balance_loss_mlp": 1.02311254, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 2.118395036938226, + "language_loss": 0.72457397, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74567389, + "num_input_tokens_seen": 301396000, + "step": 13971, + "time_per_iteration": 2.618288278579712 + }, + { + "auxiliary_loss_clip": 0.01071177, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.02938128, + "balance_loss_mlp": 1.0170244, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 1.967079263289359, + "language_loss": 0.77235878, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79335821, + "num_input_tokens_seen": 301413160, + "step": 13972, + "time_per_iteration": 2.582383632659912 + }, + { + "auxiliary_loss_clip": 0.01024928, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.02766824, + "balance_loss_mlp": 1.02289081, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.5416963539327382, + "language_loss": 0.68202341, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70262051, + "num_input_tokens_seen": 301433325, + "step": 13973, + "time_per_iteration": 4.197234630584717 + }, + { + "auxiliary_loss_clip": 0.01077493, + "auxiliary_loss_mlp": 0.0102541, + "balance_loss_clip": 1.03342509, + "balance_loss_mlp": 1.01444912, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 1.8337655283029586, + "language_loss": 0.77801526, + "learning_rate": 2.619762480773382e-07, + "loss": 0.79904431, + "num_input_tokens_seen": 301450265, + "step": 13974, + "time_per_iteration": 2.5713138580322266 + }, + { + "auxiliary_loss_clip": 0.01081987, + "auxiliary_loss_mlp": 0.01032952, + "balance_loss_clip": 1.03470051, + "balance_loss_mlp": 1.02206802, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.5436664145352155, + "language_loss": 0.72368622, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74483556, + "num_input_tokens_seen": 301470760, + "step": 13975, + "time_per_iteration": 2.533897638320923 + }, + { + "auxiliary_loss_clip": 0.01074736, + "auxiliary_loss_mlp": 0.01025455, + "balance_loss_clip": 1.03259683, + "balance_loss_mlp": 1.01413584, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 2.534147331159146, + "language_loss": 0.72668368, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74768561, + "num_input_tokens_seen": 301489425, + "step": 13976, + "time_per_iteration": 2.581646203994751 + }, + { + "auxiliary_loss_clip": 0.01094897, + "auxiliary_loss_mlp": 0.00749267, + "balance_loss_clip": 1.03235042, + "balance_loss_mlp": 1.00023198, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.7001488716455395, + "language_loss": 0.71903646, + "learning_rate": 2.61398438016311e-07, + "loss": 0.73747814, + "num_input_tokens_seen": 301508885, + "step": 13977, + "time_per_iteration": 2.531985282897949 + }, + { + "auxiliary_loss_clip": 0.01082994, + "auxiliary_loss_mlp": 0.01026755, + "balance_loss_clip": 1.02989042, + "balance_loss_mlp": 1.01590705, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.468291226883301, + "language_loss": 0.68692577, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70802331, + "num_input_tokens_seen": 301533780, + "step": 13978, + "time_per_iteration": 2.6742067337036133 + }, + { + "auxiliary_loss_clip": 0.01058195, + "auxiliary_loss_mlp": 0.0103006, + "balance_loss_clip": 1.03064477, + "balance_loss_mlp": 1.01924145, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 3.2250668417548853, + "language_loss": 0.78061676, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80149925, + "num_input_tokens_seen": 301551775, + "step": 13979, + "time_per_iteration": 2.637824773788452 + }, + { + "auxiliary_loss_clip": 0.01082705, + "auxiliary_loss_mlp": 0.01027735, + "balance_loss_clip": 1.03538334, + "balance_loss_mlp": 1.01660085, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 2.3191775073001395, + "language_loss": 0.7779001, + "learning_rate": 2.60821221306778e-07, + "loss": 0.7990045, + "num_input_tokens_seen": 301570495, + "step": 13980, + "time_per_iteration": 2.5385069847106934 + }, + { + "auxiliary_loss_clip": 0.01059566, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.0317632, + "balance_loss_mlp": 1.01995873, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.5582755976481755, + "language_loss": 0.86551714, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88641614, + "num_input_tokens_seen": 301591705, + "step": 13981, + "time_per_iteration": 2.667452096939087 + }, + { + "auxiliary_loss_clip": 0.01085171, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.0331943, + "balance_loss_mlp": 1.01987624, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 2.015018976552546, + "language_loss": 0.6774416, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.69860053, + "num_input_tokens_seen": 301611670, + "step": 13982, + "time_per_iteration": 2.626072883605957 + }, + { + "auxiliary_loss_clip": 0.01046253, + "auxiliary_loss_mlp": 0.01034405, + "balance_loss_clip": 1.03201985, + "balance_loss_mlp": 1.02139342, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.7408680952871103, + "language_loss": 0.6824339, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70324045, + "num_input_tokens_seen": 301632540, + "step": 13983, + "time_per_iteration": 2.764596700668335 + }, + { + "auxiliary_loss_clip": 0.01055249, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.02803993, + "balance_loss_mlp": 1.01978803, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.7320389003065744, + "language_loss": 0.79089272, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.81176186, + "num_input_tokens_seen": 301651480, + "step": 13984, + "time_per_iteration": 4.2294602394104 + }, + { + "auxiliary_loss_clip": 0.01082447, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.03047073, + "balance_loss_mlp": 1.02151835, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 1.9553163115283556, + "language_loss": 0.60391021, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62505591, + "num_input_tokens_seen": 301670010, + "step": 13985, + "time_per_iteration": 2.5962984561920166 + }, + { + "auxiliary_loss_clip": 0.01052221, + "auxiliary_loss_mlp": 0.01029422, + "balance_loss_clip": 1.03056836, + "balance_loss_mlp": 1.01811457, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 1.6009431932444438, + "language_loss": 0.8175019, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.83831835, + "num_input_tokens_seen": 301689785, + "step": 13986, + "time_per_iteration": 2.706456422805786 + }, + { + "auxiliary_loss_clip": 0.0107542, + "auxiliary_loss_mlp": 0.00749237, + "balance_loss_clip": 1.03445053, + "balance_loss_mlp": 1.00019491, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.5195384463756467, + "language_loss": 0.65761447, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67586106, + "num_input_tokens_seen": 301712225, + "step": 13987, + "time_per_iteration": 2.6587114334106445 + }, + { + "auxiliary_loss_clip": 0.01096998, + "auxiliary_loss_mlp": 0.00749396, + "balance_loss_clip": 1.03367138, + "balance_loss_mlp": 1.00029206, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 1.8810350661498687, + "language_loss": 0.67325771, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69172156, + "num_input_tokens_seen": 301730955, + "step": 13988, + "time_per_iteration": 2.5165133476257324 + }, + { + "auxiliary_loss_clip": 0.01087444, + "auxiliary_loss_mlp": 0.01037301, + "balance_loss_clip": 1.03680229, + "balance_loss_mlp": 1.02536774, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.686216656038546, + "language_loss": 0.80848897, + "learning_rate": 2.590931332560622e-07, + "loss": 0.82973641, + "num_input_tokens_seen": 301746930, + "step": 13989, + "time_per_iteration": 4.060243129730225 + }, + { + "auxiliary_loss_clip": 0.01081309, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.03048539, + "balance_loss_mlp": 1.01835978, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.8771117762375018, + "language_loss": 0.75088036, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77198803, + "num_input_tokens_seen": 301766945, + "step": 13990, + "time_per_iteration": 2.6395769119262695 + }, + { + "auxiliary_loss_clip": 0.01082098, + "auxiliary_loss_mlp": 0.01029777, + "balance_loss_clip": 1.03149915, + "balance_loss_mlp": 1.01875603, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.5333241838642566, + "language_loss": 0.80866241, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82978117, + "num_input_tokens_seen": 301785460, + "step": 13991, + "time_per_iteration": 2.523496150970459 + }, + { + "auxiliary_loss_clip": 0.01059779, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.03305709, + "balance_loss_mlp": 1.0213604, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 1.9362339711509176, + "language_loss": 0.70327348, + "learning_rate": 2.585182919204105e-07, + "loss": 0.72418725, + "num_input_tokens_seen": 301804180, + "step": 13992, + "time_per_iteration": 2.63285231590271 + }, + { + "auxiliary_loss_clip": 0.01058532, + "auxiliary_loss_mlp": 0.01027003, + "balance_loss_clip": 1.03017521, + "balance_loss_mlp": 1.01579189, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 1.6717676418228984, + "language_loss": 0.7638396, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78469497, + "num_input_tokens_seen": 301823670, + "step": 13993, + "time_per_iteration": 2.6003105640411377 + }, + { + "auxiliary_loss_clip": 0.01088595, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.03118801, + "balance_loss_mlp": 1.02334905, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 1.860034511828039, + "language_loss": 0.74198771, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76323307, + "num_input_tokens_seen": 301845890, + "step": 13994, + "time_per_iteration": 2.7524776458740234 + }, + { + "auxiliary_loss_clip": 0.01083496, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.03271151, + "balance_loss_mlp": 1.01928854, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 2.0905271475440164, + "language_loss": 0.59523594, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61636698, + "num_input_tokens_seen": 301863985, + "step": 13995, + "time_per_iteration": 2.6044960021972656 + }, + { + "auxiliary_loss_clip": 0.01083277, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.03189147, + "balance_loss_mlp": 1.0186913, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.600618368900707, + "language_loss": 0.71903986, + "learning_rate": 2.577527613603163e-07, + "loss": 0.74017531, + "num_input_tokens_seen": 301882765, + "step": 13996, + "time_per_iteration": 2.5380330085754395 + }, + { + "auxiliary_loss_clip": 0.01068078, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_clip": 1.03010702, + "balance_loss_mlp": 1.01845109, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.7771992869558504, + "language_loss": 0.64340508, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66437495, + "num_input_tokens_seen": 301902720, + "step": 13997, + "time_per_iteration": 2.598738431930542 + }, + { + "auxiliary_loss_clip": 0.01074928, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.03447652, + "balance_loss_mlp": 1.02054119, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 2.008002250491414, + "language_loss": 0.81905091, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84012979, + "num_input_tokens_seen": 301921245, + "step": 13998, + "time_per_iteration": 2.580994129180908 + }, + { + "auxiliary_loss_clip": 0.01088945, + "auxiliary_loss_mlp": 0.00749489, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.00024319, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.6525586997944597, + "language_loss": 0.80156732, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.81995165, + "num_input_tokens_seen": 301942320, + "step": 13999, + "time_per_iteration": 4.06113076210022 + }, + { + "auxiliary_loss_clip": 0.01084037, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.0318985, + "balance_loss_mlp": 1.02110982, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 6.331293258425148, + "language_loss": 0.66926193, + "learning_rate": 2.569882878592096e-07, + "loss": 0.69043636, + "num_input_tokens_seen": 301963110, + "step": 14000, + "time_per_iteration": 2.5673623085021973 + }, + { + "auxiliary_loss_clip": 0.0109045, + "auxiliary_loss_mlp": 0.01025921, + "balance_loss_clip": 1.03420305, + "balance_loss_mlp": 1.01462579, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.5227366894769692, + "language_loss": 0.79435217, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81551588, + "num_input_tokens_seen": 301984915, + "step": 14001, + "time_per_iteration": 2.5976288318634033 + }, + { + "auxiliary_loss_clip": 0.010318, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.03064227, + "balance_loss_mlp": 1.01721978, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.6089146158717262, + "language_loss": 0.78458363, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80517882, + "num_input_tokens_seen": 302004095, + "step": 14002, + "time_per_iteration": 2.678466796875 + }, + { + "auxiliary_loss_clip": 0.01051791, + "auxiliary_loss_mlp": 0.00749113, + "balance_loss_clip": 1.03193891, + "balance_loss_mlp": 1.00014079, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.3900561538853902, + "language_loss": 0.78062379, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.79863286, + "num_input_tokens_seen": 302027250, + "step": 14003, + "time_per_iteration": 2.7294962406158447 + }, + { + "auxiliary_loss_clip": 0.01070029, + "auxiliary_loss_mlp": 0.01024902, + "balance_loss_clip": 1.0337584, + "balance_loss_mlp": 1.01395857, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.6561779442792242, + "language_loss": 0.65550816, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67645741, + "num_input_tokens_seen": 302046950, + "step": 14004, + "time_per_iteration": 2.6048760414123535 + }, + { + "auxiliary_loss_clip": 0.01084068, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.03223681, + "balance_loss_mlp": 1.01939917, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 1.9517485297519688, + "language_loss": 0.75922471, + "learning_rate": 2.560341831785724e-07, + "loss": 0.78037769, + "num_input_tokens_seen": 302065470, + "step": 14005, + "time_per_iteration": 2.599905490875244 + }, + { + "auxiliary_loss_clip": 0.01057469, + "auxiliary_loss_mlp": 0.00749384, + "balance_loss_clip": 1.02938318, + "balance_loss_mlp": 1.00026572, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.7790520792126812, + "language_loss": 0.77335489, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79142344, + "num_input_tokens_seen": 302083190, + "step": 14006, + "time_per_iteration": 2.675401449203491 + }, + { + "auxiliary_loss_clip": 0.01086249, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.03344464, + "balance_loss_mlp": 1.02138567, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.719387206045169, + "language_loss": 0.77022672, + "learning_rate": 2.556530041751932e-07, + "loss": 0.7914142, + "num_input_tokens_seen": 302098820, + "step": 14007, + "time_per_iteration": 2.586643695831299 + }, + { + "auxiliary_loss_clip": 0.0106808, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.0320425, + "balance_loss_mlp": 1.01648736, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.7654488939933741, + "language_loss": 0.6591754, + "learning_rate": 2.554625138886102e-07, + "loss": 0.68013674, + "num_input_tokens_seen": 302117075, + "step": 14008, + "time_per_iteration": 2.6997222900390625 + }, + { + "auxiliary_loss_clip": 0.01014062, + "auxiliary_loss_mlp": 0.01006261, + "balance_loss_clip": 1.00415611, + "balance_loss_mlp": 1.00535476, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7131536071119087, + "language_loss": 0.56991017, + "learning_rate": 2.552720897550631e-07, + "loss": 0.5901134, + "num_input_tokens_seen": 302179735, + "step": 14009, + "time_per_iteration": 3.192603349685669 + }, + { + "auxiliary_loss_clip": 0.01024376, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.02903032, + "balance_loss_mlp": 1.01902604, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.6457508520578155, + "language_loss": 0.78075373, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80129248, + "num_input_tokens_seen": 302202055, + "step": 14010, + "time_per_iteration": 2.759939432144165 + }, + { + "auxiliary_loss_clip": 0.01101993, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.0362072, + "balance_loss_mlp": 1.02501488, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.8413911484694316, + "language_loss": 0.71994531, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74133074, + "num_input_tokens_seen": 302221360, + "step": 14011, + "time_per_iteration": 2.6484909057617188 + }, + { + "auxiliary_loss_clip": 0.01081176, + "auxiliary_loss_mlp": 0.01036331, + "balance_loss_clip": 1.03291106, + "balance_loss_mlp": 1.02510107, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.9460523655761102, + "language_loss": 0.83843899, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.85961407, + "num_input_tokens_seen": 302240715, + "step": 14012, + "time_per_iteration": 2.6578376293182373 + }, + { + "auxiliary_loss_clip": 0.01089426, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.03141665, + "balance_loss_mlp": 1.0194155, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 2.3169023091949086, + "language_loss": 0.68162012, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70280033, + "num_input_tokens_seen": 302260950, + "step": 14013, + "time_per_iteration": 4.10816502571106 + }, + { + "auxiliary_loss_clip": 0.01102449, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.03516901, + "balance_loss_mlp": 1.01756716, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 3.263606005084964, + "language_loss": 0.7844336, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.80574989, + "num_input_tokens_seen": 302277500, + "step": 14014, + "time_per_iteration": 2.5859506130218506 + }, + { + "auxiliary_loss_clip": 0.01064039, + "auxiliary_loss_mlp": 0.00749213, + "balance_loss_clip": 1.03054118, + "balance_loss_mlp": 1.00022435, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.6636375467237248, + "language_loss": 0.6749661, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69309866, + "num_input_tokens_seen": 302297930, + "step": 14015, + "time_per_iteration": 2.674924373626709 + }, + { + "auxiliary_loss_clip": 0.01097651, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.03488588, + "balance_loss_mlp": 1.01777601, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 1.9804431823906794, + "language_loss": 0.75782597, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.77909696, + "num_input_tokens_seen": 302315735, + "step": 14016, + "time_per_iteration": 2.5235483646392822 + }, + { + "auxiliary_loss_clip": 0.01071425, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.03201389, + "balance_loss_mlp": 1.01977479, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 4.608622088911253, + "language_loss": 0.79550576, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81652874, + "num_input_tokens_seen": 302332790, + "step": 14017, + "time_per_iteration": 2.671050786972046 + }, + { + "auxiliary_loss_clip": 0.01073248, + "auxiliary_loss_mlp": 0.01028577, + "balance_loss_clip": 1.0325048, + "balance_loss_mlp": 1.0178721, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 2.1436300415839753, + "language_loss": 0.62922728, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.65024555, + "num_input_tokens_seen": 302346490, + "step": 14018, + "time_per_iteration": 2.592108726501465 + }, + { + "auxiliary_loss_clip": 0.01085661, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.03438878, + "balance_loss_mlp": 1.01778746, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.9299637817581232, + "language_loss": 0.79874253, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81988245, + "num_input_tokens_seen": 302363235, + "step": 14019, + "time_per_iteration": 2.5850703716278076 + }, + { + "auxiliary_loss_clip": 0.01048478, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.02806091, + "balance_loss_mlp": 1.02253819, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.926826433040691, + "language_loss": 0.78490746, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80575156, + "num_input_tokens_seen": 302383270, + "step": 14020, + "time_per_iteration": 2.6595096588134766 + }, + { + "auxiliary_loss_clip": 0.01083347, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.03318739, + "balance_loss_mlp": 1.01843667, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.8581278465811815, + "language_loss": 0.71187794, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73300427, + "num_input_tokens_seen": 302401355, + "step": 14021, + "time_per_iteration": 2.6198365688323975 + }, + { + "auxiliary_loss_clip": 0.01068015, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.03395152, + "balance_loss_mlp": 1.02424431, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.5041447277751527, + "language_loss": 0.69630659, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71734524, + "num_input_tokens_seen": 302419515, + "step": 14022, + "time_per_iteration": 2.6221258640289307 + }, + { + "auxiliary_loss_clip": 0.01039654, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.0324657, + "balance_loss_mlp": 1.02387154, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 1.7040359226867747, + "language_loss": 0.72066671, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74142122, + "num_input_tokens_seen": 302438280, + "step": 14023, + "time_per_iteration": 2.8296592235565186 + }, + { + "auxiliary_loss_clip": 0.01087525, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.03481197, + "balance_loss_mlp": 1.02338672, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.7245465940518658, + "language_loss": 0.66774678, + "learning_rate": 2.524236710204559e-07, + "loss": 0.68897057, + "num_input_tokens_seen": 302460860, + "step": 14024, + "time_per_iteration": 4.1217498779296875 + }, + { + "auxiliary_loss_clip": 0.01082123, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.03273916, + "balance_loss_mlp": 1.01834512, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.824631097826297, + "language_loss": 0.80859244, + "learning_rate": 2.522343063158261e-07, + "loss": 0.82970929, + "num_input_tokens_seen": 302476980, + "step": 14025, + "time_per_iteration": 2.5417182445526123 + }, + { + "auxiliary_loss_clip": 0.0108251, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.0330739, + "balance_loss_mlp": 1.01872206, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.480110433970341, + "language_loss": 0.77885741, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.7999652, + "num_input_tokens_seen": 302496380, + "step": 14026, + "time_per_iteration": 2.5965986251831055 + }, + { + "auxiliary_loss_clip": 0.01068285, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.0310123, + "balance_loss_mlp": 1.02117026, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.5516367299455636, + "language_loss": 0.82638508, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84739077, + "num_input_tokens_seen": 302516845, + "step": 14027, + "time_per_iteration": 2.671769857406616 + }, + { + "auxiliary_loss_clip": 0.01071379, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.03137207, + "balance_loss_mlp": 1.01862681, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.6495688547694518, + "language_loss": 0.56703138, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58803892, + "num_input_tokens_seen": 302538865, + "step": 14028, + "time_per_iteration": 2.7549071311950684 + }, + { + "auxiliary_loss_clip": 0.01069692, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.03182733, + "balance_loss_mlp": 1.01666546, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 2.0003016748624725, + "language_loss": 0.63777947, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65874863, + "num_input_tokens_seen": 302557970, + "step": 14029, + "time_per_iteration": 4.143076181411743 + }, + { + "auxiliary_loss_clip": 0.01093727, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.03370512, + "balance_loss_mlp": 1.01895261, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5770513473335823, + "language_loss": 0.74944353, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77066541, + "num_input_tokens_seen": 302578915, + "step": 14030, + "time_per_iteration": 2.6484949588775635 + }, + { + "auxiliary_loss_clip": 0.01076966, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03451061, + "balance_loss_mlp": 1.02032089, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.9059596099531986, + "language_loss": 0.83145833, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85253936, + "num_input_tokens_seen": 302596300, + "step": 14031, + "time_per_iteration": 2.5519802570343018 + }, + { + "auxiliary_loss_clip": 0.01071322, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.03027344, + "balance_loss_mlp": 1.01953709, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.9084870442648811, + "language_loss": 0.80048895, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82150167, + "num_input_tokens_seen": 302614975, + "step": 14032, + "time_per_iteration": 2.5486514568328857 + }, + { + "auxiliary_loss_clip": 0.01059603, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.02919507, + "balance_loss_mlp": 1.01979423, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.511951336946478, + "language_loss": 0.75462794, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77554703, + "num_input_tokens_seen": 302636415, + "step": 14033, + "time_per_iteration": 2.6389825344085693 + }, + { + "auxiliary_loss_clip": 0.01057601, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.03129256, + "balance_loss_mlp": 1.02246344, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.8854982276304515, + "language_loss": 0.83463752, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85554159, + "num_input_tokens_seen": 302653605, + "step": 14034, + "time_per_iteration": 2.587935209274292 + }, + { + "auxiliary_loss_clip": 0.01059445, + "auxiliary_loss_mlp": 0.0102759, + "balance_loss_clip": 1.03079665, + "balance_loss_mlp": 1.01510894, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 3.8603623592803245, + "language_loss": 0.78437531, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80524558, + "num_input_tokens_seen": 302673965, + "step": 14035, + "time_per_iteration": 2.696776866912842 + }, + { + "auxiliary_loss_clip": 0.01079015, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.03123772, + "balance_loss_mlp": 1.02012753, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 2.928818359255414, + "language_loss": 0.72040069, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74150115, + "num_input_tokens_seen": 302695560, + "step": 14036, + "time_per_iteration": 2.6070003509521484 + }, + { + "auxiliary_loss_clip": 0.01091362, + "auxiliary_loss_mlp": 0.01027045, + "balance_loss_clip": 1.03293645, + "balance_loss_mlp": 1.01821768, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.7183025631598297, + "language_loss": 0.69857621, + "learning_rate": 2.49967101396557e-07, + "loss": 0.7197603, + "num_input_tokens_seen": 302713480, + "step": 14037, + "time_per_iteration": 2.575766086578369 + }, + { + "auxiliary_loss_clip": 0.01095135, + "auxiliary_loss_mlp": 0.0102316, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 1.01277649, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.7632988792786404, + "language_loss": 0.68534511, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.70652807, + "num_input_tokens_seen": 302736860, + "step": 14038, + "time_per_iteration": 2.574127674102783 + }, + { + "auxiliary_loss_clip": 0.01034105, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.02765262, + "balance_loss_mlp": 1.0249548, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.610126586270983, + "language_loss": 0.768785, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78948873, + "num_input_tokens_seen": 302757745, + "step": 14039, + "time_per_iteration": 2.6411452293395996 + }, + { + "auxiliary_loss_clip": 0.01102872, + "auxiliary_loss_mlp": 0.01026989, + "balance_loss_clip": 1.0360142, + "balance_loss_mlp": 1.0159682, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 1.8780398294960463, + "language_loss": 0.79571307, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81701171, + "num_input_tokens_seen": 302774885, + "step": 14040, + "time_per_iteration": 3.9833920001983643 + }, + { + "auxiliary_loss_clip": 0.01063956, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.03297806, + "balance_loss_mlp": 1.01984072, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 2.3424267569295654, + "language_loss": 0.69066978, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71162176, + "num_input_tokens_seen": 302791035, + "step": 14041, + "time_per_iteration": 2.591719150543213 + }, + { + "auxiliary_loss_clip": 0.01073567, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.03129411, + "balance_loss_mlp": 1.02025676, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.975091835900667, + "language_loss": 0.68813676, + "learning_rate": 2.490252523307341e-07, + "loss": 0.70918322, + "num_input_tokens_seen": 302808650, + "step": 14042, + "time_per_iteration": 2.5665881633758545 + }, + { + "auxiliary_loss_clip": 0.01072174, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.0318166, + "balance_loss_mlp": 1.02155709, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 2.3691017268392174, + "language_loss": 0.74766922, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.7687127, + "num_input_tokens_seen": 302824605, + "step": 14043, + "time_per_iteration": 2.5490317344665527 + }, + { + "auxiliary_loss_clip": 0.01096465, + "auxiliary_loss_mlp": 0.00749364, + "balance_loss_clip": 1.03418314, + "balance_loss_mlp": 1.00024843, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 3.2770882920625475, + "language_loss": 0.71845287, + "learning_rate": 2.486489774343865e-07, + "loss": 0.73691112, + "num_input_tokens_seen": 302840170, + "step": 14044, + "time_per_iteration": 2.4730618000030518 + }, + { + "auxiliary_loss_clip": 0.01073137, + "auxiliary_loss_mlp": 0.01027099, + "balance_loss_clip": 1.03155494, + "balance_loss_mlp": 1.01615572, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.5800047880404815, + "language_loss": 0.7491551, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77015746, + "num_input_tokens_seen": 302858320, + "step": 14045, + "time_per_iteration": 2.5784666538238525 + }, + { + "auxiliary_loss_clip": 0.01072747, + "auxiliary_loss_mlp": 0.00749361, + "balance_loss_clip": 1.02995181, + "balance_loss_mlp": 1.00031304, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 1.6014288848996239, + "language_loss": 0.78406715, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80228829, + "num_input_tokens_seen": 302875255, + "step": 14046, + "time_per_iteration": 2.544243574142456 + }, + { + "auxiliary_loss_clip": 0.01068209, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.03121579, + "balance_loss_mlp": 1.01909876, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 2.4679703274263, + "language_loss": 0.77843755, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.79944313, + "num_input_tokens_seen": 302894690, + "step": 14047, + "time_per_iteration": 2.5457205772399902 + }, + { + "auxiliary_loss_clip": 0.01075493, + "auxiliary_loss_mlp": 0.01026514, + "balance_loss_clip": 1.03555107, + "balance_loss_mlp": 1.01570153, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 2.1685385162224313, + "language_loss": 0.72009885, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74111891, + "num_input_tokens_seen": 302912405, + "step": 14048, + "time_per_iteration": 2.650843381881714 + }, + { + "auxiliary_loss_clip": 0.01016339, + "auxiliary_loss_mlp": 0.01030253, + "balance_loss_clip": 1.03106797, + "balance_loss_mlp": 1.01918435, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 2.61033390057666, + "language_loss": 0.73641574, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75688165, + "num_input_tokens_seen": 302932525, + "step": 14049, + "time_per_iteration": 2.866472005844116 + }, + { + "auxiliary_loss_clip": 0.01014236, + "auxiliary_loss_mlp": 0.00746572, + "balance_loss_clip": 1.00441003, + "balance_loss_mlp": 0.99982399, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.8027528355486206, + "language_loss": 0.60639966, + "learning_rate": 2.475217468471729e-07, + "loss": 0.6240077, + "num_input_tokens_seen": 302991285, + "step": 14050, + "time_per_iteration": 3.2247228622436523 + }, + { + "auxiliary_loss_clip": 0.01068796, + "auxiliary_loss_mlp": 0.00749613, + "balance_loss_clip": 1.03026998, + "balance_loss_mlp": 1.00022793, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.179621945890089, + "language_loss": 0.72011256, + "learning_rate": 2.473341076306303e-07, + "loss": 0.73829663, + "num_input_tokens_seen": 303009515, + "step": 14051, + "time_per_iteration": 2.6024129390716553 + }, + { + "auxiliary_loss_clip": 0.01082874, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.03176665, + "balance_loss_mlp": 1.01451707, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 1.7379532792716783, + "language_loss": 0.7485013, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76958382, + "num_input_tokens_seen": 303026905, + "step": 14052, + "time_per_iteration": 4.116090536117554 + }, + { + "auxiliary_loss_clip": 0.01065233, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.03296685, + "balance_loss_mlp": 1.01802468, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 1.8821017341773703, + "language_loss": 0.73670995, + "learning_rate": 2.469590285884575e-07, + "loss": 0.75763631, + "num_input_tokens_seen": 303045245, + "step": 14053, + "time_per_iteration": 2.5485029220581055 + }, + { + "auxiliary_loss_clip": 0.010769, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.03179908, + "balance_loss_mlp": 1.01445389, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 1.888243645765689, + "language_loss": 0.74110126, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76212341, + "num_input_tokens_seen": 303065205, + "step": 14054, + "time_per_iteration": 2.5199849605560303 + }, + { + "auxiliary_loss_clip": 0.01089378, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.03417039, + "balance_loss_mlp": 1.01739025, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.3585584283400705, + "language_loss": 0.78049946, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.8016758, + "num_input_tokens_seen": 303088250, + "step": 14055, + "time_per_iteration": 2.6263723373413086 + }, + { + "auxiliary_loss_clip": 0.0108371, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.03271484, + "balance_loss_mlp": 1.01898372, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.7592180151030072, + "language_loss": 0.7260927, + "learning_rate": 2.463969086091302e-07, + "loss": 0.74722457, + "num_input_tokens_seen": 303109280, + "step": 14056, + "time_per_iteration": 2.563096046447754 + }, + { + "auxiliary_loss_clip": 0.01087314, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.0335927, + "balance_loss_mlp": 1.02248836, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.0959799834118216, + "language_loss": 0.67448682, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69570279, + "num_input_tokens_seen": 303126075, + "step": 14057, + "time_per_iteration": 2.526634454727173 + }, + { + "auxiliary_loss_clip": 0.01045358, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.02945757, + "balance_loss_mlp": 1.01760125, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.7177486545723724, + "language_loss": 0.77657855, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79731858, + "num_input_tokens_seen": 303146920, + "step": 14058, + "time_per_iteration": 2.658313274383545 + }, + { + "auxiliary_loss_clip": 0.01098066, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.03402042, + "balance_loss_mlp": 1.01892149, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.532546511179215, + "language_loss": 0.69874597, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72002256, + "num_input_tokens_seen": 303167885, + "step": 14059, + "time_per_iteration": 2.5307493209838867 + }, + { + "auxiliary_loss_clip": 0.01101049, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.03475451, + "balance_loss_mlp": 1.01949453, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 2.411918083521014, + "language_loss": 0.5791381, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.60046327, + "num_input_tokens_seen": 303185000, + "step": 14060, + "time_per_iteration": 2.468085765838623 + }, + { + "auxiliary_loss_clip": 0.01074448, + "auxiliary_loss_mlp": 0.01032884, + "balance_loss_clip": 1.03011584, + "balance_loss_mlp": 1.02025986, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 6.000651059996098, + "language_loss": 0.757599, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77867234, + "num_input_tokens_seen": 303205210, + "step": 14061, + "time_per_iteration": 2.603209972381592 + }, + { + "auxiliary_loss_clip": 0.01068072, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.03121865, + "balance_loss_mlp": 1.01658511, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.3872113870525222, + "language_loss": 0.70647013, + "learning_rate": 2.452744642558013e-07, + "loss": 0.7274332, + "num_input_tokens_seen": 303224655, + "step": 14062, + "time_per_iteration": 2.669285297393799 + }, + { + "auxiliary_loss_clip": 0.00988567, + "auxiliary_loss_mlp": 0.01002297, + "balance_loss_clip": 1.01391733, + "balance_loss_mlp": 1.00116408, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6350647908297434, + "language_loss": 0.5267154, + "learning_rate": 2.450876230433432e-07, + "loss": 0.546624, + "num_input_tokens_seen": 303289645, + "step": 14063, + "time_per_iteration": 3.292854070663452 + }, + { + "auxiliary_loss_clip": 0.01053, + "auxiliary_loss_mlp": 0.01024054, + "balance_loss_clip": 1.0323261, + "balance_loss_mlp": 1.01460671, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.7836881630862351, + "language_loss": 0.81802362, + "learning_rate": 2.449008483773378e-07, + "loss": 0.83879417, + "num_input_tokens_seen": 303308350, + "step": 14064, + "time_per_iteration": 4.382059812545776 + }, + { + "auxiliary_loss_clip": 0.01089409, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.03620124, + "balance_loss_mlp": 1.01962781, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 2.047333938433595, + "language_loss": 0.72775, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74895805, + "num_input_tokens_seen": 303325230, + "step": 14065, + "time_per_iteration": 2.5520145893096924 + }, + { + "auxiliary_loss_clip": 0.01060339, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.03097069, + "balance_loss_mlp": 1.01895285, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.5284958696482407, + "language_loss": 0.77373755, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79463512, + "num_input_tokens_seen": 303345810, + "step": 14066, + "time_per_iteration": 2.691305160522461 + }, + { + "auxiliary_loss_clip": 0.01060181, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.03462148, + "balance_loss_mlp": 1.0179832, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.517534950887861, + "language_loss": 0.69986355, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72075623, + "num_input_tokens_seen": 303365140, + "step": 14067, + "time_per_iteration": 2.587219715118408 + }, + { + "auxiliary_loss_clip": 0.01057498, + "auxiliary_loss_mlp": 0.0102658, + "balance_loss_clip": 1.02942991, + "balance_loss_mlp": 1.01586938, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 2.1055147282619595, + "language_loss": 0.7112143, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73205507, + "num_input_tokens_seen": 303386150, + "step": 14068, + "time_per_iteration": 2.778855085372925 + }, + { + "auxiliary_loss_clip": 0.009843, + "auxiliary_loss_mlp": 0.01004112, + "balance_loss_clip": 1.0044775, + "balance_loss_mlp": 1.00314641, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6937557829233225, + "language_loss": 0.60494423, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62482834, + "num_input_tokens_seen": 303453770, + "step": 14069, + "time_per_iteration": 4.708638668060303 + }, + { + "auxiliary_loss_clip": 0.01071831, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.03343368, + "balance_loss_mlp": 1.01585436, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.4206426309820177, + "language_loss": 0.74613154, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76711291, + "num_input_tokens_seen": 303474520, + "step": 14070, + "time_per_iteration": 2.5805678367614746 + }, + { + "auxiliary_loss_clip": 0.01049095, + "auxiliary_loss_mlp": 0.01029712, + "balance_loss_clip": 1.03122866, + "balance_loss_mlp": 1.01860213, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.6260705761197793, + "language_loss": 0.67118275, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69197083, + "num_input_tokens_seen": 303497345, + "step": 14071, + "time_per_iteration": 2.788698196411133 + }, + { + "auxiliary_loss_clip": 0.0101267, + "auxiliary_loss_mlp": 0.00746597, + "balance_loss_clip": 1.00336099, + "balance_loss_mlp": 0.99973631, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7354774132321544, + "language_loss": 0.6102668, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.62785947, + "num_input_tokens_seen": 303554890, + "step": 14072, + "time_per_iteration": 2.9409708976745605 + }, + { + "auxiliary_loss_clip": 0.0106034, + "auxiliary_loss_mlp": 0.01032247, + "balance_loss_clip": 1.03422904, + "balance_loss_mlp": 1.01944447, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 1.809674890913138, + "language_loss": 0.72735816, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74828398, + "num_input_tokens_seen": 303574380, + "step": 14073, + "time_per_iteration": 2.656187057495117 + }, + { + "auxiliary_loss_clip": 0.01081583, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.03563523, + "balance_loss_mlp": 1.02012873, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 1.675285619188533, + "language_loss": 0.77874523, + "learning_rate": 2.430367633291155e-07, + "loss": 0.79988325, + "num_input_tokens_seen": 303594910, + "step": 14074, + "time_per_iteration": 2.7120964527130127 + }, + { + "auxiliary_loss_clip": 0.01086823, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.03407562, + "balance_loss_mlp": 1.01791477, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 1.9344951540294077, + "language_loss": 0.75459105, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77575016, + "num_input_tokens_seen": 303613520, + "step": 14075, + "time_per_iteration": 2.564074754714966 + }, + { + "auxiliary_loss_clip": 0.01071775, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.03104448, + "balance_loss_mlp": 1.01635098, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 2.068754216314089, + "language_loss": 0.73475772, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75575066, + "num_input_tokens_seen": 303631225, + "step": 14076, + "time_per_iteration": 2.577949285507202 + }, + { + "auxiliary_loss_clip": 0.01080247, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.03403306, + "balance_loss_mlp": 1.02381921, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 2.0504438789661985, + "language_loss": 0.77794933, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79910219, + "num_input_tokens_seen": 303649175, + "step": 14077, + "time_per_iteration": 2.580413341522217 + }, + { + "auxiliary_loss_clip": 0.01063512, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.03280485, + "balance_loss_mlp": 1.02070427, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 2.017880938055654, + "language_loss": 0.75539345, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77634645, + "num_input_tokens_seen": 303665915, + "step": 14078, + "time_per_iteration": 2.596143960952759 + }, + { + "auxiliary_loss_clip": 0.0105517, + "auxiliary_loss_mlp": 0.01023194, + "balance_loss_clip": 1.0324347, + "balance_loss_mlp": 1.01183963, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.47367910738808, + "language_loss": 0.85303712, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87382078, + "num_input_tokens_seen": 303679985, + "step": 14079, + "time_per_iteration": 4.186610221862793 + }, + { + "auxiliary_loss_clip": 0.01071402, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.03380382, + "balance_loss_mlp": 1.02410114, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.202518925055592, + "language_loss": 0.59120286, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61227655, + "num_input_tokens_seen": 303698470, + "step": 14080, + "time_per_iteration": 2.6283211708068848 + }, + { + "auxiliary_loss_clip": 0.01072507, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.03315389, + "balance_loss_mlp": 1.01987052, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.0080178918654847, + "language_loss": 0.66424102, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68527901, + "num_input_tokens_seen": 303716415, + "step": 14081, + "time_per_iteration": 2.5732972621917725 + }, + { + "auxiliary_loss_clip": 0.01080994, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.03158069, + "balance_loss_mlp": 1.02306271, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 1.8747226808768782, + "language_loss": 0.73420393, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75535834, + "num_input_tokens_seen": 303734490, + "step": 14082, + "time_per_iteration": 2.5732884407043457 + }, + { + "auxiliary_loss_clip": 0.010392, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.02715111, + "balance_loss_mlp": 1.02289319, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 1.9955291955182133, + "language_loss": 0.76003569, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78077853, + "num_input_tokens_seen": 303752310, + "step": 14083, + "time_per_iteration": 2.66549015045166 + }, + { + "auxiliary_loss_clip": 0.01046939, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.0293982, + "balance_loss_mlp": 1.01970589, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 2.3780221586381303, + "language_loss": 0.66080171, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68159807, + "num_input_tokens_seen": 303776065, + "step": 14084, + "time_per_iteration": 2.7648844718933105 + }, + { + "auxiliary_loss_clip": 0.01055877, + "auxiliary_loss_mlp": 0.01027663, + "balance_loss_clip": 1.03437829, + "balance_loss_mlp": 1.01703548, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 2.5510351340630883, + "language_loss": 0.70060337, + "learning_rate": 2.409939651426938e-07, + "loss": 0.72143877, + "num_input_tokens_seen": 303793500, + "step": 14085, + "time_per_iteration": 2.6607117652893066 + }, + { + "auxiliary_loss_clip": 0.01054455, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.03090882, + "balance_loss_mlp": 1.01759088, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.475922342165898, + "language_loss": 0.70851946, + "learning_rate": 2.408086562860634e-07, + "loss": 0.72934437, + "num_input_tokens_seen": 303814835, + "step": 14086, + "time_per_iteration": 2.6948156356811523 + }, + { + "auxiliary_loss_clip": 0.01076837, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.03110182, + "balance_loss_mlp": 1.01955914, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.7932143895122283, + "language_loss": 0.7505101, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.77158403, + "num_input_tokens_seen": 303834505, + "step": 14087, + "time_per_iteration": 2.5273244380950928 + }, + { + "auxiliary_loss_clip": 0.01067798, + "auxiliary_loss_mlp": 0.01021794, + "balance_loss_clip": 1.0345912, + "balance_loss_mlp": 1.01082671, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.4474554697431101, + "language_loss": 0.74156821, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76246417, + "num_input_tokens_seen": 303855050, + "step": 14088, + "time_per_iteration": 2.5811872482299805 + }, + { + "auxiliary_loss_clip": 0.010885, + "auxiliary_loss_mlp": 0.0103253, + "balance_loss_clip": 1.03405666, + "balance_loss_mlp": 1.02164042, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 1.9504976260341655, + "language_loss": 0.71957898, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74078929, + "num_input_tokens_seen": 303875635, + "step": 14089, + "time_per_iteration": 2.552358627319336 + }, + { + "auxiliary_loss_clip": 0.0109651, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.03534484, + "balance_loss_mlp": 1.01538229, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.437475671927781, + "language_loss": 0.78984708, + "learning_rate": 2.400680880168928e-07, + "loss": 0.8110708, + "num_input_tokens_seen": 303896750, + "step": 14090, + "time_per_iteration": 2.4843928813934326 + }, + { + "auxiliary_loss_clip": 0.01039263, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.0286057, + "balance_loss_mlp": 1.02646661, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 4.349928578695947, + "language_loss": 0.7706089, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.79139423, + "num_input_tokens_seen": 303915435, + "step": 14091, + "time_per_iteration": 2.7173399925231934 + }, + { + "auxiliary_loss_clip": 0.01022362, + "auxiliary_loss_mlp": 0.01003675, + "balance_loss_clip": 1.0026499, + "balance_loss_mlp": 1.00280499, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8203783022440717, + "language_loss": 0.59369713, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61395752, + "num_input_tokens_seen": 303977245, + "step": 14092, + "time_per_iteration": 4.588685750961304 + }, + { + "auxiliary_loss_clip": 0.01068856, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.02972162, + "balance_loss_mlp": 1.02098346, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 21.947411688682067, + "language_loss": 0.70238441, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72339785, + "num_input_tokens_seen": 303996055, + "step": 14093, + "time_per_iteration": 2.547968626022339 + }, + { + "auxiliary_loss_clip": 0.01094104, + "auxiliary_loss_mlp": 0.01025489, + "balance_loss_clip": 1.03282154, + "balance_loss_mlp": 1.01517737, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 1.8788180944282515, + "language_loss": 0.83736974, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85856569, + "num_input_tokens_seen": 304012205, + "step": 14094, + "time_per_iteration": 2.488361358642578 + }, + { + "auxiliary_loss_clip": 0.0107618, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.03369486, + "balance_loss_mlp": 1.02049315, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.6000390107395894, + "language_loss": 0.71199501, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73306286, + "num_input_tokens_seen": 304033475, + "step": 14095, + "time_per_iteration": 2.577305316925049 + }, + { + "auxiliary_loss_clip": 0.01082439, + "auxiliary_loss_mlp": 0.00749339, + "balance_loss_clip": 1.03225899, + "balance_loss_mlp": 1.0002687, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 2.78192716730277, + "language_loss": 0.80850512, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82682288, + "num_input_tokens_seen": 304051845, + "step": 14096, + "time_per_iteration": 2.610988140106201 + }, + { + "auxiliary_loss_clip": 0.01086815, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.03230286, + "balance_loss_mlp": 1.01666188, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.9721492404935788, + "language_loss": 0.77161324, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79276681, + "num_input_tokens_seen": 304069965, + "step": 14097, + "time_per_iteration": 2.5961365699768066 + }, + { + "auxiliary_loss_clip": 0.01064603, + "auxiliary_loss_mlp": 0.01024165, + "balance_loss_clip": 1.03289688, + "balance_loss_mlp": 1.0134418, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.7946973099249763, + "language_loss": 0.80455303, + "learning_rate": 2.385901552932048e-07, + "loss": 0.8254407, + "num_input_tokens_seen": 304086805, + "step": 14098, + "time_per_iteration": 2.657440662384033 + }, + { + "auxiliary_loss_clip": 0.0107824, + "auxiliary_loss_mlp": 0.00749423, + "balance_loss_clip": 1.03225696, + "balance_loss_mlp": 1.0002501, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 2.002008217040402, + "language_loss": 0.71944016, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73771679, + "num_input_tokens_seen": 304105865, + "step": 14099, + "time_per_iteration": 2.542980432510376 + }, + { + "auxiliary_loss_clip": 0.01082078, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.03058052, + "balance_loss_mlp": 1.01757181, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 1.6874571040853221, + "language_loss": 0.63114011, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.65225708, + "num_input_tokens_seen": 304128300, + "step": 14100, + "time_per_iteration": 2.641993522644043 + }, + { + "auxiliary_loss_clip": 0.01088889, + "auxiliary_loss_mlp": 0.01028695, + "balance_loss_clip": 1.03368211, + "balance_loss_mlp": 1.01696491, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 2.1525150290635366, + "language_loss": 0.73827624, + "learning_rate": 2.380370324111085e-07, + "loss": 0.7594521, + "num_input_tokens_seen": 304143695, + "step": 14101, + "time_per_iteration": 2.5426337718963623 + }, + { + "auxiliary_loss_clip": 0.01084999, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.03147459, + "balance_loss_mlp": 1.01879859, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 1.9790928474124727, + "language_loss": 0.71359789, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73474211, + "num_input_tokens_seen": 304165800, + "step": 14102, + "time_per_iteration": 2.593966007232666 + }, + { + "auxiliary_loss_clip": 0.01068786, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.03100753, + "balance_loss_mlp": 1.01882112, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.0015794244286407, + "language_loss": 0.81644368, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83743989, + "num_input_tokens_seen": 304182910, + "step": 14103, + "time_per_iteration": 2.566758632659912 + }, + { + "auxiliary_loss_clip": 0.01097216, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.03461564, + "balance_loss_mlp": 1.01898038, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 1.9645409322578196, + "language_loss": 0.78577644, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80704486, + "num_input_tokens_seen": 304200175, + "step": 14104, + "time_per_iteration": 2.5044057369232178 + }, + { + "auxiliary_loss_clip": 0.01090148, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.03556621, + "balance_loss_mlp": 1.02012765, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 2.724082530341221, + "language_loss": 0.78835028, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.80956876, + "num_input_tokens_seen": 304217775, + "step": 14105, + "time_per_iteration": 4.086205959320068 + }, + { + "auxiliary_loss_clip": 0.01070451, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.03272152, + "balance_loss_mlp": 1.02158689, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 2.018217044953587, + "language_loss": 0.50264895, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.5237003, + "num_input_tokens_seen": 304235760, + "step": 14106, + "time_per_iteration": 2.5547361373901367 + }, + { + "auxiliary_loss_clip": 0.01065425, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.03350246, + "balance_loss_mlp": 1.02131319, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 1.834679487678236, + "language_loss": 0.75725949, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77823353, + "num_input_tokens_seen": 304253985, + "step": 14107, + "time_per_iteration": 2.598170042037964 + }, + { + "auxiliary_loss_clip": 0.01063417, + "auxiliary_loss_mlp": 0.0102455, + "balance_loss_clip": 1.03126717, + "balance_loss_mlp": 1.01365423, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.7033424282694725, + "language_loss": 0.73202032, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.7529, + "num_input_tokens_seen": 304276785, + "step": 14108, + "time_per_iteration": 4.275249481201172 + }, + { + "auxiliary_loss_clip": 0.01093778, + "auxiliary_loss_mlp": 0.01025946, + "balance_loss_clip": 1.03291011, + "balance_loss_mlp": 1.01345861, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.7401822996663572, + "language_loss": 0.72237581, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74357307, + "num_input_tokens_seen": 304296310, + "step": 14109, + "time_per_iteration": 2.533055305480957 + }, + { + "auxiliary_loss_clip": 0.01024021, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.02891064, + "balance_loss_mlp": 1.01705098, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 2.62169613662076, + "language_loss": 0.74308419, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.76360846, + "num_input_tokens_seen": 304311715, + "step": 14110, + "time_per_iteration": 2.654709815979004 + }, + { + "auxiliary_loss_clip": 0.01038565, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.03116393, + "balance_loss_mlp": 1.01927888, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.6668646244093066, + "language_loss": 0.7622658, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78295135, + "num_input_tokens_seen": 304331910, + "step": 14111, + "time_per_iteration": 2.7013044357299805 + }, + { + "auxiliary_loss_clip": 0.0109556, + "auxiliary_loss_mlp": 0.01025064, + "balance_loss_clip": 1.03357625, + "balance_loss_mlp": 1.01491392, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 1.5972829906088104, + "language_loss": 0.67263472, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69384098, + "num_input_tokens_seen": 304351405, + "step": 14112, + "time_per_iteration": 2.6259493827819824 + }, + { + "auxiliary_loss_clip": 0.01080331, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.03057957, + "balance_loss_mlp": 1.01932931, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.4838107847172473, + "language_loss": 0.73768306, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75878811, + "num_input_tokens_seen": 304372935, + "step": 14113, + "time_per_iteration": 2.5621695518493652 + }, + { + "auxiliary_loss_clip": 0.01060159, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.03409481, + "balance_loss_mlp": 1.02056241, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 1.927869303335163, + "language_loss": 0.66892397, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.6898374, + "num_input_tokens_seen": 304393070, + "step": 14114, + "time_per_iteration": 2.669626474380493 + }, + { + "auxiliary_loss_clip": 0.01100205, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.03511524, + "balance_loss_mlp": 1.02072573, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.7897307894877064, + "language_loss": 0.78981221, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.81113619, + "num_input_tokens_seen": 304411195, + "step": 14115, + "time_per_iteration": 2.5556743144989014 + }, + { + "auxiliary_loss_clip": 0.01097222, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.03388762, + "balance_loss_mlp": 1.01925516, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 2.1368653336902743, + "language_loss": 0.78514844, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.80642074, + "num_input_tokens_seen": 304429425, + "step": 14116, + "time_per_iteration": 2.509082078933716 + }, + { + "auxiliary_loss_clip": 0.01087587, + "auxiliary_loss_mlp": 0.01028952, + "balance_loss_clip": 1.03296518, + "balance_loss_mlp": 1.01766896, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 5.9171747458428685, + "language_loss": 0.68154919, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70271456, + "num_input_tokens_seen": 304447460, + "step": 14117, + "time_per_iteration": 2.474499464035034 + }, + { + "auxiliary_loss_clip": 0.01078554, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.03241336, + "balance_loss_mlp": 1.01501513, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 1.804307934188357, + "language_loss": 0.64848614, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.66953313, + "num_input_tokens_seen": 304468230, + "step": 14118, + "time_per_iteration": 4.072153329849243 + }, + { + "auxiliary_loss_clip": 0.01052578, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.03034639, + "balance_loss_mlp": 1.01970482, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.5158567562548726, + "language_loss": 0.73031241, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75114238, + "num_input_tokens_seen": 304484860, + "step": 14119, + "time_per_iteration": 2.6721303462982178 + }, + { + "auxiliary_loss_clip": 0.010642, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.0309844, + "balance_loss_mlp": 1.01884305, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.7384635585250559, + "language_loss": 0.78227353, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80322325, + "num_input_tokens_seen": 304503575, + "step": 14120, + "time_per_iteration": 2.5570003986358643 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.03561342, + "balance_loss_mlp": 1.01661587, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 6.2827691974899205, + "language_loss": 0.75678754, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.77791619, + "num_input_tokens_seen": 304525005, + "step": 14121, + "time_per_iteration": 2.5697662830352783 + }, + { + "auxiliary_loss_clip": 0.00983435, + "auxiliary_loss_mlp": 0.00999444, + "balance_loss_clip": 1.00439787, + "balance_loss_mlp": 0.99846059, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8130486386808371, + "language_loss": 0.60132611, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.6211549, + "num_input_tokens_seen": 304585220, + "step": 14122, + "time_per_iteration": 3.1525182723999023 + }, + { + "auxiliary_loss_clip": 0.0108695, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.03363109, + "balance_loss_mlp": 1.01675296, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 2.238001408215896, + "language_loss": 0.79752201, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81866693, + "num_input_tokens_seen": 304604665, + "step": 14123, + "time_per_iteration": 2.5865659713745117 + }, + { + "auxiliary_loss_clip": 0.01084308, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.0346247, + "balance_loss_mlp": 1.01467001, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 2.194906521499868, + "language_loss": 0.82887781, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.84997857, + "num_input_tokens_seen": 304620600, + "step": 14124, + "time_per_iteration": 2.5759217739105225 + }, + { + "auxiliary_loss_clip": 0.01051062, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.03337979, + "balance_loss_mlp": 1.02057076, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 1.8187274707415946, + "language_loss": 0.71744281, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73827106, + "num_input_tokens_seen": 304639540, + "step": 14125, + "time_per_iteration": 2.6863200664520264 + }, + { + "auxiliary_loss_clip": 0.01101149, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.0348177, + "balance_loss_mlp": 1.02570009, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.8347008193733418, + "language_loss": 0.73724478, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75863266, + "num_input_tokens_seen": 304660595, + "step": 14126, + "time_per_iteration": 2.5695314407348633 + }, + { + "auxiliary_loss_clip": 0.01047125, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.03010929, + "balance_loss_mlp": 1.02355266, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.5077470997976845, + "language_loss": 0.67325354, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69408423, + "num_input_tokens_seen": 304679580, + "step": 14127, + "time_per_iteration": 2.645937442779541 + }, + { + "auxiliary_loss_clip": 0.01061058, + "auxiliary_loss_mlp": 0.00749612, + "balance_loss_clip": 1.03030229, + "balance_loss_mlp": 1.00029683, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.808121569102017, + "language_loss": 0.69082242, + "learning_rate": 2.330860086502211e-07, + "loss": 0.70892906, + "num_input_tokens_seen": 304698385, + "step": 14128, + "time_per_iteration": 2.659278631210327 + }, + { + "auxiliary_loss_clip": 0.01068369, + "auxiliary_loss_mlp": 0.01031392, + "balance_loss_clip": 1.0316627, + "balance_loss_mlp": 1.0198226, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 2.1841728632454567, + "language_loss": 0.781142, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80213964, + "num_input_tokens_seen": 304715430, + "step": 14129, + "time_per_iteration": 2.5797014236450195 + }, + { + "auxiliary_loss_clip": 0.01029143, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.03086841, + "balance_loss_mlp": 1.01809692, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 1.9095474155942842, + "language_loss": 0.68421161, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.704799, + "num_input_tokens_seen": 304734345, + "step": 14130, + "time_per_iteration": 2.743356704711914 + }, + { + "auxiliary_loss_clip": 0.01087946, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.0349201, + "balance_loss_mlp": 1.01860321, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 1.6390082020008738, + "language_loss": 0.7083056, + "learning_rate": 2.3253890747186e-07, + "loss": 0.72947991, + "num_input_tokens_seen": 304755030, + "step": 14131, + "time_per_iteration": 2.6378533840179443 + }, + { + "auxiliary_loss_clip": 0.01066235, + "auxiliary_loss_mlp": 0.0102839, + "balance_loss_clip": 1.03377545, + "balance_loss_mlp": 1.01742256, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.9074391318497819, + "language_loss": 0.6843859, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.7053321, + "num_input_tokens_seen": 304774320, + "step": 14132, + "time_per_iteration": 4.1678876876831055 + }, + { + "auxiliary_loss_clip": 0.01093527, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.03161216, + "balance_loss_mlp": 1.02416563, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.5914814920272389, + "language_loss": 0.70389611, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72517484, + "num_input_tokens_seen": 304795355, + "step": 14133, + "time_per_iteration": 2.561767578125 + }, + { + "auxiliary_loss_clip": 0.00986965, + "auxiliary_loss_mlp": 0.00746566, + "balance_loss_clip": 1.00701714, + "balance_loss_mlp": 0.99976969, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.723169214777159, + "language_loss": 0.5759151, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59325039, + "num_input_tokens_seen": 304863915, + "step": 14134, + "time_per_iteration": 3.261160135269165 + }, + { + "auxiliary_loss_clip": 0.01063127, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.03343201, + "balance_loss_mlp": 1.01812577, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 2.2787329833368113, + "language_loss": 0.78827631, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.80920225, + "num_input_tokens_seen": 304881555, + "step": 14135, + "time_per_iteration": 2.632025718688965 + }, + { + "auxiliary_loss_clip": 0.01086007, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.03377318, + "balance_loss_mlp": 1.01907921, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 1.7468621691110728, + "language_loss": 0.63547188, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65663791, + "num_input_tokens_seen": 304898760, + "step": 14136, + "time_per_iteration": 2.5237884521484375 + }, + { + "auxiliary_loss_clip": 0.0109037, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.03510416, + "balance_loss_mlp": 1.01772928, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 2.0300590713771105, + "language_loss": 0.83707833, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.85828251, + "num_input_tokens_seen": 304915465, + "step": 14137, + "time_per_iteration": 2.539393186569214 + }, + { + "auxiliary_loss_clip": 0.01067571, + "auxiliary_loss_mlp": 0.01025948, + "balance_loss_clip": 1.03559768, + "balance_loss_mlp": 1.0161972, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.4259768253973033, + "language_loss": 0.7883786, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80931377, + "num_input_tokens_seen": 304933190, + "step": 14138, + "time_per_iteration": 2.642486810684204 + }, + { + "auxiliary_loss_clip": 0.01087443, + "auxiliary_loss_mlp": 0.01025809, + "balance_loss_clip": 1.03429782, + "balance_loss_mlp": 1.01500916, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.6829732463538858, + "language_loss": 0.64241695, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66354942, + "num_input_tokens_seen": 304951110, + "step": 14139, + "time_per_iteration": 2.549100399017334 + }, + { + "auxiliary_loss_clip": 0.01055746, + "auxiliary_loss_mlp": 0.01027165, + "balance_loss_clip": 1.03221798, + "balance_loss_mlp": 1.01682353, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 2.00714883522347, + "language_loss": 0.71004194, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.73087108, + "num_input_tokens_seen": 304969095, + "step": 14140, + "time_per_iteration": 2.5527563095092773 + }, + { + "auxiliary_loss_clip": 0.01056877, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.03163385, + "balance_loss_mlp": 1.01897621, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 2.1212876667724356, + "language_loss": 0.63982087, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66069674, + "num_input_tokens_seen": 304989315, + "step": 14141, + "time_per_iteration": 2.676844596862793 + }, + { + "auxiliary_loss_clip": 0.0106736, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.03383136, + "balance_loss_mlp": 1.01873946, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.4959337626863924, + "language_loss": 0.70865691, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.7296285, + "num_input_tokens_seen": 305011020, + "step": 14142, + "time_per_iteration": 2.784191608428955 + }, + { + "auxiliary_loss_clip": 0.01044162, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.02814376, + "balance_loss_mlp": 1.018291, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 1.5553776045214909, + "language_loss": 0.65546793, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.6762051, + "num_input_tokens_seen": 305033550, + "step": 14143, + "time_per_iteration": 2.669799327850342 + }, + { + "auxiliary_loss_clip": 0.01068654, + "auxiliary_loss_mlp": 0.00749413, + "balance_loss_clip": 1.03215504, + "balance_loss_mlp": 1.0002178, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 1.8141902404438133, + "language_loss": 0.67824179, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69642246, + "num_input_tokens_seen": 305052885, + "step": 14144, + "time_per_iteration": 2.621854543685913 + }, + { + "auxiliary_loss_clip": 0.01026505, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.02884698, + "balance_loss_mlp": 1.02042508, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.0856231178406532, + "language_loss": 0.6478681, + "learning_rate": 2.299937473050777e-07, + "loss": 0.66846228, + "num_input_tokens_seen": 305071995, + "step": 14145, + "time_per_iteration": 4.125805854797363 + }, + { + "auxiliary_loss_clip": 0.01075724, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.0320344, + "balance_loss_mlp": 1.02109909, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.9248722128772902, + "language_loss": 0.85801733, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87910002, + "num_input_tokens_seen": 305090190, + "step": 14146, + "time_per_iteration": 2.5428178310394287 + }, + { + "auxiliary_loss_clip": 0.01094703, + "auxiliary_loss_mlp": 0.01025671, + "balance_loss_clip": 1.0315969, + "balance_loss_mlp": 1.01503778, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 1.7223130809182197, + "language_loss": 0.83647478, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85767853, + "num_input_tokens_seen": 305109355, + "step": 14147, + "time_per_iteration": 2.476769208908081 + }, + { + "auxiliary_loss_clip": 0.01084749, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.03301501, + "balance_loss_mlp": 1.01876032, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 3.101369758815863, + "language_loss": 0.85333556, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87448603, + "num_input_tokens_seen": 305124165, + "step": 14148, + "time_per_iteration": 3.9813780784606934 + }, + { + "auxiliary_loss_clip": 0.01073505, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.03163147, + "balance_loss_mlp": 1.01874161, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 3.297626114457011, + "language_loss": 0.72055876, + "learning_rate": 2.292689741370204e-07, + "loss": 0.74159473, + "num_input_tokens_seen": 305143940, + "step": 14149, + "time_per_iteration": 2.5759811401367188 + }, + { + "auxiliary_loss_clip": 0.01077084, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.03378129, + "balance_loss_mlp": 1.01839542, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 2.36636566616379, + "language_loss": 0.75788641, + "learning_rate": 2.290879486935804e-07, + "loss": 0.77895314, + "num_input_tokens_seen": 305163505, + "step": 14150, + "time_per_iteration": 2.5788486003875732 + }, + { + "auxiliary_loss_clip": 0.01052713, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.03118193, + "balance_loss_mlp": 1.02015758, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.6382307277154924, + "language_loss": 0.72190607, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74274433, + "num_input_tokens_seen": 305182325, + "step": 14151, + "time_per_iteration": 2.59228777885437 + }, + { + "auxiliary_loss_clip": 0.00980701, + "auxiliary_loss_mlp": 0.01003704, + "balance_loss_clip": 1.00921774, + "balance_loss_mlp": 1.00277412, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8965940206645825, + "language_loss": 0.59647459, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61631864, + "num_input_tokens_seen": 305230775, + "step": 14152, + "time_per_iteration": 2.9940073490142822 + }, + { + "auxiliary_loss_clip": 0.01003346, + "auxiliary_loss_mlp": 0.00999289, + "balance_loss_clip": 1.00331855, + "balance_loss_mlp": 0.99837077, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.6949339998388672, + "language_loss": 0.61225533, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63228172, + "num_input_tokens_seen": 305296000, + "step": 14153, + "time_per_iteration": 3.1690073013305664 + }, + { + "auxiliary_loss_clip": 0.01088473, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.03405786, + "balance_loss_mlp": 1.02078807, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 1.7347105760122634, + "language_loss": 0.80961657, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.8308233, + "num_input_tokens_seen": 305314705, + "step": 14154, + "time_per_iteration": 2.551255702972412 + }, + { + "auxiliary_loss_clip": 0.01061225, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.03198099, + "balance_loss_mlp": 1.02122188, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 2.2485737679181543, + "language_loss": 0.79348993, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81441551, + "num_input_tokens_seen": 305333870, + "step": 14155, + "time_per_iteration": 2.756352186203003 + }, + { + "auxiliary_loss_clip": 0.01062815, + "auxiliary_loss_mlp": 0.01026356, + "balance_loss_clip": 1.03073967, + "balance_loss_mlp": 1.01517403, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 1.6942706975590074, + "language_loss": 0.70531356, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72620523, + "num_input_tokens_seen": 305352780, + "step": 14156, + "time_per_iteration": 2.6752302646636963 + }, + { + "auxiliary_loss_clip": 0.01057149, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.03245759, + "balance_loss_mlp": 1.02176285, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 1.6975114965845073, + "language_loss": 0.73704731, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75794566, + "num_input_tokens_seen": 305371370, + "step": 14157, + "time_per_iteration": 2.6248881816864014 + }, + { + "auxiliary_loss_clip": 0.01025672, + "auxiliary_loss_mlp": 0.01024043, + "balance_loss_clip": 1.03133166, + "balance_loss_mlp": 1.01420867, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 2.9202406617796624, + "language_loss": 0.79232883, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81282598, + "num_input_tokens_seen": 305387955, + "step": 14158, + "time_per_iteration": 4.264881134033203 + }, + { + "auxiliary_loss_clip": 0.01080097, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.03207064, + "balance_loss_mlp": 1.02151263, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 1.8975621798005284, + "language_loss": 0.79346788, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81460702, + "num_input_tokens_seen": 305406285, + "step": 14159, + "time_per_iteration": 2.516875743865967 + }, + { + "auxiliary_loss_clip": 0.01075441, + "auxiliary_loss_mlp": 0.01031013, + "balance_loss_clip": 1.03307343, + "balance_loss_mlp": 1.01990271, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 3.0449287146631576, + "language_loss": 0.71385896, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73492348, + "num_input_tokens_seen": 305424500, + "step": 14160, + "time_per_iteration": 2.6080539226531982 + }, + { + "auxiliary_loss_clip": 0.01093684, + "auxiliary_loss_mlp": 0.01028139, + "balance_loss_clip": 1.03522384, + "balance_loss_mlp": 1.01598001, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 2.2798686654699782, + "language_loss": 0.70385236, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.7250706, + "num_input_tokens_seen": 305442990, + "step": 14161, + "time_per_iteration": 2.6655642986297607 + }, + { + "auxiliary_loss_clip": 0.01085109, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.0296402, + "balance_loss_mlp": 1.02008414, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.15811833357697, + "language_loss": 0.78331995, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80447924, + "num_input_tokens_seen": 305463065, + "step": 14162, + "time_per_iteration": 2.592073917388916 + }, + { + "auxiliary_loss_clip": 0.01089172, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.03462052, + "balance_loss_mlp": 1.02266479, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 3.959514932006472, + "language_loss": 0.76846838, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.78969979, + "num_input_tokens_seen": 305489070, + "step": 14163, + "time_per_iteration": 2.685011863708496 + }, + { + "auxiliary_loss_clip": 0.01004093, + "auxiliary_loss_mlp": 0.00996873, + "balance_loss_clip": 1.00395644, + "balance_loss_mlp": 0.99582952, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6903120707199911, + "language_loss": 0.55024958, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57025921, + "num_input_tokens_seen": 305551490, + "step": 14164, + "time_per_iteration": 3.2276241779327393 + }, + { + "auxiliary_loss_clip": 0.01087517, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03326178, + "balance_loss_mlp": 1.02097476, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 1.8445065248512083, + "language_loss": 0.72445095, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.74564898, + "num_input_tokens_seen": 305570535, + "step": 14165, + "time_per_iteration": 2.561739206314087 + }, + { + "auxiliary_loss_clip": 0.01058832, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03305411, + "balance_loss_mlp": 1.02015531, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.5763941808465352, + "language_loss": 0.67301691, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69391656, + "num_input_tokens_seen": 305590800, + "step": 14166, + "time_per_iteration": 2.62542462348938 + }, + { + "auxiliary_loss_clip": 0.01069734, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.02916324, + "balance_loss_mlp": 1.02007818, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 4.675212265611369, + "language_loss": 0.73386824, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75487685, + "num_input_tokens_seen": 305609495, + "step": 14167, + "time_per_iteration": 2.5745909214019775 + }, + { + "auxiliary_loss_clip": 0.01096078, + "auxiliary_loss_mlp": 0.01026555, + "balance_loss_clip": 1.03306723, + "balance_loss_mlp": 1.01608849, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.7959368055560263, + "language_loss": 0.80347311, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82469946, + "num_input_tokens_seen": 305629420, + "step": 14168, + "time_per_iteration": 2.542968273162842 + }, + { + "auxiliary_loss_clip": 0.01095441, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_clip": 1.03193283, + "balance_loss_mlp": 1.01638377, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 4.6614184759121375, + "language_loss": 0.75963104, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78085756, + "num_input_tokens_seen": 305649835, + "step": 14169, + "time_per_iteration": 2.5548627376556396 + }, + { + "auxiliary_loss_clip": 0.01101126, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.03553152, + "balance_loss_mlp": 1.01801205, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1.7243032990029457, + "language_loss": 0.63647753, + "learning_rate": 2.254815511000452e-07, + "loss": 0.6577847, + "num_input_tokens_seen": 305668840, + "step": 14170, + "time_per_iteration": 2.5046701431274414 + }, + { + "auxiliary_loss_clip": 0.01075732, + "auxiliary_loss_mlp": 0.01025432, + "balance_loss_clip": 1.03013301, + "balance_loss_mlp": 1.01484025, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 3.161255436105954, + "language_loss": 0.86572957, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88674128, + "num_input_tokens_seen": 305686955, + "step": 14171, + "time_per_iteration": 2.5215954780578613 + }, + { + "auxiliary_loss_clip": 0.01080245, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.03296995, + "balance_loss_mlp": 1.02320206, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.6964842444819743, + "language_loss": 0.5500927, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.57123876, + "num_input_tokens_seen": 305706290, + "step": 14172, + "time_per_iteration": 4.066091537475586 + }, + { + "auxiliary_loss_clip": 0.01069512, + "auxiliary_loss_mlp": 0.01026991, + "balance_loss_clip": 1.03022623, + "balance_loss_mlp": 1.01784182, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.724832882005979, + "language_loss": 0.69634008, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71730506, + "num_input_tokens_seen": 305723835, + "step": 14173, + "time_per_iteration": 2.5398993492126465 + }, + { + "auxiliary_loss_clip": 0.01075351, + "auxiliary_loss_mlp": 0.0074936, + "balance_loss_clip": 1.03119183, + "balance_loss_mlp": 1.00018132, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 2.8732932086533025, + "language_loss": 0.76845884, + "learning_rate": 2.247634997500205e-07, + "loss": 0.78670591, + "num_input_tokens_seen": 305741655, + "step": 14174, + "time_per_iteration": 2.561849594116211 + }, + { + "auxiliary_loss_clip": 0.01059395, + "auxiliary_loss_mlp": 0.00749387, + "balance_loss_clip": 1.03048718, + "balance_loss_mlp": 1.00022483, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.5206409486135493, + "language_loss": 0.81854111, + "learning_rate": 2.245841551883676e-07, + "loss": 0.83662891, + "num_input_tokens_seen": 305761890, + "step": 14175, + "time_per_iteration": 2.649629831314087 + }, + { + "auxiliary_loss_clip": 0.01102552, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.03611922, + "balance_loss_mlp": 1.01811862, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 2.443612675405769, + "language_loss": 0.65778846, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67911261, + "num_input_tokens_seen": 305779190, + "step": 14176, + "time_per_iteration": 2.4845659732818604 + }, + { + "auxiliary_loss_clip": 0.01070315, + "auxiliary_loss_mlp": 0.00749258, + "balance_loss_clip": 1.03075218, + "balance_loss_mlp": 1.00021482, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 2.176195391982205, + "language_loss": 0.78559923, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80379498, + "num_input_tokens_seen": 305799870, + "step": 14177, + "time_per_iteration": 2.649358034133911 + }, + { + "auxiliary_loss_clip": 0.01075235, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.03279305, + "balance_loss_mlp": 1.01939476, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.7429107785214784, + "language_loss": 0.73327851, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75433266, + "num_input_tokens_seen": 305819695, + "step": 14178, + "time_per_iteration": 2.6392223834991455 + }, + { + "auxiliary_loss_clip": 0.01063288, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.03358543, + "balance_loss_mlp": 1.02549207, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.9694581279376546, + "language_loss": 0.74731088, + "learning_rate": 2.238674502491935e-07, + "loss": 0.7683081, + "num_input_tokens_seen": 305837270, + "step": 14179, + "time_per_iteration": 2.584099292755127 + }, + { + "auxiliary_loss_clip": 0.01096454, + "auxiliary_loss_mlp": 0.01023365, + "balance_loss_clip": 1.03421474, + "balance_loss_mlp": 1.01273751, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 2.090772693744862, + "language_loss": 0.81742764, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83862579, + "num_input_tokens_seen": 305855250, + "step": 14180, + "time_per_iteration": 2.5217525959014893 + }, + { + "auxiliary_loss_clip": 0.01033413, + "auxiliary_loss_mlp": 0.01034628, + "balance_loss_clip": 1.02873945, + "balance_loss_mlp": 1.02364838, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 6.196939812870626, + "language_loss": 0.61106926, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63174963, + "num_input_tokens_seen": 305875660, + "step": 14181, + "time_per_iteration": 2.71899676322937 + }, + { + "auxiliary_loss_clip": 0.01097104, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.03549147, + "balance_loss_mlp": 1.02084208, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.0449435602693082, + "language_loss": 0.7262643, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74754465, + "num_input_tokens_seen": 305892415, + "step": 14182, + "time_per_iteration": 2.5111148357391357 + }, + { + "auxiliary_loss_clip": 0.01045432, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.03039813, + "balance_loss_mlp": 1.02165151, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 1.866769557108527, + "language_loss": 0.707883, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.72866249, + "num_input_tokens_seen": 305912665, + "step": 14183, + "time_per_iteration": 2.663571834564209 + }, + { + "auxiliary_loss_clip": 0.01069152, + "auxiliary_loss_mlp": 0.01026758, + "balance_loss_clip": 1.03392363, + "balance_loss_mlp": 1.01660156, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 2.1314858610828398, + "language_loss": 0.73160839, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.75256753, + "num_input_tokens_seen": 305931515, + "step": 14184, + "time_per_iteration": 4.077738046646118 + }, + { + "auxiliary_loss_clip": 0.01098363, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.03502893, + "balance_loss_mlp": 1.0187943, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.8096720652079568, + "language_loss": 0.76762271, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.78890163, + "num_input_tokens_seen": 305949965, + "step": 14185, + "time_per_iteration": 2.526207685470581 + }, + { + "auxiliary_loss_clip": 0.01050182, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.02913511, + "balance_loss_mlp": 1.01796627, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 2.2704638576308738, + "language_loss": 0.79476589, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81557393, + "num_input_tokens_seen": 305967820, + "step": 14186, + "time_per_iteration": 2.605727195739746 + }, + { + "auxiliary_loss_clip": 0.01073721, + "auxiliary_loss_mlp": 0.01026042, + "balance_loss_clip": 1.03030634, + "balance_loss_mlp": 1.01449084, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.614688433425162, + "language_loss": 0.62835848, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64935613, + "num_input_tokens_seen": 305985505, + "step": 14187, + "time_per_iteration": 2.6296660900115967 + }, + { + "auxiliary_loss_clip": 0.01043059, + "auxiliary_loss_mlp": 0.01027794, + "balance_loss_clip": 1.02919841, + "balance_loss_mlp": 1.01587915, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 7.539822584223656, + "language_loss": 0.76732492, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.78803349, + "num_input_tokens_seen": 306005220, + "step": 14188, + "time_per_iteration": 4.218381881713867 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.03283775, + "balance_loss_mlp": 1.01942134, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.7473062605077343, + "language_loss": 0.78286368, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80404705, + "num_input_tokens_seen": 306023785, + "step": 14189, + "time_per_iteration": 2.600155830383301 + }, + { + "auxiliary_loss_clip": 0.01071836, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.03049517, + "balance_loss_mlp": 1.01797712, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 1.9926842467790062, + "language_loss": 0.79500586, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81601912, + "num_input_tokens_seen": 306041600, + "step": 14190, + "time_per_iteration": 2.565028190612793 + }, + { + "auxiliary_loss_clip": 0.0104948, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.03293157, + "balance_loss_mlp": 1.01835227, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 1.796398560884281, + "language_loss": 0.75876546, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.77955461, + "num_input_tokens_seen": 306060345, + "step": 14191, + "time_per_iteration": 2.6485085487365723 + }, + { + "auxiliary_loss_clip": 0.01086397, + "auxiliary_loss_mlp": 0.01027453, + "balance_loss_clip": 1.03460526, + "balance_loss_mlp": 1.0163188, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.771955388855732, + "language_loss": 0.69397438, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71511292, + "num_input_tokens_seen": 306078285, + "step": 14192, + "time_per_iteration": 2.5501911640167236 + }, + { + "auxiliary_loss_clip": 0.01091252, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.03521967, + "balance_loss_mlp": 1.02413046, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.539586173247169, + "language_loss": 0.62587482, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.64715493, + "num_input_tokens_seen": 306093760, + "step": 14193, + "time_per_iteration": 2.563397169113159 + }, + { + "auxiliary_loss_clip": 0.01069555, + "auxiliary_loss_mlp": 0.01024804, + "balance_loss_clip": 1.0324676, + "balance_loss_mlp": 1.01358676, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 1.777500260780553, + "language_loss": 0.76868439, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78962803, + "num_input_tokens_seen": 306112595, + "step": 14194, + "time_per_iteration": 2.564870834350586 + }, + { + "auxiliary_loss_clip": 0.01096994, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.03389311, + "balance_loss_mlp": 1.01650667, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 2.39877503958341, + "language_loss": 0.69494134, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71618229, + "num_input_tokens_seen": 306131800, + "step": 14195, + "time_per_iteration": 2.60087251663208 + }, + { + "auxiliary_loss_clip": 0.01071624, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.03237939, + "balance_loss_mlp": 1.01958942, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 1.9779655630029467, + "language_loss": 0.85441625, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.87544358, + "num_input_tokens_seen": 306150590, + "step": 14196, + "time_per_iteration": 2.579282522201538 + }, + { + "auxiliary_loss_clip": 0.01004967, + "auxiliary_loss_mlp": 0.01003864, + "balance_loss_clip": 1.00499678, + "balance_loss_mlp": 1.00306535, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7598377204824981, + "language_loss": 0.5507158, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57080412, + "num_input_tokens_seen": 306205850, + "step": 14197, + "time_per_iteration": 3.0783145427703857 + }, + { + "auxiliary_loss_clip": 0.0106099, + "auxiliary_loss_mlp": 0.00749275, + "balance_loss_clip": 1.03079855, + "balance_loss_mlp": 1.00019515, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.4913957490277772, + "language_loss": 0.81505334, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83315593, + "num_input_tokens_seen": 306225220, + "step": 14198, + "time_per_iteration": 2.707843065261841 + }, + { + "auxiliary_loss_clip": 0.01095972, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.03344679, + "balance_loss_mlp": 1.01807237, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.438121726214208, + "language_loss": 0.68636847, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70760596, + "num_input_tokens_seen": 306249865, + "step": 14199, + "time_per_iteration": 4.14480447769165 + }, + { + "auxiliary_loss_clip": 0.01059392, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.029814, + "balance_loss_mlp": 1.02041936, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.6938492231201623, + "language_loss": 0.86240232, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88330573, + "num_input_tokens_seen": 306270215, + "step": 14200, + "time_per_iteration": 2.656057834625244 + }, + { + "auxiliary_loss_clip": 0.01066733, + "auxiliary_loss_mlp": 0.01024707, + "balance_loss_clip": 1.03309536, + "balance_loss_mlp": 1.01433587, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.7610860308106906, + "language_loss": 0.77880675, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.79972118, + "num_input_tokens_seen": 306288960, + "step": 14201, + "time_per_iteration": 2.627824068069458 + }, + { + "auxiliary_loss_clip": 0.01071869, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.03401351, + "balance_loss_mlp": 1.01765656, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 1.8972445797058968, + "language_loss": 0.68970603, + "learning_rate": 2.19767322694256e-07, + "loss": 0.71070385, + "num_input_tokens_seen": 306308735, + "step": 14202, + "time_per_iteration": 2.6837217807769775 + }, + { + "auxiliary_loss_clip": 0.01089037, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.03515077, + "balance_loss_mlp": 1.02464604, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 2.468332853696283, + "language_loss": 0.80119312, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82243913, + "num_input_tokens_seen": 306329015, + "step": 14203, + "time_per_iteration": 2.5615694522857666 + }, + { + "auxiliary_loss_clip": 0.01078699, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.03284478, + "balance_loss_mlp": 1.02006781, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 1.9304180174174388, + "language_loss": 0.65868199, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.67978758, + "num_input_tokens_seen": 306349085, + "step": 14204, + "time_per_iteration": 2.5942447185516357 + }, + { + "auxiliary_loss_clip": 0.01098432, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.03348541, + "balance_loss_mlp": 1.01906657, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.1787077601057945, + "language_loss": 0.59478426, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.61607724, + "num_input_tokens_seen": 306365385, + "step": 14205, + "time_per_iteration": 2.4280176162719727 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.03266454, + "balance_loss_mlp": 1.01371884, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.2715787339999265, + "language_loss": 0.7230376, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74402475, + "num_input_tokens_seen": 306384585, + "step": 14206, + "time_per_iteration": 2.638347864151001 + }, + { + "auxiliary_loss_clip": 0.01088942, + "auxiliary_loss_mlp": 0.0102722, + "balance_loss_clip": 1.03483582, + "balance_loss_mlp": 1.01613319, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 2.6902434321361475, + "language_loss": 0.76126397, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78242552, + "num_input_tokens_seen": 306401565, + "step": 14207, + "time_per_iteration": 2.4937591552734375 + }, + { + "auxiliary_loss_clip": 0.01099293, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.03437853, + "balance_loss_mlp": 1.0181725, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 2.790114626750914, + "language_loss": 0.85033882, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87163103, + "num_input_tokens_seen": 306419995, + "step": 14208, + "time_per_iteration": 2.4528605937957764 + }, + { + "auxiliary_loss_clip": 0.01076809, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.03312576, + "balance_loss_mlp": 1.01976919, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.6402313061123586, + "language_loss": 0.65926272, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.68033266, + "num_input_tokens_seen": 306439240, + "step": 14209, + "time_per_iteration": 2.6117751598358154 + }, + { + "auxiliary_loss_clip": 0.01048457, + "auxiliary_loss_mlp": 0.01025521, + "balance_loss_clip": 1.03014302, + "balance_loss_mlp": 1.01481605, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 2.068743481737566, + "language_loss": 0.7049458, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72568566, + "num_input_tokens_seen": 306458425, + "step": 14210, + "time_per_iteration": 2.6638259887695312 + }, + { + "auxiliary_loss_clip": 0.01072106, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.03160095, + "balance_loss_mlp": 1.01636505, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.6659704424184347, + "language_loss": 0.70020628, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72119802, + "num_input_tokens_seen": 306477210, + "step": 14211, + "time_per_iteration": 2.611020565032959 + }, + { + "auxiliary_loss_clip": 0.01078045, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.03355575, + "balance_loss_mlp": 1.02046764, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 2.206592581357339, + "language_loss": 0.8131361, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83423173, + "num_input_tokens_seen": 306495820, + "step": 14212, + "time_per_iteration": 4.096228122711182 + }, + { + "auxiliary_loss_clip": 0.01059805, + "auxiliary_loss_mlp": 0.01028408, + "balance_loss_clip": 1.02923298, + "balance_loss_mlp": 1.01609981, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 1.9291944677286992, + "language_loss": 0.65996051, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68084264, + "num_input_tokens_seen": 306516420, + "step": 14213, + "time_per_iteration": 2.768099546432495 + }, + { + "auxiliary_loss_clip": 0.01095806, + "auxiliary_loss_mlp": 0.0102563, + "balance_loss_clip": 1.03314126, + "balance_loss_mlp": 1.01473379, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 4.398440384180921, + "language_loss": 0.78160405, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80281842, + "num_input_tokens_seen": 306534785, + "step": 14214, + "time_per_iteration": 2.5244147777557373 + }, + { + "auxiliary_loss_clip": 0.01077778, + "auxiliary_loss_mlp": 0.01027581, + "balance_loss_clip": 1.03164423, + "balance_loss_mlp": 1.01535654, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 2.9253692922069674, + "language_loss": 0.66527545, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68632907, + "num_input_tokens_seen": 306552440, + "step": 14215, + "time_per_iteration": 2.613736391067505 + }, + { + "auxiliary_loss_clip": 0.01097056, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.03467, + "balance_loss_mlp": 1.01521242, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 2.0793061596413156, + "language_loss": 0.62886745, + "learning_rate": 2.172890718362279e-07, + "loss": 0.65009809, + "num_input_tokens_seen": 306573600, + "step": 14216, + "time_per_iteration": 2.6218085289001465 + }, + { + "auxiliary_loss_clip": 0.01059585, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03055632, + "balance_loss_mlp": 1.01965356, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 2.329735993533153, + "language_loss": 0.65458924, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67549527, + "num_input_tokens_seen": 306592840, + "step": 14217, + "time_per_iteration": 2.6000454425811768 + }, + { + "auxiliary_loss_clip": 0.0108456, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.03560591, + "balance_loss_mlp": 1.01701808, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.4428092653610396, + "language_loss": 0.65011203, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67123079, + "num_input_tokens_seen": 306613210, + "step": 14218, + "time_per_iteration": 2.6420552730560303 + }, + { + "auxiliary_loss_clip": 0.01081061, + "auxiliary_loss_mlp": 0.010315, + "balance_loss_clip": 1.03113425, + "balance_loss_mlp": 1.01966822, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 1.7269073399881212, + "language_loss": 0.70300752, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72413313, + "num_input_tokens_seen": 306631620, + "step": 14219, + "time_per_iteration": 2.5368540287017822 + }, + { + "auxiliary_loss_clip": 0.01072432, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.03017008, + "balance_loss_mlp": 1.02064371, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.1147474950123297, + "language_loss": 0.67797816, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69902188, + "num_input_tokens_seen": 306646695, + "step": 14220, + "time_per_iteration": 2.5293118953704834 + }, + { + "auxiliary_loss_clip": 0.01093862, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.03334832, + "balance_loss_mlp": 1.018731, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 2.408656994252355, + "language_loss": 0.7129038, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73413384, + "num_input_tokens_seen": 306665465, + "step": 14221, + "time_per_iteration": 2.547731876373291 + }, + { + "auxiliary_loss_clip": 0.01063714, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.03062105, + "balance_loss_mlp": 1.02521491, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 1.940803571815879, + "language_loss": 0.59973478, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.62073892, + "num_input_tokens_seen": 306685950, + "step": 14222, + "time_per_iteration": 2.7703664302825928 + }, + { + "auxiliary_loss_clip": 0.01065311, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.02997959, + "balance_loss_mlp": 1.01689231, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.7199474204184446, + "language_loss": 0.8394779, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86041474, + "num_input_tokens_seen": 306705740, + "step": 14223, + "time_per_iteration": 2.6016018390655518 + }, + { + "auxiliary_loss_clip": 0.01085415, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.03382528, + "balance_loss_mlp": 1.02090645, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.707521245478188, + "language_loss": 0.74401361, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76518369, + "num_input_tokens_seen": 306725065, + "step": 14224, + "time_per_iteration": 4.118286848068237 + }, + { + "auxiliary_loss_clip": 0.010657, + "auxiliary_loss_mlp": 0.00749496, + "balance_loss_clip": 1.0289855, + "balance_loss_mlp": 1.00021696, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 2.811682295510997, + "language_loss": 0.75602627, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77417827, + "num_input_tokens_seen": 306743630, + "step": 14225, + "time_per_iteration": 2.561185598373413 + }, + { + "auxiliary_loss_clip": 0.01030368, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.02897382, + "balance_loss_mlp": 1.02272356, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.639423550195234, + "language_loss": 0.77232307, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.7929579, + "num_input_tokens_seen": 306763105, + "step": 14226, + "time_per_iteration": 2.7995612621307373 + }, + { + "auxiliary_loss_clip": 0.01099575, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.03377712, + "balance_loss_mlp": 1.02146339, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 1.9195599143048196, + "language_loss": 0.5445829, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56590748, + "num_input_tokens_seen": 306779875, + "step": 14227, + "time_per_iteration": 2.535719871520996 + }, + { + "auxiliary_loss_clip": 0.01064577, + "auxiliary_loss_mlp": 0.0074939, + "balance_loss_clip": 1.03278029, + "balance_loss_mlp": 1.00017428, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 1.8093287644088827, + "language_loss": 0.65651697, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67465663, + "num_input_tokens_seen": 306800015, + "step": 14228, + "time_per_iteration": 4.038755893707275 + }, + { + "auxiliary_loss_clip": 0.01066518, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.0346365, + "balance_loss_mlp": 1.02317977, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 2.114705362749777, + "language_loss": 0.7369864, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.75799578, + "num_input_tokens_seen": 306814160, + "step": 14229, + "time_per_iteration": 2.6217286586761475 + }, + { + "auxiliary_loss_clip": 0.01084581, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.01678848, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 1.8901831000982257, + "language_loss": 0.7228086, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74392343, + "num_input_tokens_seen": 306833310, + "step": 14230, + "time_per_iteration": 2.551715612411499 + }, + { + "auxiliary_loss_clip": 0.01080975, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.03144598, + "balance_loss_mlp": 1.01400089, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 2.2186117538770267, + "language_loss": 0.8256197, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84667945, + "num_input_tokens_seen": 306851345, + "step": 14231, + "time_per_iteration": 2.4701268672943115 + }, + { + "auxiliary_loss_clip": 0.01088783, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.03432274, + "balance_loss_mlp": 1.0186379, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 2.823146665258237, + "language_loss": 0.68093801, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.70212936, + "num_input_tokens_seen": 306871040, + "step": 14232, + "time_per_iteration": 2.548367977142334 + }, + { + "auxiliary_loss_clip": 0.0107778, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.03438711, + "balance_loss_mlp": 1.01698124, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 3.0052868487480984, + "language_loss": 0.67224342, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69330382, + "num_input_tokens_seen": 306891625, + "step": 14233, + "time_per_iteration": 2.636826753616333 + }, + { + "auxiliary_loss_clip": 0.01084209, + "auxiliary_loss_mlp": 0.01027963, + "balance_loss_clip": 1.03181434, + "balance_loss_mlp": 1.01762152, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.6331659174331117, + "language_loss": 0.76657575, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78769743, + "num_input_tokens_seen": 306910020, + "step": 14234, + "time_per_iteration": 2.5108914375305176 + }, + { + "auxiliary_loss_clip": 0.00992621, + "auxiliary_loss_mlp": 0.01001767, + "balance_loss_clip": 1.00307417, + "balance_loss_mlp": 1.00049114, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7522482339886297, + "language_loss": 0.58068597, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60062987, + "num_input_tokens_seen": 306969505, + "step": 14235, + "time_per_iteration": 3.109970808029175 + }, + { + "auxiliary_loss_clip": 0.01005675, + "auxiliary_loss_mlp": 0.01004518, + "balance_loss_clip": 1.005404, + "balance_loss_mlp": 1.00369573, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7870191921601136, + "language_loss": 0.56681114, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58691305, + "num_input_tokens_seen": 307027710, + "step": 14236, + "time_per_iteration": 3.045313835144043 + }, + { + "auxiliary_loss_clip": 0.01073549, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.03155065, + "balance_loss_mlp": 1.020082, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.7697779615526827, + "language_loss": 0.70555055, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72659636, + "num_input_tokens_seen": 307045515, + "step": 14237, + "time_per_iteration": 2.547124147415161 + }, + { + "auxiliary_loss_clip": 0.01073552, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.03116918, + "balance_loss_mlp": 1.01840639, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.3132543134659507, + "language_loss": 0.6313622, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.65238535, + "num_input_tokens_seen": 307064470, + "step": 14238, + "time_per_iteration": 2.6123836040496826 + }, + { + "auxiliary_loss_clip": 0.01092553, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.03263891, + "balance_loss_mlp": 1.01916027, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 4.492341789590377, + "language_loss": 0.69214672, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71335506, + "num_input_tokens_seen": 307083900, + "step": 14239, + "time_per_iteration": 4.000153064727783 + }, + { + "auxiliary_loss_clip": 0.01098139, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.0327003, + "balance_loss_mlp": 1.02084601, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 2.1448788475632417, + "language_loss": 0.66789758, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68919635, + "num_input_tokens_seen": 307104590, + "step": 14240, + "time_per_iteration": 2.5599334239959717 + }, + { + "auxiliary_loss_clip": 0.0105582, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.03044593, + "balance_loss_mlp": 1.02390432, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 1.573142513971892, + "language_loss": 0.62091911, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64184576, + "num_input_tokens_seen": 307125580, + "step": 14241, + "time_per_iteration": 2.7317183017730713 + }, + { + "auxiliary_loss_clip": 0.01100731, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03402793, + "balance_loss_mlp": 1.02153265, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 2.1734834109117678, + "language_loss": 0.74392658, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76526803, + "num_input_tokens_seen": 307147625, + "step": 14242, + "time_per_iteration": 2.6116690635681152 + }, + { + "auxiliary_loss_clip": 0.01030032, + "auxiliary_loss_mlp": 0.01045191, + "balance_loss_clip": 1.03420901, + "balance_loss_mlp": 1.03270435, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 2.005198895345937, + "language_loss": 0.76555407, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78630632, + "num_input_tokens_seen": 307164665, + "step": 14243, + "time_per_iteration": 2.7862420082092285 + }, + { + "auxiliary_loss_clip": 0.01075841, + "auxiliary_loss_mlp": 0.00749246, + "balance_loss_clip": 1.03365433, + "balance_loss_mlp": 1.00029874, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 2.0040467090031155, + "language_loss": 0.68267012, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70092094, + "num_input_tokens_seen": 307182530, + "step": 14244, + "time_per_iteration": 2.7352447509765625 + }, + { + "auxiliary_loss_clip": 0.01014468, + "auxiliary_loss_mlp": 0.01001023, + "balance_loss_clip": 1.00467789, + "balance_loss_mlp": 1.0001111, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7918262073635159, + "language_loss": 0.58440834, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60456324, + "num_input_tokens_seen": 307241240, + "step": 14245, + "time_per_iteration": 3.003005266189575 + }, + { + "auxiliary_loss_clip": 0.0108987, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.03393507, + "balance_loss_mlp": 1.01507044, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.714652748478323, + "language_loss": 0.77585053, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79701704, + "num_input_tokens_seen": 307261485, + "step": 14246, + "time_per_iteration": 2.54364275932312 + }, + { + "auxiliary_loss_clip": 0.01067678, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.0280571, + "balance_loss_mlp": 1.0165633, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 2.0771465363351216, + "language_loss": 0.81396651, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83491862, + "num_input_tokens_seen": 307279160, + "step": 14247, + "time_per_iteration": 2.570526123046875 + }, + { + "auxiliary_loss_clip": 0.01074714, + "auxiliary_loss_mlp": 0.0102701, + "balance_loss_clip": 1.03215075, + "balance_loss_mlp": 1.01575649, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 1.8669017798631855, + "language_loss": 0.77488399, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79590124, + "num_input_tokens_seen": 307297920, + "step": 14248, + "time_per_iteration": 2.571366310119629 + }, + { + "auxiliary_loss_clip": 0.01043148, + "auxiliary_loss_mlp": 0.01039875, + "balance_loss_clip": 1.02721119, + "balance_loss_mlp": 1.0268575, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 1.8692349506129453, + "language_loss": 0.77952671, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.80035686, + "num_input_tokens_seen": 307318320, + "step": 14249, + "time_per_iteration": 2.712498188018799 + }, + { + "auxiliary_loss_clip": 0.01069528, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.03077745, + "balance_loss_mlp": 1.02194273, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 1.8098272582037724, + "language_loss": 0.7815783, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80260247, + "num_input_tokens_seen": 307336720, + "step": 14250, + "time_per_iteration": 2.573173761367798 + }, + { + "auxiliary_loss_clip": 0.01073209, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.03291321, + "balance_loss_mlp": 1.01636052, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 1.7298743897290287, + "language_loss": 0.79591924, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81691456, + "num_input_tokens_seen": 307354120, + "step": 14251, + "time_per_iteration": 2.5822744369506836 + }, + { + "auxiliary_loss_clip": 0.01060429, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.0325141, + "balance_loss_mlp": 1.01837111, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 2.219147468797695, + "language_loss": 0.61382186, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63471234, + "num_input_tokens_seen": 307373165, + "step": 14252, + "time_per_iteration": 4.07417893409729 + }, + { + "auxiliary_loss_clip": 0.01076786, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.03744602, + "balance_loss_mlp": 1.01855242, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.7017155846731618, + "language_loss": 0.6971674, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.71823847, + "num_input_tokens_seen": 307391000, + "step": 14253, + "time_per_iteration": 2.576870918273926 + }, + { + "auxiliary_loss_clip": 0.01006685, + "auxiliary_loss_mlp": 0.01004704, + "balance_loss_clip": 1.00649107, + "balance_loss_mlp": 1.0037322, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7932457774483043, + "language_loss": 0.59206748, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61218137, + "num_input_tokens_seen": 307452865, + "step": 14254, + "time_per_iteration": 3.1857376098632812 + }, + { + "auxiliary_loss_clip": 0.01071979, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.03082168, + "balance_loss_mlp": 1.01913249, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 3.155680795368223, + "language_loss": 0.81147933, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83250785, + "num_input_tokens_seen": 307471940, + "step": 14255, + "time_per_iteration": 2.649254560470581 + }, + { + "auxiliary_loss_clip": 0.01096239, + "auxiliary_loss_mlp": 0.01026106, + "balance_loss_clip": 1.03434289, + "balance_loss_mlp": 1.01507926, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 5.99137117283694, + "language_loss": 0.67513049, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69635391, + "num_input_tokens_seen": 307488745, + "step": 14256, + "time_per_iteration": 2.609426498413086 + }, + { + "auxiliary_loss_clip": 0.01087428, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.03422165, + "balance_loss_mlp": 1.01852083, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.8820362173674539, + "language_loss": 0.69802231, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.71919239, + "num_input_tokens_seen": 307506855, + "step": 14257, + "time_per_iteration": 2.605215311050415 + }, + { + "auxiliary_loss_clip": 0.01057763, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.02824295, + "balance_loss_mlp": 1.02045751, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 3.7955713006531777, + "language_loss": 0.76693612, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.78782988, + "num_input_tokens_seen": 307526115, + "step": 14258, + "time_per_iteration": 2.7128982543945312 + }, + { + "auxiliary_loss_clip": 0.01083965, + "auxiliary_loss_mlp": 0.00749126, + "balance_loss_clip": 1.03262401, + "balance_loss_mlp": 1.00027895, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.8323627035264567, + "language_loss": 0.6781522, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69648308, + "num_input_tokens_seen": 307545230, + "step": 14259, + "time_per_iteration": 2.5763516426086426 + }, + { + "auxiliary_loss_clip": 0.01082449, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.03054237, + "balance_loss_mlp": 1.01993251, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.6413689824033932, + "language_loss": 0.771272, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79240841, + "num_input_tokens_seen": 307564900, + "step": 14260, + "time_per_iteration": 2.5565083026885986 + }, + { + "auxiliary_loss_clip": 0.01070089, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.03107142, + "balance_loss_mlp": 1.01616383, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 1.8525365079453444, + "language_loss": 0.73959196, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76056862, + "num_input_tokens_seen": 307583500, + "step": 14261, + "time_per_iteration": 2.5775671005249023 + }, + { + "auxiliary_loss_clip": 0.01083571, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.03443146, + "balance_loss_mlp": 1.01985621, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.6643731420438108, + "language_loss": 0.78952855, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81068307, + "num_input_tokens_seen": 307601430, + "step": 14262, + "time_per_iteration": 2.486337423324585 + }, + { + "auxiliary_loss_clip": 0.01061005, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.0322355, + "balance_loss_mlp": 1.0210948, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.659010661135948, + "language_loss": 0.67567992, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.69660175, + "num_input_tokens_seen": 307621495, + "step": 14263, + "time_per_iteration": 2.6151678562164307 + }, + { + "auxiliary_loss_clip": 0.0106155, + "auxiliary_loss_mlp": 0.00749413, + "balance_loss_clip": 1.03174758, + "balance_loss_mlp": 1.00030947, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.562987540554219, + "language_loss": 0.79528427, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81339395, + "num_input_tokens_seen": 307640840, + "step": 14264, + "time_per_iteration": 4.244789123535156 + }, + { + "auxiliary_loss_clip": 0.01058326, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.03112137, + "balance_loss_mlp": 1.02024829, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.2684053346293296, + "language_loss": 0.69632232, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.7172299, + "num_input_tokens_seen": 307663820, + "step": 14265, + "time_per_iteration": 2.712648391723633 + }, + { + "auxiliary_loss_clip": 0.01093034, + "auxiliary_loss_mlp": 0.01022651, + "balance_loss_clip": 1.03278804, + "balance_loss_mlp": 1.01254201, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.9296521430967088, + "language_loss": 0.66223979, + "learning_rate": 2.085464646918027e-07, + "loss": 0.6833967, + "num_input_tokens_seen": 307682385, + "step": 14266, + "time_per_iteration": 2.5392343997955322 + }, + { + "auxiliary_loss_clip": 0.01076907, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.03525102, + "balance_loss_mlp": 1.01809883, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 2.030330466030133, + "language_loss": 0.75299579, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77405131, + "num_input_tokens_seen": 307704680, + "step": 14267, + "time_per_iteration": 2.6688108444213867 + }, + { + "auxiliary_loss_clip": 0.01082833, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.03242207, + "balance_loss_mlp": 1.01898849, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 2.9938079012122647, + "language_loss": 0.87661874, + "learning_rate": 2.082002873852946e-07, + "loss": 0.89773816, + "num_input_tokens_seen": 307723245, + "step": 14268, + "time_per_iteration": 4.025266170501709 + }, + { + "auxiliary_loss_clip": 0.01087945, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.03403354, + "balance_loss_mlp": 1.01864696, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 1.8572494705034681, + "language_loss": 0.72996581, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.75113964, + "num_input_tokens_seen": 307742510, + "step": 14269, + "time_per_iteration": 2.5533287525177 + }, + { + "auxiliary_loss_clip": 0.01087119, + "auxiliary_loss_mlp": 0.01027385, + "balance_loss_clip": 1.03293967, + "balance_loss_mlp": 1.01645315, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.609456898344024, + "language_loss": 0.66501498, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68615997, + "num_input_tokens_seen": 307766030, + "step": 14270, + "time_per_iteration": 2.7309629917144775 + }, + { + "auxiliary_loss_clip": 0.01065595, + "auxiliary_loss_mlp": 0.01025112, + "balance_loss_clip": 1.02871609, + "balance_loss_mlp": 1.01450813, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.6060972826795574, + "language_loss": 0.73638058, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.75728768, + "num_input_tokens_seen": 307785800, + "step": 14271, + "time_per_iteration": 2.6104350090026855 + }, + { + "auxiliary_loss_clip": 0.00985023, + "auxiliary_loss_mlp": 0.00746677, + "balance_loss_clip": 1.00552869, + "balance_loss_mlp": 0.99973696, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.7961640103211272, + "language_loss": 0.59365851, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61097544, + "num_input_tokens_seen": 307850995, + "step": 14272, + "time_per_iteration": 3.269695520401001 + }, + { + "auxiliary_loss_clip": 0.01071514, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.03133178, + "balance_loss_mlp": 1.02085567, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.8813360241016293, + "language_loss": 0.75185275, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77290231, + "num_input_tokens_seen": 307868585, + "step": 14273, + "time_per_iteration": 2.5702011585235596 + }, + { + "auxiliary_loss_clip": 0.01086671, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.0337038, + "balance_loss_mlp": 1.01915026, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 2.8535671349770952, + "language_loss": 0.82429826, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84546524, + "num_input_tokens_seen": 307886820, + "step": 14274, + "time_per_iteration": 2.5049309730529785 + }, + { + "auxiliary_loss_clip": 0.01012631, + "auxiliary_loss_mlp": 0.00999406, + "balance_loss_clip": 1.00285625, + "balance_loss_mlp": 0.99848789, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.8595191692362238, + "language_loss": 0.60852575, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62864602, + "num_input_tokens_seen": 307944020, + "step": 14275, + "time_per_iteration": 3.1233620643615723 + }, + { + "auxiliary_loss_clip": 0.01082288, + "auxiliary_loss_mlp": 0.01025678, + "balance_loss_clip": 1.03350782, + "balance_loss_mlp": 1.01373911, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 2.1698320468596703, + "language_loss": 0.59176457, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61284429, + "num_input_tokens_seen": 307961055, + "step": 14276, + "time_per_iteration": 2.586793899536133 + }, + { + "auxiliary_loss_clip": 0.01074861, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.03206491, + "balance_loss_mlp": 1.01752198, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 1.8258210636228527, + "language_loss": 0.7609911, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78202176, + "num_input_tokens_seen": 307978690, + "step": 14277, + "time_per_iteration": 2.5671846866607666 + }, + { + "auxiliary_loss_clip": 0.01073427, + "auxiliary_loss_mlp": 0.01027704, + "balance_loss_clip": 1.03088593, + "balance_loss_mlp": 1.01630783, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.5784674454685648, + "language_loss": 0.83643669, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.85744798, + "num_input_tokens_seen": 307995870, + "step": 14278, + "time_per_iteration": 2.6033272743225098 + }, + { + "auxiliary_loss_clip": 0.01076125, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.03394842, + "balance_loss_mlp": 1.01537609, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 1.9845534886851068, + "language_loss": 0.74352086, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76455164, + "num_input_tokens_seen": 308013645, + "step": 14279, + "time_per_iteration": 4.132092714309692 + }, + { + "auxiliary_loss_clip": 0.0109734, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.0350498, + "balance_loss_mlp": 1.02082515, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 2.3400088944504662, + "language_loss": 0.66267556, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68396455, + "num_input_tokens_seen": 308032490, + "step": 14280, + "time_per_iteration": 2.5300087928771973 + }, + { + "auxiliary_loss_clip": 0.010818, + "auxiliary_loss_mlp": 0.01025271, + "balance_loss_clip": 1.03118396, + "balance_loss_mlp": 1.01548433, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 1.870422610695263, + "language_loss": 0.62128592, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64235663, + "num_input_tokens_seen": 308052110, + "step": 14281, + "time_per_iteration": 2.514460802078247 + }, + { + "auxiliary_loss_clip": 0.01073475, + "auxiliary_loss_mlp": 0.00749435, + "balance_loss_clip": 1.03313971, + "balance_loss_mlp": 1.00026131, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.677635991385095, + "language_loss": 0.73505211, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75328112, + "num_input_tokens_seen": 308070660, + "step": 14282, + "time_per_iteration": 2.52891206741333 + }, + { + "auxiliary_loss_clip": 0.01066098, + "auxiliary_loss_mlp": 0.01026448, + "balance_loss_clip": 1.02885902, + "balance_loss_mlp": 1.01636851, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 2.099261537092026, + "language_loss": 0.75643009, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77735555, + "num_input_tokens_seen": 308089520, + "step": 14283, + "time_per_iteration": 2.5514214038848877 + }, + { + "auxiliary_loss_clip": 0.01081009, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.03188312, + "balance_loss_mlp": 1.01987267, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 1.7888393526714563, + "language_loss": 0.60144186, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62256032, + "num_input_tokens_seen": 308111545, + "step": 14284, + "time_per_iteration": 2.6437132358551025 + }, + { + "auxiliary_loss_clip": 0.01070277, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.03292823, + "balance_loss_mlp": 1.01845169, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.8786674623787065, + "language_loss": 0.7520746, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77307129, + "num_input_tokens_seen": 308129690, + "step": 14285, + "time_per_iteration": 2.648883104324341 + }, + { + "auxiliary_loss_clip": 0.01085094, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.03678942, + "balance_loss_mlp": 1.02089202, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 3.201548525571349, + "language_loss": 0.74350178, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76467597, + "num_input_tokens_seen": 308147410, + "step": 14286, + "time_per_iteration": 2.5339627265930176 + }, + { + "auxiliary_loss_clip": 0.01004835, + "auxiliary_loss_mlp": 0.00746587, + "balance_loss_clip": 1.00538433, + "balance_loss_mlp": 0.99970609, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7869769466925854, + "language_loss": 0.49460861, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51212287, + "num_input_tokens_seen": 308204875, + "step": 14287, + "time_per_iteration": 3.0832021236419678 + }, + { + "auxiliary_loss_clip": 0.0108866, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.03489566, + "balance_loss_mlp": 1.01808238, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.8668363944264017, + "language_loss": 0.79147494, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81265104, + "num_input_tokens_seen": 308225690, + "step": 14288, + "time_per_iteration": 2.673257350921631 + }, + { + "auxiliary_loss_clip": 0.01059259, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.03385568, + "balance_loss_mlp": 1.0184083, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 2.287346687838328, + "language_loss": 0.80785286, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82874691, + "num_input_tokens_seen": 308245255, + "step": 14289, + "time_per_iteration": 2.6325936317443848 + }, + { + "auxiliary_loss_clip": 0.01087253, + "auxiliary_loss_mlp": 0.01029625, + "balance_loss_clip": 1.03463435, + "balance_loss_mlp": 1.01896763, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 1.6941534670386158, + "language_loss": 0.65126514, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67243397, + "num_input_tokens_seen": 308261755, + "step": 14290, + "time_per_iteration": 2.5701358318328857 + }, + { + "auxiliary_loss_clip": 0.01076735, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.03295898, + "balance_loss_mlp": 1.01774609, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 2.2540821850707045, + "language_loss": 0.5525952, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.5736568, + "num_input_tokens_seen": 308285145, + "step": 14291, + "time_per_iteration": 2.6165504455566406 + }, + { + "auxiliary_loss_clip": 0.01089485, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.03420675, + "balance_loss_mlp": 1.02069688, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 2.6825723713694876, + "language_loss": 0.71307528, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73428488, + "num_input_tokens_seen": 308304130, + "step": 14292, + "time_per_iteration": 3.971632480621338 + }, + { + "auxiliary_loss_clip": 0.01085751, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.03208232, + "balance_loss_mlp": 1.01801825, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.5204470818819813, + "language_loss": 0.71280771, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73395509, + "num_input_tokens_seen": 308324670, + "step": 14293, + "time_per_iteration": 2.59298038482666 + }, + { + "auxiliary_loss_clip": 0.01068548, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.03287768, + "balance_loss_mlp": 1.02055836, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.5164250212834927, + "language_loss": 0.68671179, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70770943, + "num_input_tokens_seen": 308344215, + "step": 14294, + "time_per_iteration": 2.690255880355835 + }, + { + "auxiliary_loss_clip": 0.01094056, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.03216326, + "balance_loss_mlp": 1.02027071, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 1.8925586480854528, + "language_loss": 0.77895135, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80019599, + "num_input_tokens_seen": 308360520, + "step": 14295, + "time_per_iteration": 2.504138469696045 + }, + { + "auxiliary_loss_clip": 0.01072046, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.03237963, + "balance_loss_mlp": 1.02215672, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 8.791192474178418, + "language_loss": 0.69067883, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71175027, + "num_input_tokens_seen": 308376865, + "step": 14296, + "time_per_iteration": 2.63507342338562 + }, + { + "auxiliary_loss_clip": 0.01067119, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.03091645, + "balance_loss_mlp": 1.01747084, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 2.167618413800849, + "language_loss": 0.79090047, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81186175, + "num_input_tokens_seen": 308395870, + "step": 14297, + "time_per_iteration": 2.643846273422241 + }, + { + "auxiliary_loss_clip": 0.01079134, + "auxiliary_loss_mlp": 0.01028366, + "balance_loss_clip": 1.03005171, + "balance_loss_mlp": 1.01869774, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.6017328400812894, + "language_loss": 0.68067169, + "learning_rate": 2.030402708016954e-07, + "loss": 0.7017467, + "num_input_tokens_seen": 308417250, + "step": 14298, + "time_per_iteration": 2.619216203689575 + }, + { + "auxiliary_loss_clip": 0.01068127, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.03034818, + "balance_loss_mlp": 1.01936865, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 1.8937768396789827, + "language_loss": 0.6853624, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70634413, + "num_input_tokens_seen": 308434565, + "step": 14299, + "time_per_iteration": 2.553718328475952 + }, + { + "auxiliary_loss_clip": 0.01067813, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.03349614, + "balance_loss_mlp": 1.02279425, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 6.0958884293868305, + "language_loss": 0.71345937, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73447466, + "num_input_tokens_seen": 308450040, + "step": 14300, + "time_per_iteration": 2.6928231716156006 + }, + { + "auxiliary_loss_clip": 0.01067302, + "auxiliary_loss_mlp": 0.01038263, + "balance_loss_clip": 1.02908552, + "balance_loss_mlp": 1.02620482, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.5901341590120446, + "language_loss": 0.69483042, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71588612, + "num_input_tokens_seen": 308470545, + "step": 14301, + "time_per_iteration": 2.738110065460205 + }, + { + "auxiliary_loss_clip": 0.01043289, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.03125799, + "balance_loss_mlp": 1.01455915, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 3.5923771078995346, + "language_loss": 0.74367905, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76436931, + "num_input_tokens_seen": 308490020, + "step": 14302, + "time_per_iteration": 2.69653582572937 + }, + { + "auxiliary_loss_clip": 0.01080365, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.03089595, + "balance_loss_mlp": 1.02045047, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 2.077263937210641, + "language_loss": 0.83351779, + "learning_rate": 2.02186225623733e-07, + "loss": 0.85462755, + "num_input_tokens_seen": 308509065, + "step": 14303, + "time_per_iteration": 2.670919895172119 + }, + { + "auxiliary_loss_clip": 0.01082264, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.03104556, + "balance_loss_mlp": 1.02135646, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.1010004402462026, + "language_loss": 0.77134776, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.7925036, + "num_input_tokens_seen": 308524725, + "step": 14304, + "time_per_iteration": 4.104997396469116 + }, + { + "auxiliary_loss_clip": 0.01098355, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.03416049, + "balance_loss_mlp": 1.01881623, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 2.302157116042648, + "language_loss": 0.54091501, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.56220913, + "num_input_tokens_seen": 308543525, + "step": 14305, + "time_per_iteration": 2.4878993034362793 + }, + { + "auxiliary_loss_clip": 0.01097069, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.03437781, + "balance_loss_mlp": 1.01577544, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 2.20219610376634, + "language_loss": 0.8348251, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.85606992, + "num_input_tokens_seen": 308557995, + "step": 14306, + "time_per_iteration": 2.5265133380889893 + }, + { + "auxiliary_loss_clip": 0.01084102, + "auxiliary_loss_mlp": 0.00749305, + "balance_loss_clip": 1.03298998, + "balance_loss_mlp": 1.00025988, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.373062395131079, + "language_loss": 0.71780086, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73613495, + "num_input_tokens_seen": 308582750, + "step": 14307, + "time_per_iteration": 2.6198678016662598 + }, + { + "auxiliary_loss_clip": 0.01084122, + "auxiliary_loss_mlp": 0.00749641, + "balance_loss_clip": 1.03134215, + "balance_loss_mlp": 1.00025928, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 5.650832249986019, + "language_loss": 0.63748503, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65582263, + "num_input_tokens_seen": 308603770, + "step": 14308, + "time_per_iteration": 2.612736940383911 + }, + { + "auxiliary_loss_clip": 0.01003765, + "auxiliary_loss_mlp": 0.01008229, + "balance_loss_clip": 1.00344157, + "balance_loss_mlp": 1.00725162, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.628411872828641, + "language_loss": 0.48476645, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50488639, + "num_input_tokens_seen": 308667735, + "step": 14309, + "time_per_iteration": 4.759142637252808 + }, + { + "auxiliary_loss_clip": 0.01045493, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.0327388, + "balance_loss_mlp": 1.02130628, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.8056970063598918, + "language_loss": 0.67081898, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69160867, + "num_input_tokens_seen": 308686300, + "step": 14310, + "time_per_iteration": 2.673923969268799 + }, + { + "auxiliary_loss_clip": 0.01022943, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.02567172, + "balance_loss_mlp": 1.01817012, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.6958791593339406, + "language_loss": 0.7871573, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80767834, + "num_input_tokens_seen": 308705825, + "step": 14311, + "time_per_iteration": 2.704190731048584 + }, + { + "auxiliary_loss_clip": 0.0108455, + "auxiliary_loss_mlp": 0.01027687, + "balance_loss_clip": 1.03264999, + "balance_loss_mlp": 1.01732183, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.00493222775623, + "language_loss": 0.71716154, + "learning_rate": 2.006532397626639e-07, + "loss": 0.73828387, + "num_input_tokens_seen": 308723340, + "step": 14312, + "time_per_iteration": 2.600811243057251 + }, + { + "auxiliary_loss_clip": 0.01065069, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.02937222, + "balance_loss_mlp": 1.01833296, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 2.2097104715649847, + "language_loss": 0.7795127, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80045676, + "num_input_tokens_seen": 308741280, + "step": 14313, + "time_per_iteration": 2.5781819820404053 + }, + { + "auxiliary_loss_clip": 0.01068964, + "auxiliary_loss_mlp": 0.01028222, + "balance_loss_clip": 1.03086948, + "balance_loss_mlp": 1.01652145, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.4585758197116436, + "language_loss": 0.72923815, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75021005, + "num_input_tokens_seen": 308762875, + "step": 14314, + "time_per_iteration": 2.691164255142212 + }, + { + "auxiliary_loss_clip": 0.01071158, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.03013337, + "balance_loss_mlp": 1.01890612, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.8675339393166586, + "language_loss": 0.69112384, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71212804, + "num_input_tokens_seen": 308780315, + "step": 14315, + "time_per_iteration": 2.8003110885620117 + }, + { + "auxiliary_loss_clip": 0.01085062, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.03379095, + "balance_loss_mlp": 1.02076602, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.8361073215120938, + "language_loss": 0.71628058, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.73744267, + "num_input_tokens_seen": 308799435, + "step": 14316, + "time_per_iteration": 2.6455318927764893 + }, + { + "auxiliary_loss_clip": 0.01079209, + "auxiliary_loss_mlp": 0.0102631, + "balance_loss_clip": 1.03585339, + "balance_loss_mlp": 1.01527691, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 2.0379030015217894, + "language_loss": 0.82786787, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.84892303, + "num_input_tokens_seen": 308817730, + "step": 14317, + "time_per_iteration": 2.6308274269104004 + }, + { + "auxiliary_loss_clip": 0.01077705, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.01750636, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.7298151949376888, + "language_loss": 0.66918808, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69025213, + "num_input_tokens_seen": 308841735, + "step": 14318, + "time_per_iteration": 4.317098379135132 + }, + { + "auxiliary_loss_clip": 0.01082525, + "auxiliary_loss_mlp": 0.01025374, + "balance_loss_clip": 1.03202593, + "balance_loss_mlp": 1.01539004, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.4453148956976172, + "language_loss": 0.71363991, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73471892, + "num_input_tokens_seen": 308865050, + "step": 14319, + "time_per_iteration": 2.714556932449341 + }, + { + "auxiliary_loss_clip": 0.01076199, + "auxiliary_loss_mlp": 0.00749325, + "balance_loss_clip": 1.03356957, + "balance_loss_mlp": 1.00020504, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 1.6309792075234968, + "language_loss": 0.67160153, + "learning_rate": 1.992952252525839e-07, + "loss": 0.68985677, + "num_input_tokens_seen": 308885375, + "step": 14320, + "time_per_iteration": 2.6016619205474854 + }, + { + "auxiliary_loss_clip": 0.01067887, + "auxiliary_loss_mlp": 0.01038908, + "balance_loss_clip": 1.03109527, + "balance_loss_mlp": 1.02611637, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 2.276498456786173, + "language_loss": 0.80291086, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82397878, + "num_input_tokens_seen": 308904700, + "step": 14321, + "time_per_iteration": 2.5529356002807617 + }, + { + "auxiliary_loss_clip": 0.01079694, + "auxiliary_loss_mlp": 0.00749353, + "balance_loss_clip": 1.03151178, + "balance_loss_mlp": 1.00014877, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 2.1379305235744233, + "language_loss": 0.71258235, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.73087275, + "num_input_tokens_seen": 308922985, + "step": 14322, + "time_per_iteration": 2.498976707458496 + }, + { + "auxiliary_loss_clip": 0.01074511, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.03184366, + "balance_loss_mlp": 1.02256131, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 2.1304355279869327, + "language_loss": 0.56411636, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58520794, + "num_input_tokens_seen": 308940765, + "step": 14323, + "time_per_iteration": 2.623185873031616 + }, + { + "auxiliary_loss_clip": 0.01060886, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.03106737, + "balance_loss_mlp": 1.01542461, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.8291388617015945, + "language_loss": 0.76119745, + "learning_rate": 1.986178565813801e-07, + "loss": 0.78206617, + "num_input_tokens_seen": 308960110, + "step": 14324, + "time_per_iteration": 2.6188981533050537 + }, + { + "auxiliary_loss_clip": 0.01048417, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.03079391, + "balance_loss_mlp": 1.02142048, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 2.4609489786448964, + "language_loss": 0.665034, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68586868, + "num_input_tokens_seen": 308976665, + "step": 14325, + "time_per_iteration": 2.6359004974365234 + }, + { + "auxiliary_loss_clip": 0.01087457, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.03382576, + "balance_loss_mlp": 1.01851821, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.6088228032767897, + "language_loss": 0.64727521, + "learning_rate": 1.982795820716472e-07, + "loss": 0.66844893, + "num_input_tokens_seen": 308997015, + "step": 14326, + "time_per_iteration": 2.552359104156494 + }, + { + "auxiliary_loss_clip": 0.01070331, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.02998555, + "balance_loss_mlp": 1.01826882, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 2.235463341633653, + "language_loss": 0.84565943, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86666018, + "num_input_tokens_seen": 309015250, + "step": 14327, + "time_per_iteration": 2.6090426445007324 + }, + { + "auxiliary_loss_clip": 0.01085593, + "auxiliary_loss_mlp": 0.01027579, + "balance_loss_clip": 1.03190064, + "balance_loss_mlp": 1.01670671, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 1.8337917677771032, + "language_loss": 0.75014585, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77127761, + "num_input_tokens_seen": 309034140, + "step": 14328, + "time_per_iteration": 2.570934295654297 + }, + { + "auxiliary_loss_clip": 0.01086462, + "auxiliary_loss_mlp": 0.01025341, + "balance_loss_clip": 1.03340316, + "balance_loss_mlp": 1.0148865, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 1.908933963155342, + "language_loss": 0.80037808, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82149613, + "num_input_tokens_seen": 309055075, + "step": 14329, + "time_per_iteration": 2.6693477630615234 + }, + { + "auxiliary_loss_clip": 0.01070957, + "auxiliary_loss_mlp": 0.01025822, + "balance_loss_clip": 1.03367662, + "balance_loss_mlp": 1.01486063, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 1.802756311014284, + "language_loss": 0.77061236, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.7915802, + "num_input_tokens_seen": 309074650, + "step": 14330, + "time_per_iteration": 2.6446032524108887 + }, + { + "auxiliary_loss_clip": 0.01086175, + "auxiliary_loss_mlp": 0.01026645, + "balance_loss_clip": 1.03320599, + "balance_loss_mlp": 1.01603591, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 1.7818208926142063, + "language_loss": 0.65324295, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67437112, + "num_input_tokens_seen": 309094385, + "step": 14331, + "time_per_iteration": 2.62322735786438 + }, + { + "auxiliary_loss_clip": 0.01073108, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.03380501, + "balance_loss_mlp": 1.02433848, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.5894591309823949, + "language_loss": 0.7609778, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.78205502, + "num_input_tokens_seen": 309111815, + "step": 14332, + "time_per_iteration": 4.106729030609131 + }, + { + "auxiliary_loss_clip": 0.01085847, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.0333302, + "balance_loss_mlp": 1.01708841, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 1.9348331620390486, + "language_loss": 0.67076325, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69191086, + "num_input_tokens_seen": 309131385, + "step": 14333, + "time_per_iteration": 2.5625555515289307 + }, + { + "auxiliary_loss_clip": 0.01072091, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.03356564, + "balance_loss_mlp": 1.01824856, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 2.2945749380392337, + "language_loss": 0.62153649, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64256978, + "num_input_tokens_seen": 309155020, + "step": 14334, + "time_per_iteration": 2.748305559158325 + }, + { + "auxiliary_loss_clip": 0.01063218, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.03325021, + "balance_loss_mlp": 1.03405833, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 2.1667970260522615, + "language_loss": 0.69465399, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71575463, + "num_input_tokens_seen": 309172865, + "step": 14335, + "time_per_iteration": 2.7620677947998047 + }, + { + "auxiliary_loss_clip": 0.01090203, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.0355835, + "balance_loss_mlp": 1.02193022, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.4089477433280555, + "language_loss": 0.82948703, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85072124, + "num_input_tokens_seen": 309193575, + "step": 14336, + "time_per_iteration": 2.619135856628418 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.03353179, + "balance_loss_mlp": 1.0173502, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.721794604864623, + "language_loss": 0.67487222, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69615459, + "num_input_tokens_seen": 309212680, + "step": 14337, + "time_per_iteration": 2.558577060699463 + }, + { + "auxiliary_loss_clip": 0.01053247, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02754068, + "balance_loss_mlp": 1.02231789, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.4837997409381718, + "language_loss": 0.6689477, + "learning_rate": 1.962556758053089e-07, + "loss": 0.68982017, + "num_input_tokens_seen": 309234485, + "step": 14338, + "time_per_iteration": 2.783517360687256 + }, + { + "auxiliary_loss_clip": 0.01077227, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.03491604, + "balance_loss_mlp": 1.01976228, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 2.091671843776991, + "language_loss": 0.61976957, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64084208, + "num_input_tokens_seen": 309253630, + "step": 14339, + "time_per_iteration": 2.57317852973938 + }, + { + "auxiliary_loss_clip": 0.01075154, + "auxiliary_loss_mlp": 0.00749309, + "balance_loss_clip": 1.03224313, + "balance_loss_mlp": 1.00022161, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 2.112163692015945, + "language_loss": 0.62680304, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64504772, + "num_input_tokens_seen": 309270950, + "step": 14340, + "time_per_iteration": 2.606527090072632 + }, + { + "auxiliary_loss_clip": 0.01047738, + "auxiliary_loss_mlp": 0.01023668, + "balance_loss_clip": 1.03132391, + "balance_loss_mlp": 1.0132376, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 2.0557062671627024, + "language_loss": 0.80238366, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82309771, + "num_input_tokens_seen": 309288780, + "step": 14341, + "time_per_iteration": 2.6543126106262207 + }, + { + "auxiliary_loss_clip": 0.01082166, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.03362083, + "balance_loss_mlp": 1.01817787, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 15.271909167052891, + "language_loss": 0.74730766, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76841474, + "num_input_tokens_seen": 309310875, + "step": 14342, + "time_per_iteration": 2.714226722717285 + }, + { + "auxiliary_loss_clip": 0.01057989, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.03186035, + "balance_loss_mlp": 1.01553965, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.6310772560009181, + "language_loss": 0.68475866, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70560807, + "num_input_tokens_seen": 309329900, + "step": 14343, + "time_per_iteration": 2.715029239654541 + }, + { + "auxiliary_loss_clip": 0.01084995, + "auxiliary_loss_mlp": 0.01037566, + "balance_loss_clip": 1.03136468, + "balance_loss_mlp": 1.02584815, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 2.2978675153094676, + "language_loss": 0.67854762, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.69977319, + "num_input_tokens_seen": 309347870, + "step": 14344, + "time_per_iteration": 4.036685943603516 + }, + { + "auxiliary_loss_clip": 0.01050325, + "auxiliary_loss_mlp": 0.01039346, + "balance_loss_clip": 1.0292474, + "balance_loss_mlp": 1.02711558, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.453610059273066, + "language_loss": 0.81574678, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83664352, + "num_input_tokens_seen": 309371695, + "step": 14345, + "time_per_iteration": 2.6704394817352295 + }, + { + "auxiliary_loss_clip": 0.01092307, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.03647184, + "balance_loss_mlp": 1.0196048, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.9990550945062813, + "language_loss": 0.50636137, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52759635, + "num_input_tokens_seen": 309394645, + "step": 14346, + "time_per_iteration": 2.709944725036621 + }, + { + "auxiliary_loss_clip": 0.01012685, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.02798808, + "balance_loss_mlp": 1.01581633, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.4317213646181461, + "language_loss": 0.75287777, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.773283, + "num_input_tokens_seen": 309413170, + "step": 14347, + "time_per_iteration": 2.8060314655303955 + }, + { + "auxiliary_loss_clip": 0.01069572, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.03327441, + "balance_loss_mlp": 1.01768374, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 2.129893730852781, + "language_loss": 0.80558503, + "learning_rate": 1.945766105774449e-07, + "loss": 0.82657456, + "num_input_tokens_seen": 309431315, + "step": 14348, + "time_per_iteration": 2.595654249191284 + }, + { + "auxiliary_loss_clip": 0.01080842, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.03211308, + "balance_loss_mlp": 1.01684606, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.9612493541750844, + "language_loss": 0.65972584, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68080485, + "num_input_tokens_seen": 309453020, + "step": 14349, + "time_per_iteration": 4.3705432415008545 + }, + { + "auxiliary_loss_clip": 0.01085507, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.03251004, + "balance_loss_mlp": 1.02317381, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.57417809587802, + "language_loss": 0.69873476, + "learning_rate": 1.942416188703573e-07, + "loss": 0.7199297, + "num_input_tokens_seen": 309469780, + "step": 14350, + "time_per_iteration": 2.5179738998413086 + }, + { + "auxiliary_loss_clip": 0.01065665, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.03003013, + "balance_loss_mlp": 1.01843762, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 2.5348397077097737, + "language_loss": 0.76732379, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.78827834, + "num_input_tokens_seen": 309489610, + "step": 14351, + "time_per_iteration": 2.6034750938415527 + }, + { + "auxiliary_loss_clip": 0.01085081, + "auxiliary_loss_mlp": 0.0102603, + "balance_loss_clip": 1.03249979, + "balance_loss_mlp": 1.01580143, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.9683470154589136, + "language_loss": 0.84607506, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.86718619, + "num_input_tokens_seen": 309508295, + "step": 14352, + "time_per_iteration": 2.511943817138672 + }, + { + "auxiliary_loss_clip": 0.01015862, + "auxiliary_loss_mlp": 0.01001407, + "balance_loss_clip": 1.00572157, + "balance_loss_mlp": 1.00049496, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7888677330577795, + "language_loss": 0.61920387, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63937658, + "num_input_tokens_seen": 309567960, + "step": 14353, + "time_per_iteration": 3.1430654525756836 + }, + { + "auxiliary_loss_clip": 0.01095924, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.03452158, + "balance_loss_mlp": 1.01547003, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.630293052216747, + "language_loss": 0.81795889, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.83917254, + "num_input_tokens_seen": 309586050, + "step": 14354, + "time_per_iteration": 2.5475802421569824 + }, + { + "auxiliary_loss_clip": 0.01074627, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.03106475, + "balance_loss_mlp": 1.01445222, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 1.9380604777433248, + "language_loss": 0.85697353, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87797534, + "num_input_tokens_seen": 309602910, + "step": 14355, + "time_per_iteration": 2.6626269817352295 + }, + { + "auxiliary_loss_clip": 0.01058467, + "auxiliary_loss_mlp": 0.01030211, + "balance_loss_clip": 1.03123474, + "balance_loss_mlp": 1.01854658, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 5.216420321104383, + "language_loss": 0.58782399, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.60871077, + "num_input_tokens_seen": 309621175, + "step": 14356, + "time_per_iteration": 2.6114513874053955 + }, + { + "auxiliary_loss_clip": 0.01052837, + "auxiliary_loss_mlp": 0.0102983, + "balance_loss_clip": 1.03128791, + "balance_loss_mlp": 1.01768899, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.7508827351472025, + "language_loss": 0.76851338, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.78934014, + "num_input_tokens_seen": 309639395, + "step": 14357, + "time_per_iteration": 2.7768356800079346 + }, + { + "auxiliary_loss_clip": 0.01086457, + "auxiliary_loss_mlp": 0.01027872, + "balance_loss_clip": 1.03334951, + "balance_loss_mlp": 1.01645756, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.0897746299618025, + "language_loss": 0.77541029, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79655361, + "num_input_tokens_seen": 309657265, + "step": 14358, + "time_per_iteration": 2.5320868492126465 + }, + { + "auxiliary_loss_clip": 0.01055448, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.02878046, + "balance_loss_mlp": 1.01924753, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.314724358595367, + "language_loss": 0.75116658, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77204204, + "num_input_tokens_seen": 309678610, + "step": 14359, + "time_per_iteration": 4.434295892715454 + }, + { + "auxiliary_loss_clip": 0.01029427, + "auxiliary_loss_mlp": 0.01025697, + "balance_loss_clip": 1.02650595, + "balance_loss_mlp": 1.01458025, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.7902593730479581, + "language_loss": 0.70713758, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72768885, + "num_input_tokens_seen": 309697710, + "step": 14360, + "time_per_iteration": 2.7525992393493652 + }, + { + "auxiliary_loss_clip": 0.01068698, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.03451645, + "balance_loss_mlp": 1.02219164, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.8783035911099588, + "language_loss": 0.76366878, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78469837, + "num_input_tokens_seen": 309715985, + "step": 14361, + "time_per_iteration": 2.647493362426758 + }, + { + "auxiliary_loss_clip": 0.01022639, + "auxiliary_loss_mlp": 0.01004749, + "balance_loss_clip": 1.00292635, + "balance_loss_mlp": 1.00377738, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9592899452396099, + "language_loss": 0.58806551, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60833943, + "num_input_tokens_seen": 309779930, + "step": 14362, + "time_per_iteration": 3.164846897125244 + }, + { + "auxiliary_loss_clip": 0.01034199, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.03463602, + "balance_loss_mlp": 1.01818967, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 2.3435451449088736, + "language_loss": 0.80435282, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82499981, + "num_input_tokens_seen": 309800580, + "step": 14363, + "time_per_iteration": 2.76674485206604 + }, + { + "auxiliary_loss_clip": 0.0106852, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.02976274, + "balance_loss_mlp": 1.02391529, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.321932089357069, + "language_loss": 0.72830188, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.74936092, + "num_input_tokens_seen": 309821725, + "step": 14364, + "time_per_iteration": 2.6180176734924316 + }, + { + "auxiliary_loss_clip": 0.01070339, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.02986538, + "balance_loss_mlp": 1.02336848, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.6681111768130208, + "language_loss": 0.71934259, + "learning_rate": 1.917379150731755e-07, + "loss": 0.74039984, + "num_input_tokens_seen": 309841565, + "step": 14365, + "time_per_iteration": 2.632803201675415 + }, + { + "auxiliary_loss_clip": 0.01071765, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.0321095, + "balance_loss_mlp": 1.02810454, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 3.005752961357951, + "language_loss": 0.71211821, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73325568, + "num_input_tokens_seen": 309858635, + "step": 14366, + "time_per_iteration": 2.588228464126587 + }, + { + "auxiliary_loss_clip": 0.01068765, + "auxiliary_loss_mlp": 0.01023553, + "balance_loss_clip": 1.03295171, + "balance_loss_mlp": 1.0133785, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 2.061886456266527, + "language_loss": 0.81550443, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.83642757, + "num_input_tokens_seen": 309877885, + "step": 14367, + "time_per_iteration": 2.6353018283843994 + }, + { + "auxiliary_loss_clip": 0.01073278, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.03305507, + "balance_loss_mlp": 1.01687944, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 2.010917237466721, + "language_loss": 0.61800975, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63902843, + "num_input_tokens_seen": 309893140, + "step": 14368, + "time_per_iteration": 2.680886745452881 + }, + { + "auxiliary_loss_clip": 0.01087533, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.03493476, + "balance_loss_mlp": 1.01839316, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 2.2621275766226434, + "language_loss": 0.76294917, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78411686, + "num_input_tokens_seen": 309914175, + "step": 14369, + "time_per_iteration": 2.629958391189575 + }, + { + "auxiliary_loss_clip": 0.01072418, + "auxiliary_loss_mlp": 0.01033471, + "balance_loss_clip": 1.03189957, + "balance_loss_mlp": 1.02148461, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 1.7331841194170372, + "language_loss": 0.643646, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66470492, + "num_input_tokens_seen": 309932395, + "step": 14370, + "time_per_iteration": 2.5843052864074707 + }, + { + "auxiliary_loss_clip": 0.01036264, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.0348798, + "balance_loss_mlp": 1.01825523, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.5733378597035002, + "language_loss": 0.66475248, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68541121, + "num_input_tokens_seen": 309951720, + "step": 14371, + "time_per_iteration": 2.808788299560547 + }, + { + "auxiliary_loss_clip": 0.01004993, + "auxiliary_loss_mlp": 0.01000311, + "balance_loss_clip": 1.00471413, + "balance_loss_mlp": 0.9993515, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8578029387829933, + "language_loss": 0.56925941, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58931243, + "num_input_tokens_seen": 310006120, + "step": 14372, + "time_per_iteration": 4.534762382507324 + }, + { + "auxiliary_loss_clip": 0.01096743, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.03568983, + "balance_loss_mlp": 1.0189116, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.7377690932319374, + "language_loss": 0.79306227, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81433237, + "num_input_tokens_seen": 310026740, + "step": 14373, + "time_per_iteration": 2.5800817012786865 + }, + { + "auxiliary_loss_clip": 0.01097614, + "auxiliary_loss_mlp": 0.01026699, + "balance_loss_clip": 1.0338757, + "balance_loss_mlp": 1.015517, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.8986495618637063, + "language_loss": 0.63213652, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65337962, + "num_input_tokens_seen": 310044135, + "step": 14374, + "time_per_iteration": 2.5256288051605225 + }, + { + "auxiliary_loss_clip": 0.01064739, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.03293943, + "balance_loss_mlp": 1.01949108, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.953520789517305, + "language_loss": 0.77417624, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.7951296, + "num_input_tokens_seen": 310061560, + "step": 14375, + "time_per_iteration": 2.534825563430786 + }, + { + "auxiliary_loss_clip": 0.01050186, + "auxiliary_loss_mlp": 0.00749173, + "balance_loss_clip": 1.02928054, + "balance_loss_mlp": 1.00016677, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.8149787500660919, + "language_loss": 0.60558534, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62357897, + "num_input_tokens_seen": 310087310, + "step": 14376, + "time_per_iteration": 3.0335264205932617 + }, + { + "auxiliary_loss_clip": 0.01050662, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.03006744, + "balance_loss_mlp": 1.02322876, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.433102437894859, + "language_loss": 0.66453075, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68537426, + "num_input_tokens_seen": 310106260, + "step": 14377, + "time_per_iteration": 2.62302303314209 + }, + { + "auxiliary_loss_clip": 0.01066458, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.02838552, + "balance_loss_mlp": 1.02272606, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.8958685662749002, + "language_loss": 0.70597172, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72698545, + "num_input_tokens_seen": 310125305, + "step": 14378, + "time_per_iteration": 2.668043851852417 + }, + { + "auxiliary_loss_clip": 0.01012048, + "auxiliary_loss_mlp": 0.01002616, + "balance_loss_clip": 1.00263453, + "balance_loss_mlp": 1.00160825, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8193073340142822, + "language_loss": 0.60270262, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62284929, + "num_input_tokens_seen": 310189270, + "step": 14379, + "time_per_iteration": 3.182321548461914 + }, + { + "auxiliary_loss_clip": 0.01063143, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.0302515, + "balance_loss_mlp": 1.01922894, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.629816062416499, + "language_loss": 0.74671823, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76764643, + "num_input_tokens_seen": 310208395, + "step": 14380, + "time_per_iteration": 2.6064112186431885 + }, + { + "auxiliary_loss_clip": 0.01062123, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.02901626, + "balance_loss_mlp": 1.01817608, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.303557739051425, + "language_loss": 0.75161147, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77252734, + "num_input_tokens_seen": 310227415, + "step": 14381, + "time_per_iteration": 2.5533788204193115 + }, + { + "auxiliary_loss_clip": 0.01074167, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.03415668, + "balance_loss_mlp": 1.01949382, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 4.673668771471862, + "language_loss": 0.84459472, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86563301, + "num_input_tokens_seen": 310242625, + "step": 14382, + "time_per_iteration": 2.6265416145324707 + }, + { + "auxiliary_loss_clip": 0.01088281, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.03347754, + "balance_loss_mlp": 1.02110398, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 1.730497409189879, + "language_loss": 0.75665092, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77785826, + "num_input_tokens_seen": 310260585, + "step": 14383, + "time_per_iteration": 2.585362195968628 + }, + { + "auxiliary_loss_clip": 0.0107751, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.03562737, + "balance_loss_mlp": 1.01785254, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.6659011183212848, + "language_loss": 0.85278797, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87385345, + "num_input_tokens_seen": 310277210, + "step": 14384, + "time_per_iteration": 4.228732347488403 + }, + { + "auxiliary_loss_clip": 0.01082087, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.03095365, + "balance_loss_mlp": 1.01751506, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 2.126200723973539, + "language_loss": 0.80809879, + "learning_rate": 1.884236463176072e-07, + "loss": 0.82919794, + "num_input_tokens_seen": 310296610, + "step": 14385, + "time_per_iteration": 2.563930034637451 + }, + { + "auxiliary_loss_clip": 0.01081801, + "auxiliary_loss_mlp": 0.01028813, + "balance_loss_clip": 1.03641295, + "balance_loss_mlp": 1.01754165, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 2.4889380861945347, + "language_loss": 0.72487789, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.74598408, + "num_input_tokens_seen": 310316830, + "step": 14386, + "time_per_iteration": 2.600980758666992 + }, + { + "auxiliary_loss_clip": 0.01079229, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.03209782, + "balance_loss_mlp": 1.0216732, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 1.9813472452116188, + "language_loss": 0.82291967, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84404564, + "num_input_tokens_seen": 310334355, + "step": 14387, + "time_per_iteration": 2.5186872482299805 + }, + { + "auxiliary_loss_clip": 0.01094242, + "auxiliary_loss_mlp": 0.01024787, + "balance_loss_clip": 1.03361058, + "balance_loss_mlp": 1.01448727, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 2.6194413343291254, + "language_loss": 0.68431735, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70550764, + "num_input_tokens_seen": 310352900, + "step": 14388, + "time_per_iteration": 2.5490989685058594 + }, + { + "auxiliary_loss_clip": 0.01064203, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.03490341, + "balance_loss_mlp": 1.02104974, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.515631039191705, + "language_loss": 0.90365481, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92460823, + "num_input_tokens_seen": 310372855, + "step": 14389, + "time_per_iteration": 4.273766994476318 + }, + { + "auxiliary_loss_clip": 0.0105608, + "auxiliary_loss_mlp": 0.00749139, + "balance_loss_clip": 1.03615785, + "balance_loss_mlp": 1.00027013, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.6679030163281778, + "language_loss": 0.70755231, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72560447, + "num_input_tokens_seen": 310391595, + "step": 14390, + "time_per_iteration": 2.8025898933410645 + }, + { + "auxiliary_loss_clip": 0.01098535, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.033921, + "balance_loss_mlp": 1.02437496, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.6838943138497104, + "language_loss": 0.8229084, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84424496, + "num_input_tokens_seen": 310410090, + "step": 14391, + "time_per_iteration": 2.5739238262176514 + }, + { + "auxiliary_loss_clip": 0.0099706, + "auxiliary_loss_mlp": 0.01000491, + "balance_loss_clip": 1.00851417, + "balance_loss_mlp": 0.99948996, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.7924070748552198, + "language_loss": 0.6796658, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.69964135, + "num_input_tokens_seen": 310470055, + "step": 14392, + "time_per_iteration": 3.0953257083892822 + }, + { + "auxiliary_loss_clip": 0.01091094, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.03398407, + "balance_loss_mlp": 1.01719308, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 2.530897435333149, + "language_loss": 0.75236833, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.7735703, + "num_input_tokens_seen": 310487665, + "step": 14393, + "time_per_iteration": 2.612090826034546 + }, + { + "auxiliary_loss_clip": 0.01072061, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.02931273, + "balance_loss_mlp": 1.02090859, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 3.178850680352377, + "language_loss": 0.73560011, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.75663799, + "num_input_tokens_seen": 310506130, + "step": 14394, + "time_per_iteration": 2.6128430366516113 + }, + { + "auxiliary_loss_clip": 0.01084526, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.03133106, + "balance_loss_mlp": 1.01731694, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 1.714568930708903, + "language_loss": 0.6514582, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67259407, + "num_input_tokens_seen": 310532445, + "step": 14395, + "time_per_iteration": 2.897508144378662 + }, + { + "auxiliary_loss_clip": 0.0107771, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.03218806, + "balance_loss_mlp": 1.02160501, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 2.6827998443602916, + "language_loss": 0.67814863, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.69925088, + "num_input_tokens_seen": 310552300, + "step": 14396, + "time_per_iteration": 2.6437621116638184 + }, + { + "auxiliary_loss_clip": 0.0108997, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.03478527, + "balance_loss_mlp": 1.02255416, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.559612438673644, + "language_loss": 0.68991125, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71114838, + "num_input_tokens_seen": 310572710, + "step": 14397, + "time_per_iteration": 2.5718038082122803 + }, + { + "auxiliary_loss_clip": 0.01079054, + "auxiliary_loss_mlp": 0.01024322, + "balance_loss_clip": 1.03436279, + "balance_loss_mlp": 1.01398087, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 1.9381529174833365, + "language_loss": 0.6344291, + "learning_rate": 1.86284103591253e-07, + "loss": 0.6554628, + "num_input_tokens_seen": 310592460, + "step": 14398, + "time_per_iteration": 4.214260578155518 + }, + { + "auxiliary_loss_clip": 0.01058159, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.03226459, + "balance_loss_mlp": 1.02211607, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 2.0338468296638252, + "language_loss": 0.76257098, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78349841, + "num_input_tokens_seen": 310609375, + "step": 14399, + "time_per_iteration": 2.6556873321533203 + }, + { + "auxiliary_loss_clip": 0.01080192, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.03356421, + "balance_loss_mlp": 1.01640558, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 2.154573880965244, + "language_loss": 0.93181765, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95288408, + "num_input_tokens_seen": 310627405, + "step": 14400, + "time_per_iteration": 2.5676827430725098 + }, + { + "auxiliary_loss_clip": 0.01027366, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.0297097, + "balance_loss_mlp": 1.02285647, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 1.8692246841508424, + "language_loss": 0.67789626, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69851315, + "num_input_tokens_seen": 310649945, + "step": 14401, + "time_per_iteration": 2.8308515548706055 + }, + { + "auxiliary_loss_clip": 0.01086346, + "auxiliary_loss_mlp": 0.01026847, + "balance_loss_clip": 1.03184271, + "balance_loss_mlp": 1.01564169, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 2.5522983737136418, + "language_loss": 0.73745012, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.758582, + "num_input_tokens_seen": 310668285, + "step": 14402, + "time_per_iteration": 2.5002574920654297 + }, + { + "auxiliary_loss_clip": 0.01022892, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.03194284, + "balance_loss_mlp": 1.02189958, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.873491763781447, + "language_loss": 0.750359, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77091402, + "num_input_tokens_seen": 310687015, + "step": 14403, + "time_per_iteration": 2.763010025024414 + }, + { + "auxiliary_loss_clip": 0.01069331, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.03218734, + "balance_loss_mlp": 1.01899981, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 2.245667052290392, + "language_loss": 0.73377186, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75477868, + "num_input_tokens_seen": 310707580, + "step": 14404, + "time_per_iteration": 2.5831480026245117 + }, + { + "auxiliary_loss_clip": 0.01058613, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.03317189, + "balance_loss_mlp": 1.02001476, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 2.3663765503197305, + "language_loss": 0.70834774, + "learning_rate": 1.851368555901447e-07, + "loss": 0.7292465, + "num_input_tokens_seen": 310727300, + "step": 14405, + "time_per_iteration": 2.6321969032287598 + }, + { + "auxiliary_loss_clip": 0.0108745, + "auxiliary_loss_mlp": 0.00749404, + "balance_loss_clip": 1.03268623, + "balance_loss_mlp": 1.00022304, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.644433646887406, + "language_loss": 0.66065383, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.67902231, + "num_input_tokens_seen": 310744935, + "step": 14406, + "time_per_iteration": 2.586193799972534 + }, + { + "auxiliary_loss_clip": 0.0107507, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.03205371, + "balance_loss_mlp": 1.01421237, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.5404965052118103, + "language_loss": 0.83117414, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85216975, + "num_input_tokens_seen": 310765085, + "step": 14407, + "time_per_iteration": 2.588893175125122 + }, + { + "auxiliary_loss_clip": 0.0108501, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.03374767, + "balance_loss_mlp": 1.0219481, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.7563277711979357, + "language_loss": 0.70027411, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72145057, + "num_input_tokens_seen": 310783260, + "step": 14408, + "time_per_iteration": 2.5911474227905273 + }, + { + "auxiliary_loss_clip": 0.01077111, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.03234518, + "balance_loss_mlp": 1.02107644, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.8774041003574555, + "language_loss": 0.77307951, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79416019, + "num_input_tokens_seen": 310801970, + "step": 14409, + "time_per_iteration": 2.5621590614318848 + }, + { + "auxiliary_loss_clip": 0.01090122, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.03567648, + "balance_loss_mlp": 1.01828969, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.8068608955819767, + "language_loss": 0.76976991, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79097265, + "num_input_tokens_seen": 310822070, + "step": 14410, + "time_per_iteration": 2.582824230194092 + }, + { + "auxiliary_loss_clip": 0.01053078, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.03054988, + "balance_loss_mlp": 1.01830387, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 1.9407173732665184, + "language_loss": 0.77738601, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79821491, + "num_input_tokens_seen": 310838355, + "step": 14411, + "time_per_iteration": 2.6281583309173584 + }, + { + "auxiliary_loss_clip": 0.01069376, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.03142011, + "balance_loss_mlp": 1.02086294, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 1.7334853709177742, + "language_loss": 0.73702335, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.75802517, + "num_input_tokens_seen": 310856055, + "step": 14412, + "time_per_iteration": 4.151912212371826 + }, + { + "auxiliary_loss_clip": 0.01076225, + "auxiliary_loss_mlp": 0.00749344, + "balance_loss_clip": 1.03077006, + "balance_loss_mlp": 1.00027466, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 2.0066912112476043, + "language_loss": 0.69771516, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71597087, + "num_input_tokens_seen": 310876695, + "step": 14413, + "time_per_iteration": 2.62088680267334 + }, + { + "auxiliary_loss_clip": 0.01085775, + "auxiliary_loss_mlp": 0.01028803, + "balance_loss_clip": 1.03342927, + "balance_loss_mlp": 1.01791346, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.243600955917817, + "language_loss": 0.62315339, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.64429915, + "num_input_tokens_seen": 310893880, + "step": 14414, + "time_per_iteration": 2.517151355743408 + }, + { + "auxiliary_loss_clip": 0.0106489, + "auxiliary_loss_mlp": 0.00749214, + "balance_loss_clip": 1.03301811, + "balance_loss_mlp": 1.00021887, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.6589990172818496, + "language_loss": 0.63785446, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.65599543, + "num_input_tokens_seen": 310914145, + "step": 14415, + "time_per_iteration": 2.6377618312835693 + }, + { + "auxiliary_loss_clip": 0.00994491, + "auxiliary_loss_mlp": 0.00999959, + "balance_loss_clip": 1.00456989, + "balance_loss_mlp": 0.99901181, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.7936527949211705, + "language_loss": 0.60395324, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62389773, + "num_input_tokens_seen": 310972825, + "step": 14416, + "time_per_iteration": 3.1743500232696533 + }, + { + "auxiliary_loss_clip": 0.01087931, + "auxiliary_loss_mlp": 0.00749513, + "balance_loss_clip": 1.03235865, + "balance_loss_mlp": 1.00024855, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.7278283412851267, + "language_loss": 0.74814838, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76652282, + "num_input_tokens_seen": 310992050, + "step": 14417, + "time_per_iteration": 2.5015430450439453 + }, + { + "auxiliary_loss_clip": 0.01073347, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.03197503, + "balance_loss_mlp": 1.02220428, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 2.6002834037526212, + "language_loss": 0.74920517, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77027142, + "num_input_tokens_seen": 311011105, + "step": 14418, + "time_per_iteration": 2.5705716609954834 + }, + { + "auxiliary_loss_clip": 0.01083746, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.03175342, + "balance_loss_mlp": 1.02011752, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.5995198387969738, + "language_loss": 0.68355507, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70470047, + "num_input_tokens_seen": 311032080, + "step": 14419, + "time_per_iteration": 2.639909029006958 + }, + { + "auxiliary_loss_clip": 0.01085377, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.03156626, + "balance_loss_mlp": 1.02118325, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 2.019416647895897, + "language_loss": 0.78907824, + "learning_rate": 1.826898250065465e-07, + "loss": 0.81024343, + "num_input_tokens_seen": 311049735, + "step": 14420, + "time_per_iteration": 2.632467746734619 + }, + { + "auxiliary_loss_clip": 0.01078856, + "auxiliary_loss_mlp": 0.01027144, + "balance_loss_clip": 1.03129816, + "balance_loss_mlp": 1.01621222, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.7985847228521634, + "language_loss": 0.83742398, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85848397, + "num_input_tokens_seen": 311067675, + "step": 14421, + "time_per_iteration": 2.5806710720062256 + }, + { + "auxiliary_loss_clip": 0.01005987, + "auxiliary_loss_mlp": 0.01005763, + "balance_loss_clip": 1.00855362, + "balance_loss_mlp": 1.00484467, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7048564092694137, + "language_loss": 0.49073738, + "learning_rate": 1.823647253209941e-07, + "loss": 0.5108549, + "num_input_tokens_seen": 311126605, + "step": 14422, + "time_per_iteration": 3.1986968517303467 + }, + { + "auxiliary_loss_clip": 0.01074671, + "auxiliary_loss_mlp": 0.00749207, + "balance_loss_clip": 1.03272784, + "balance_loss_mlp": 1.00025201, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 1.6878562434961295, + "language_loss": 0.73264742, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75088614, + "num_input_tokens_seen": 311147325, + "step": 14423, + "time_per_iteration": 2.5914411544799805 + }, + { + "auxiliary_loss_clip": 0.01045267, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.02732015, + "balance_loss_mlp": 1.01845956, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.6208450259261196, + "language_loss": 0.76583397, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.78658795, + "num_input_tokens_seen": 311165385, + "step": 14424, + "time_per_iteration": 4.206169128417969 + }, + { + "auxiliary_loss_clip": 0.01051093, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.02794433, + "balance_loss_mlp": 1.02578974, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 1.515103014259011, + "language_loss": 0.71307099, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73394632, + "num_input_tokens_seen": 311185860, + "step": 14425, + "time_per_iteration": 2.647942304611206 + }, + { + "auxiliary_loss_clip": 0.01087048, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.03294516, + "balance_loss_mlp": 1.01920342, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 2.2521319761252134, + "language_loss": 0.68012124, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70129603, + "num_input_tokens_seen": 311205810, + "step": 14426, + "time_per_iteration": 2.57438063621521 + }, + { + "auxiliary_loss_clip": 0.01051795, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.0326072, + "balance_loss_mlp": 1.01521778, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 2.358799791173813, + "language_loss": 0.70662791, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72741115, + "num_input_tokens_seen": 311226080, + "step": 14427, + "time_per_iteration": 2.6264097690582275 + }, + { + "auxiliary_loss_clip": 0.0106725, + "auxiliary_loss_mlp": 0.01024816, + "balance_loss_clip": 1.03312039, + "balance_loss_mlp": 1.01425397, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 1.9460831398570753, + "language_loss": 0.6824953, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70341593, + "num_input_tokens_seen": 311246380, + "step": 14428, + "time_per_iteration": 2.7474565505981445 + }, + { + "auxiliary_loss_clip": 0.01069335, + "auxiliary_loss_mlp": 0.01024576, + "balance_loss_clip": 1.03267872, + "balance_loss_mlp": 1.01380539, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 1.8849043209348917, + "language_loss": 0.70522505, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72616416, + "num_input_tokens_seen": 311266465, + "step": 14429, + "time_per_iteration": 4.1034791469573975 + }, + { + "auxiliary_loss_clip": 0.01075535, + "auxiliary_loss_mlp": 0.0102466, + "balance_loss_clip": 1.03235316, + "balance_loss_mlp": 1.0137521, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 1.9456164362051798, + "language_loss": 0.66427922, + "learning_rate": 1.810670840677151e-07, + "loss": 0.68528116, + "num_input_tokens_seen": 311285075, + "step": 14430, + "time_per_iteration": 2.6091718673706055 + }, + { + "auxiliary_loss_clip": 0.01051502, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.03160918, + "balance_loss_mlp": 1.02280307, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 4.3645418081218486, + "language_loss": 0.69165015, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.7125119, + "num_input_tokens_seen": 311303230, + "step": 14431, + "time_per_iteration": 2.646226406097412 + }, + { + "auxiliary_loss_clip": 0.01081652, + "auxiliary_loss_mlp": 0.0103729, + "balance_loss_clip": 1.03091788, + "balance_loss_mlp": 1.02609003, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.3741157648545372, + "language_loss": 0.63559926, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65678859, + "num_input_tokens_seen": 311318070, + "step": 14432, + "time_per_iteration": 2.5059220790863037 + }, + { + "auxiliary_loss_clip": 0.01087708, + "auxiliary_loss_mlp": 0.0103972, + "balance_loss_clip": 1.03363085, + "balance_loss_mlp": 1.02961099, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 2.7876637514663876, + "language_loss": 0.7868675, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80814177, + "num_input_tokens_seen": 311334885, + "step": 14433, + "time_per_iteration": 2.5147557258605957 + }, + { + "auxiliary_loss_clip": 0.01004027, + "auxiliary_loss_mlp": 0.01011532, + "balance_loss_clip": 1.00398707, + "balance_loss_mlp": 1.01068532, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7059130484972208, + "language_loss": 0.58514726, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60530287, + "num_input_tokens_seen": 311399780, + "step": 14434, + "time_per_iteration": 3.2531495094299316 + }, + { + "auxiliary_loss_clip": 0.01073374, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.03245425, + "balance_loss_mlp": 1.0208596, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.8203003424470476, + "language_loss": 0.79735696, + "learning_rate": 1.802582997433628e-07, + "loss": 0.81839669, + "num_input_tokens_seen": 311419610, + "step": 14435, + "time_per_iteration": 2.829089879989624 + }, + { + "auxiliary_loss_clip": 0.01073853, + "auxiliary_loss_mlp": 0.00749523, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.00025487, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 6.064449172414125, + "language_loss": 0.61963081, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.63786453, + "num_input_tokens_seen": 311440045, + "step": 14436, + "time_per_iteration": 2.748481512069702 + }, + { + "auxiliary_loss_clip": 0.01073945, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.03402019, + "balance_loss_mlp": 1.01586449, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 1.9038239633121137, + "language_loss": 0.70316231, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72418165, + "num_input_tokens_seen": 311456660, + "step": 14437, + "time_per_iteration": 2.5503203868865967 + }, + { + "auxiliary_loss_clip": 0.01061629, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.03168595, + "balance_loss_mlp": 1.01617908, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 2.372854553811027, + "language_loss": 0.80306971, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82395792, + "num_input_tokens_seen": 311475460, + "step": 14438, + "time_per_iteration": 4.1401448249816895 + }, + { + "auxiliary_loss_clip": 0.01079896, + "auxiliary_loss_mlp": 0.01026652, + "balance_loss_clip": 1.0322603, + "balance_loss_mlp": 1.01611352, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.8963317734173266, + "language_loss": 0.67029965, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69136512, + "num_input_tokens_seen": 311494575, + "step": 14439, + "time_per_iteration": 2.5323827266693115 + }, + { + "auxiliary_loss_clip": 0.01083475, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.0320437, + "balance_loss_mlp": 1.02185774, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.8737118714256205, + "language_loss": 0.63976645, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.66092336, + "num_input_tokens_seen": 311515805, + "step": 14440, + "time_per_iteration": 2.6780049800872803 + }, + { + "auxiliary_loss_clip": 0.01084432, + "auxiliary_loss_mlp": 0.01026919, + "balance_loss_clip": 1.03331399, + "balance_loss_mlp": 1.01606464, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.6861594294460112, + "language_loss": 0.65942121, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68053472, + "num_input_tokens_seen": 311536000, + "step": 14441, + "time_per_iteration": 2.6434433460235596 + }, + { + "auxiliary_loss_clip": 0.01079519, + "auxiliary_loss_mlp": 0.01024781, + "balance_loss_clip": 1.0349232, + "balance_loss_mlp": 1.01508331, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.4813556013254845, + "language_loss": 0.66207719, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68312019, + "num_input_tokens_seen": 311556220, + "step": 14442, + "time_per_iteration": 2.6670734882354736 + }, + { + "auxiliary_loss_clip": 0.01070237, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.03230655, + "balance_loss_mlp": 1.01828599, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.8417939761459254, + "language_loss": 0.72652012, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.7475301, + "num_input_tokens_seen": 311572530, + "step": 14443, + "time_per_iteration": 2.5852653980255127 + }, + { + "auxiliary_loss_clip": 0.01098909, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.03467512, + "balance_loss_mlp": 1.01712954, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 4.630112658012786, + "language_loss": 0.83398032, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85525107, + "num_input_tokens_seen": 311591105, + "step": 14444, + "time_per_iteration": 2.5656540393829346 + }, + { + "auxiliary_loss_clip": 0.01058621, + "auxiliary_loss_mlp": 0.01025838, + "balance_loss_clip": 1.03345418, + "balance_loss_mlp": 1.01519227, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 1.7453887420442658, + "language_loss": 0.77172321, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79256785, + "num_input_tokens_seen": 311608350, + "step": 14445, + "time_per_iteration": 2.632904529571533 + }, + { + "auxiliary_loss_clip": 0.0108715, + "auxiliary_loss_mlp": 0.01027618, + "balance_loss_clip": 1.03415275, + "balance_loss_mlp": 1.01710367, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.8644823695714403, + "language_loss": 0.68028289, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.70143056, + "num_input_tokens_seen": 311626380, + "step": 14446, + "time_per_iteration": 2.6067118644714355 + }, + { + "auxiliary_loss_clip": 0.01080136, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.03242826, + "balance_loss_mlp": 1.02266622, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.6877317241690415, + "language_loss": 0.83047426, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85161269, + "num_input_tokens_seen": 311644345, + "step": 14447, + "time_per_iteration": 2.5872573852539062 + }, + { + "auxiliary_loss_clip": 0.01016838, + "auxiliary_loss_mlp": 0.01028274, + "balance_loss_clip": 1.02917349, + "balance_loss_mlp": 1.01750994, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.9469790114355756, + "language_loss": 0.73949265, + "learning_rate": 1.781635359686515e-07, + "loss": 0.75994384, + "num_input_tokens_seen": 311663340, + "step": 14448, + "time_per_iteration": 2.7877256870269775 + }, + { + "auxiliary_loss_clip": 0.01068614, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.03113151, + "balance_loss_mlp": 1.02061486, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.0987971459432875, + "language_loss": 0.80628967, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82730532, + "num_input_tokens_seen": 311679860, + "step": 14449, + "time_per_iteration": 2.56671142578125 + }, + { + "auxiliary_loss_clip": 0.00993743, + "auxiliary_loss_mlp": 0.01001112, + "balance_loss_clip": 1.00414872, + "balance_loss_mlp": 1.00020015, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8525212255515706, + "language_loss": 0.60569608, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62564462, + "num_input_tokens_seen": 311738135, + "step": 14450, + "time_per_iteration": 3.1071712970733643 + }, + { + "auxiliary_loss_clip": 0.01070485, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.03437948, + "balance_loss_mlp": 1.01811481, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.6348689393048026, + "language_loss": 0.76114953, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78214473, + "num_input_tokens_seen": 311756975, + "step": 14451, + "time_per_iteration": 2.636552572250366 + }, + { + "auxiliary_loss_clip": 0.01082533, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.0316118, + "balance_loss_mlp": 1.01584423, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.4990386299633767, + "language_loss": 0.7194972, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74058789, + "num_input_tokens_seen": 311771830, + "step": 14452, + "time_per_iteration": 3.9782042503356934 + }, + { + "auxiliary_loss_clip": 0.01074496, + "auxiliary_loss_mlp": 0.00749384, + "balance_loss_clip": 1.03270054, + "balance_loss_mlp": 1.00020397, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.5543812797531873, + "language_loss": 0.72364175, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74188054, + "num_input_tokens_seen": 311790130, + "step": 14453, + "time_per_iteration": 2.5625765323638916 + }, + { + "auxiliary_loss_clip": 0.01082171, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.03230619, + "balance_loss_mlp": 1.01884747, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 2.904211081002407, + "language_loss": 0.73104191, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75215936, + "num_input_tokens_seen": 311808360, + "step": 14454, + "time_per_iteration": 2.5659918785095215 + }, + { + "auxiliary_loss_clip": 0.01097599, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.03503609, + "balance_loss_mlp": 1.02144933, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 2.051102685894891, + "language_loss": 0.59491777, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.6162113, + "num_input_tokens_seen": 311831325, + "step": 14455, + "time_per_iteration": 2.6270055770874023 + }, + { + "auxiliary_loss_clip": 0.01076371, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.03102612, + "balance_loss_mlp": 1.01618159, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.154629413433309, + "language_loss": 0.80254763, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82358497, + "num_input_tokens_seen": 311848090, + "step": 14456, + "time_per_iteration": 2.547715425491333 + }, + { + "auxiliary_loss_clip": 0.0103511, + "auxiliary_loss_mlp": 0.01039575, + "balance_loss_clip": 1.02955723, + "balance_loss_mlp": 1.02553844, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 2.0553148349220365, + "language_loss": 0.74593687, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.7666837, + "num_input_tokens_seen": 311867855, + "step": 14457, + "time_per_iteration": 2.7645583152770996 + }, + { + "auxiliary_loss_clip": 0.01034362, + "auxiliary_loss_mlp": 0.01025683, + "balance_loss_clip": 1.03075576, + "balance_loss_mlp": 1.01529956, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.4855007708905408, + "language_loss": 0.78398156, + "learning_rate": 1.765601232001328e-07, + "loss": 0.804582, + "num_input_tokens_seen": 311888675, + "step": 14458, + "time_per_iteration": 2.7084696292877197 + }, + { + "auxiliary_loss_clip": 0.01087181, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.03381443, + "balance_loss_mlp": 1.01880431, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.7702782854027592, + "language_loss": 0.71060717, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73178303, + "num_input_tokens_seen": 311907310, + "step": 14459, + "time_per_iteration": 2.5464961528778076 + }, + { + "auxiliary_loss_clip": 0.01071942, + "auxiliary_loss_mlp": 0.01027819, + "balance_loss_clip": 1.03270113, + "balance_loss_mlp": 1.01799655, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.31583807193963, + "language_loss": 0.73843378, + "learning_rate": 1.762402701923398e-07, + "loss": 0.75943136, + "num_input_tokens_seen": 311929635, + "step": 14460, + "time_per_iteration": 2.6702492237091064 + }, + { + "auxiliary_loss_clip": 0.01078052, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.03252959, + "balance_loss_mlp": 1.02019048, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 1.7555046384382806, + "language_loss": 0.65046442, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.67155731, + "num_input_tokens_seen": 311948800, + "step": 14461, + "time_per_iteration": 2.615480899810791 + }, + { + "auxiliary_loss_clip": 0.01079129, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.02998769, + "balance_loss_mlp": 1.01973939, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.664864231374749, + "language_loss": 0.82627851, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84737927, + "num_input_tokens_seen": 311964090, + "step": 14462, + "time_per_iteration": 2.5540549755096436 + }, + { + "auxiliary_loss_clip": 0.01083465, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.03124738, + "balance_loss_mlp": 1.02312493, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 12.835981107692353, + "language_loss": 0.65179485, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67297328, + "num_input_tokens_seen": 311981460, + "step": 14463, + "time_per_iteration": 2.8314313888549805 + }, + { + "auxiliary_loss_clip": 0.0108164, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.03737545, + "balance_loss_mlp": 1.0222038, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 2.1825457640006287, + "language_loss": 0.66905016, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.69020015, + "num_input_tokens_seen": 312000115, + "step": 14464, + "time_per_iteration": 4.119284391403198 + }, + { + "auxiliary_loss_clip": 0.01077177, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.03209805, + "balance_loss_mlp": 1.02074385, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.103937295655954, + "language_loss": 0.62511206, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64620137, + "num_input_tokens_seen": 312020770, + "step": 14465, + "time_per_iteration": 2.626885414123535 + }, + { + "auxiliary_loss_clip": 0.01079299, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.03058505, + "balance_loss_mlp": 1.02603316, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.5474461426500683, + "language_loss": 0.84453356, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86568308, + "num_input_tokens_seen": 312041870, + "step": 14466, + "time_per_iteration": 2.5741519927978516 + }, + { + "auxiliary_loss_clip": 0.01066705, + "auxiliary_loss_mlp": 0.01037297, + "balance_loss_clip": 1.03407836, + "balance_loss_mlp": 1.0248394, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.733551675688841, + "language_loss": 0.62283939, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.64387941, + "num_input_tokens_seen": 312058210, + "step": 14467, + "time_per_iteration": 2.620415687561035 + }, + { + "auxiliary_loss_clip": 0.01091484, + "auxiliary_loss_mlp": 0.01024041, + "balance_loss_clip": 1.03172827, + "balance_loss_mlp": 1.01394963, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.410815252857023, + "language_loss": 0.68795425, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.70910949, + "num_input_tokens_seen": 312082665, + "step": 14468, + "time_per_iteration": 2.6015496253967285 + }, + { + "auxiliary_loss_clip": 0.01071091, + "auxiliary_loss_mlp": 0.01027798, + "balance_loss_clip": 1.03058195, + "balance_loss_mlp": 1.01749849, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.5050179037668323, + "language_loss": 0.7110256, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73201442, + "num_input_tokens_seen": 312101960, + "step": 14469, + "time_per_iteration": 4.122073173522949 + }, + { + "auxiliary_loss_clip": 0.01080842, + "auxiliary_loss_mlp": 0.01025604, + "balance_loss_clip": 1.03288853, + "balance_loss_mlp": 1.01587081, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 2.0625658307761245, + "language_loss": 0.83916938, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86023384, + "num_input_tokens_seen": 312117125, + "step": 14470, + "time_per_iteration": 2.589810609817505 + }, + { + "auxiliary_loss_clip": 0.01068913, + "auxiliary_loss_mlp": 0.01034997, + "balance_loss_clip": 1.03220594, + "balance_loss_mlp": 1.02332687, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.7704651896936499, + "language_loss": 0.73246086, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75349998, + "num_input_tokens_seen": 312135775, + "step": 14471, + "time_per_iteration": 2.6213788986206055 + }, + { + "auxiliary_loss_clip": 0.01095601, + "auxiliary_loss_mlp": 0.01024738, + "balance_loss_clip": 1.03346241, + "balance_loss_mlp": 1.01466513, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 4.826455496762126, + "language_loss": 0.78858149, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.80978483, + "num_input_tokens_seen": 312156070, + "step": 14472, + "time_per_iteration": 2.5640716552734375 + }, + { + "auxiliary_loss_clip": 0.01080025, + "auxiliary_loss_mlp": 0.00749444, + "balance_loss_clip": 1.03276849, + "balance_loss_mlp": 1.00027823, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 2.1538694537377125, + "language_loss": 0.73021317, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74850786, + "num_input_tokens_seen": 312174380, + "step": 14473, + "time_per_iteration": 2.56282377243042 + }, + { + "auxiliary_loss_clip": 0.01099586, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.03483748, + "balance_loss_mlp": 1.01694298, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.6471132673162048, + "language_loss": 0.72455341, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74582851, + "num_input_tokens_seen": 312195130, + "step": 14474, + "time_per_iteration": 2.6106035709381104 + }, + { + "auxiliary_loss_clip": 0.01073053, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.03138793, + "balance_loss_mlp": 1.02216041, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 2.2169593179760083, + "language_loss": 0.67380702, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69487381, + "num_input_tokens_seen": 312212300, + "step": 14475, + "time_per_iteration": 2.568148374557495 + }, + { + "auxiliary_loss_clip": 0.01095951, + "auxiliary_loss_mlp": 0.01024546, + "balance_loss_clip": 1.03180492, + "balance_loss_mlp": 1.01353741, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.504029309956031, + "language_loss": 0.77859968, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79980463, + "num_input_tokens_seen": 312231735, + "step": 14476, + "time_per_iteration": 2.569973945617676 + }, + { + "auxiliary_loss_clip": 0.01077066, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.03101301, + "balance_loss_mlp": 1.02115774, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 2.2616903044763697, + "language_loss": 0.72258389, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74367952, + "num_input_tokens_seen": 312253060, + "step": 14477, + "time_per_iteration": 2.6414878368377686 + }, + { + "auxiliary_loss_clip": 0.01088647, + "auxiliary_loss_mlp": 0.01024578, + "balance_loss_clip": 1.03458881, + "balance_loss_mlp": 1.01387286, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 2.4998218188446857, + "language_loss": 0.5953176, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61644983, + "num_input_tokens_seen": 312269460, + "step": 14478, + "time_per_iteration": 4.057409286499023 + }, + { + "auxiliary_loss_clip": 0.01087309, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.03667367, + "balance_loss_mlp": 1.01915753, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.5537249944390668, + "language_loss": 0.71418244, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73534083, + "num_input_tokens_seen": 312289830, + "step": 14479, + "time_per_iteration": 2.600062370300293 + }, + { + "auxiliary_loss_clip": 0.01071278, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.03108644, + "balance_loss_mlp": 1.0189842, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.7129476783676048, + "language_loss": 0.70703518, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.72804552, + "num_input_tokens_seen": 312311320, + "step": 14480, + "time_per_iteration": 2.8319168090820312 + }, + { + "auxiliary_loss_clip": 0.01042528, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.02792537, + "balance_loss_mlp": 1.01835334, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.5716925725093298, + "language_loss": 0.70510948, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72582626, + "num_input_tokens_seen": 312332095, + "step": 14481, + "time_per_iteration": 2.8098831176757812 + }, + { + "auxiliary_loss_clip": 0.01083551, + "auxiliary_loss_mlp": 0.01026301, + "balance_loss_clip": 1.03200412, + "balance_loss_mlp": 1.01575685, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.7059979883709422, + "language_loss": 0.77276146, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79385996, + "num_input_tokens_seen": 312351225, + "step": 14482, + "time_per_iteration": 2.7179338932037354 + }, + { + "auxiliary_loss_clip": 0.01073142, + "auxiliary_loss_mlp": 0.0103084, + "balance_loss_clip": 1.0321815, + "balance_loss_mlp": 1.0202899, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.6986701756828915, + "language_loss": 0.76684779, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78788757, + "num_input_tokens_seen": 312369730, + "step": 14483, + "time_per_iteration": 2.6308324337005615 + }, + { + "auxiliary_loss_clip": 0.01088561, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.03288031, + "balance_loss_mlp": 1.02267635, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 2.102063395443185, + "language_loss": 0.62116593, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64240348, + "num_input_tokens_seen": 312386780, + "step": 14484, + "time_per_iteration": 2.5964925289154053 + }, + { + "auxiliary_loss_clip": 0.01097601, + "auxiliary_loss_mlp": 0.01027751, + "balance_loss_clip": 1.03550768, + "balance_loss_mlp": 1.01721299, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 1.8840805902306446, + "language_loss": 0.68154734, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70280087, + "num_input_tokens_seen": 312404875, + "step": 14485, + "time_per_iteration": 2.5659258365631104 + }, + { + "auxiliary_loss_clip": 0.01036225, + "auxiliary_loss_mlp": 0.00749385, + "balance_loss_clip": 1.02877975, + "balance_loss_mlp": 1.00026965, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.691706343248516, + "language_loss": 0.62702811, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.64488423, + "num_input_tokens_seen": 312425280, + "step": 14486, + "time_per_iteration": 2.777740240097046 + }, + { + "auxiliary_loss_clip": 0.01099755, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.03299892, + "balance_loss_mlp": 1.01924384, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 1.9637110998834593, + "language_loss": 0.61173493, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.63304627, + "num_input_tokens_seen": 312443835, + "step": 14487, + "time_per_iteration": 2.5091381072998047 + }, + { + "auxiliary_loss_clip": 0.01069027, + "auxiliary_loss_mlp": 0.0074906, + "balance_loss_clip": 1.03239775, + "balance_loss_mlp": 1.00025833, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 1.9368923861167855, + "language_loss": 0.67408246, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69226336, + "num_input_tokens_seen": 312460830, + "step": 14488, + "time_per_iteration": 2.630995750427246 + }, + { + "auxiliary_loss_clip": 0.01079307, + "auxiliary_loss_mlp": 0.00749164, + "balance_loss_clip": 1.034711, + "balance_loss_mlp": 1.00020623, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 1.9517109784939581, + "language_loss": 0.85825467, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87653935, + "num_input_tokens_seen": 312477575, + "step": 14489, + "time_per_iteration": 2.6827452182769775 + }, + { + "auxiliary_loss_clip": 0.01091417, + "auxiliary_loss_mlp": 0.01027706, + "balance_loss_clip": 1.03424346, + "balance_loss_mlp": 1.01611316, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 3.35181436238809, + "language_loss": 0.75485563, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.77604687, + "num_input_tokens_seen": 312492140, + "step": 14490, + "time_per_iteration": 2.646608352661133 + }, + { + "auxiliary_loss_clip": 0.01087451, + "auxiliary_loss_mlp": 0.01029, + "balance_loss_clip": 1.03286529, + "balance_loss_mlp": 1.01755571, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 1.991126466035014, + "language_loss": 0.76361668, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78478122, + "num_input_tokens_seen": 312508400, + "step": 14491, + "time_per_iteration": 2.557461738586426 + }, + { + "auxiliary_loss_clip": 0.01067959, + "auxiliary_loss_mlp": 0.01023499, + "balance_loss_clip": 1.03723288, + "balance_loss_mlp": 1.01344383, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.5502088155970877, + "language_loss": 0.67070103, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69161558, + "num_input_tokens_seen": 312525915, + "step": 14492, + "time_per_iteration": 4.236137390136719 + }, + { + "auxiliary_loss_clip": 0.01083647, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.03296518, + "balance_loss_mlp": 1.01776505, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 2.1581362849673926, + "language_loss": 0.69978184, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.72089529, + "num_input_tokens_seen": 312544735, + "step": 14493, + "time_per_iteration": 2.6607227325439453 + }, + { + "auxiliary_loss_clip": 0.01099978, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.03604245, + "balance_loss_mlp": 1.02150059, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.8297966954331613, + "language_loss": 0.89059532, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91192496, + "num_input_tokens_seen": 312557910, + "step": 14494, + "time_per_iteration": 2.536540985107422 + }, + { + "auxiliary_loss_clip": 0.01065982, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.03693604, + "balance_loss_mlp": 1.01839781, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 1.8189286664970938, + "language_loss": 0.59629738, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61724341, + "num_input_tokens_seen": 312580360, + "step": 14495, + "time_per_iteration": 2.804194927215576 + }, + { + "auxiliary_loss_clip": 0.01064829, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.03145897, + "balance_loss_mlp": 1.02104652, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.8873623318978756, + "language_loss": 0.79969162, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.8206597, + "num_input_tokens_seen": 312597550, + "step": 14496, + "time_per_iteration": 2.6939141750335693 + }, + { + "auxiliary_loss_clip": 0.01068963, + "auxiliary_loss_mlp": 0.01032348, + "balance_loss_clip": 1.03360641, + "balance_loss_mlp": 1.01997447, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 1.9135149101590503, + "language_loss": 0.7884748, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.80948794, + "num_input_tokens_seen": 312616435, + "step": 14497, + "time_per_iteration": 2.605496406555176 + }, + { + "auxiliary_loss_clip": 0.01097557, + "auxiliary_loss_mlp": 0.01028055, + "balance_loss_clip": 1.0332284, + "balance_loss_mlp": 1.01675439, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 2.0207549481915104, + "language_loss": 0.6682061, + "learning_rate": 1.70215677535406e-07, + "loss": 0.68946218, + "num_input_tokens_seen": 312632770, + "step": 14498, + "time_per_iteration": 2.5700650215148926 + }, + { + "auxiliary_loss_clip": 0.01058813, + "auxiliary_loss_mlp": 0.01025633, + "balance_loss_clip": 1.02984631, + "balance_loss_mlp": 1.01523757, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.5326227170630442, + "language_loss": 0.56953734, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59038174, + "num_input_tokens_seen": 312651900, + "step": 14499, + "time_per_iteration": 2.7438430786132812 + }, + { + "auxiliary_loss_clip": 0.01053866, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.03117704, + "balance_loss_mlp": 1.01992035, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 1.995846507756152, + "language_loss": 0.79694068, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.81779242, + "num_input_tokens_seen": 312671380, + "step": 14500, + "time_per_iteration": 2.701422691345215 + }, + { + "auxiliary_loss_clip": 0.01081403, + "auxiliary_loss_mlp": 0.01024061, + "balance_loss_clip": 1.03464055, + "balance_loss_mlp": 1.01331997, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 2.0899217067634885, + "language_loss": 0.72795606, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.74901068, + "num_input_tokens_seen": 312689215, + "step": 14501, + "time_per_iteration": 2.5778610706329346 + }, + { + "auxiliary_loss_clip": 0.01056356, + "auxiliary_loss_mlp": 0.0102996, + "balance_loss_clip": 1.02961242, + "balance_loss_mlp": 1.01802766, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.7744670338738224, + "language_loss": 0.64488125, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66574442, + "num_input_tokens_seen": 312706400, + "step": 14502, + "time_per_iteration": 2.694265365600586 + }, + { + "auxiliary_loss_clip": 0.01070075, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.03126001, + "balance_loss_mlp": 1.0237633, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.607340819296471, + "language_loss": 0.6874125, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.70847774, + "num_input_tokens_seen": 312727985, + "step": 14503, + "time_per_iteration": 2.7709546089172363 + }, + { + "auxiliary_loss_clip": 0.01074322, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.03400385, + "balance_loss_mlp": 1.01588178, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.485902023757385, + "language_loss": 0.68552583, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.70653641, + "num_input_tokens_seen": 312745025, + "step": 14504, + "time_per_iteration": 2.6924619674682617 + }, + { + "auxiliary_loss_clip": 0.01084392, + "auxiliary_loss_mlp": 0.00749413, + "balance_loss_clip": 1.03325737, + "balance_loss_mlp": 1.00021791, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.7088711768246232, + "language_loss": 0.70331478, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72165287, + "num_input_tokens_seen": 312764170, + "step": 14505, + "time_per_iteration": 4.1547746658325195 + }, + { + "auxiliary_loss_clip": 0.01074691, + "auxiliary_loss_mlp": 0.01025939, + "balance_loss_clip": 1.03303587, + "balance_loss_mlp": 1.01599061, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.5097761029869317, + "language_loss": 0.78301662, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80402291, + "num_input_tokens_seen": 312783830, + "step": 14506, + "time_per_iteration": 2.667416572570801 + }, + { + "auxiliary_loss_clip": 0.01075495, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.03086114, + "balance_loss_mlp": 1.01579237, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.70981781312473, + "language_loss": 0.73955977, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76058203, + "num_input_tokens_seen": 312802015, + "step": 14507, + "time_per_iteration": 2.592667579650879 + }, + { + "auxiliary_loss_clip": 0.01034875, + "auxiliary_loss_mlp": 0.01037662, + "balance_loss_clip": 1.02888036, + "balance_loss_mlp": 1.02360773, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 2.288886872254342, + "language_loss": 0.72561425, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74633968, + "num_input_tokens_seen": 312820650, + "step": 14508, + "time_per_iteration": 4.357965469360352 + }, + { + "auxiliary_loss_clip": 0.01078937, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.03408551, + "balance_loss_mlp": 1.02106237, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.6579099374656847, + "language_loss": 0.68819952, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.70931345, + "num_input_tokens_seen": 312841310, + "step": 14509, + "time_per_iteration": 2.856091022491455 + }, + { + "auxiliary_loss_clip": 0.0107283, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.03398299, + "balance_loss_mlp": 1.01728892, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 1.8961009281261814, + "language_loss": 0.58478034, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60578525, + "num_input_tokens_seen": 312862100, + "step": 14510, + "time_per_iteration": 2.880053997039795 + }, + { + "auxiliary_loss_clip": 0.0110314, + "auxiliary_loss_mlp": 0.0103092, + "balance_loss_clip": 1.03481901, + "balance_loss_mlp": 1.01925588, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 4.289364431704872, + "language_loss": 0.67231047, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69365096, + "num_input_tokens_seen": 312880220, + "step": 14511, + "time_per_iteration": 2.8082473278045654 + }, + { + "auxiliary_loss_clip": 0.01049156, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.03342569, + "balance_loss_mlp": 1.01988602, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 2.0603289866732237, + "language_loss": 0.81532174, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.83612823, + "num_input_tokens_seen": 312900765, + "step": 14512, + "time_per_iteration": 3.0420312881469727 + }, + { + "auxiliary_loss_clip": 0.01006639, + "auxiliary_loss_mlp": 0.0100343, + "balance_loss_clip": 1.00620437, + "balance_loss_mlp": 1.00242841, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7911494152230859, + "language_loss": 0.58592379, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.6060245, + "num_input_tokens_seen": 312955840, + "step": 14513, + "time_per_iteration": 3.2025697231292725 + }, + { + "auxiliary_loss_clip": 0.01085662, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.03317785, + "balance_loss_mlp": 1.01658392, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.6942138450244253, + "language_loss": 0.767021, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78815305, + "num_input_tokens_seen": 312973565, + "step": 14514, + "time_per_iteration": 2.788374423980713 + }, + { + "auxiliary_loss_clip": 0.01091746, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.03537488, + "balance_loss_mlp": 1.01549864, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 1.9581683996760408, + "language_loss": 0.65343869, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67461425, + "num_input_tokens_seen": 312994660, + "step": 14515, + "time_per_iteration": 2.833242893218994 + }, + { + "auxiliary_loss_clip": 0.0108344, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.03213787, + "balance_loss_mlp": 1.01959538, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 1.9093259177679405, + "language_loss": 0.78841341, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.80956364, + "num_input_tokens_seen": 313009860, + "step": 14516, + "time_per_iteration": 2.8123292922973633 + }, + { + "auxiliary_loss_clip": 0.01100278, + "auxiliary_loss_mlp": 0.01025814, + "balance_loss_clip": 1.03450155, + "balance_loss_mlp": 1.01448917, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 2.0429792072231874, + "language_loss": 0.72098422, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74224508, + "num_input_tokens_seen": 313027025, + "step": 14517, + "time_per_iteration": 2.7479336261749268 + }, + { + "auxiliary_loss_clip": 0.01058256, + "auxiliary_loss_mlp": 0.01022909, + "balance_loss_clip": 1.03065848, + "balance_loss_mlp": 1.01333642, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 2.0015340685425667, + "language_loss": 0.72456461, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74537623, + "num_input_tokens_seen": 313046830, + "step": 14518, + "time_per_iteration": 4.446070194244385 + }, + { + "auxiliary_loss_clip": 0.01066425, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.02970839, + "balance_loss_mlp": 1.02287674, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.5018570205713846, + "language_loss": 0.7453289, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76633614, + "num_input_tokens_seen": 313067715, + "step": 14519, + "time_per_iteration": 2.8106226921081543 + }, + { + "auxiliary_loss_clip": 0.01087719, + "auxiliary_loss_mlp": 0.0102661, + "balance_loss_clip": 1.03385758, + "balance_loss_mlp": 1.01500523, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.4019836026326895, + "language_loss": 0.76351559, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78465891, + "num_input_tokens_seen": 313082305, + "step": 14520, + "time_per_iteration": 2.7111122608184814 + }, + { + "auxiliary_loss_clip": 0.01075648, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.03299165, + "balance_loss_mlp": 1.01832414, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.6205938124745147, + "language_loss": 0.8191784, + "learning_rate": 1.666178664801816e-07, + "loss": 0.84023207, + "num_input_tokens_seen": 313101190, + "step": 14521, + "time_per_iteration": 2.822519540786743 + }, + { + "auxiliary_loss_clip": 0.01085954, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03452218, + "balance_loss_mlp": 1.01978588, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 2.0555826986363814, + "language_loss": 0.76359367, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78476679, + "num_input_tokens_seen": 313118965, + "step": 14522, + "time_per_iteration": 2.709176540374756 + }, + { + "auxiliary_loss_clip": 0.01082071, + "auxiliary_loss_mlp": 0.00749135, + "balance_loss_clip": 1.0311532, + "balance_loss_mlp": 1.00022101, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 2.0304226446801072, + "language_loss": 0.75992823, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77824032, + "num_input_tokens_seen": 313139280, + "step": 14523, + "time_per_iteration": 2.841521739959717 + }, + { + "auxiliary_loss_clip": 0.0108408, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.01631165, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 8.580039232384065, + "language_loss": 0.78716677, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80827379, + "num_input_tokens_seen": 313156655, + "step": 14524, + "time_per_iteration": 2.7911391258239746 + }, + { + "auxiliary_loss_clip": 0.01077125, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.01741695, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 2.0789108705902417, + "language_loss": 0.77490789, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.79594922, + "num_input_tokens_seen": 313174050, + "step": 14525, + "time_per_iteration": 2.855581283569336 + }, + { + "auxiliary_loss_clip": 0.0100812, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.0305059, + "balance_loss_mlp": 1.02133739, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.6836367927028202, + "language_loss": 0.68973207, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71013522, + "num_input_tokens_seen": 313192765, + "step": 14526, + "time_per_iteration": 3.312709093093872 + }, + { + "auxiliary_loss_clip": 0.01049844, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.03370273, + "balance_loss_mlp": 1.02448487, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 1.969739759902618, + "language_loss": 0.6091603, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63002515, + "num_input_tokens_seen": 313210925, + "step": 14527, + "time_per_iteration": 3.301025152206421 + }, + { + "auxiliary_loss_clip": 0.01088835, + "auxiliary_loss_mlp": 0.01028756, + "balance_loss_clip": 1.03672588, + "balance_loss_mlp": 1.01619768, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.9616743046370284, + "language_loss": 0.65541863, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.6765945, + "num_input_tokens_seen": 313228250, + "step": 14528, + "time_per_iteration": 2.88474178314209 + }, + { + "auxiliary_loss_clip": 0.01066872, + "auxiliary_loss_mlp": 0.01025178, + "balance_loss_clip": 1.03475678, + "balance_loss_mlp": 1.01449108, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 1.9973594131974939, + "language_loss": 0.89644635, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.9173668, + "num_input_tokens_seen": 313247880, + "step": 14529, + "time_per_iteration": 2.9231514930725098 + }, + { + "auxiliary_loss_clip": 0.01072804, + "auxiliary_loss_mlp": 0.01026076, + "balance_loss_clip": 1.03170347, + "balance_loss_mlp": 1.01503742, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.7359298750845156, + "language_loss": 0.84747338, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.86846209, + "num_input_tokens_seen": 313266790, + "step": 14530, + "time_per_iteration": 2.8220319747924805 + }, + { + "auxiliary_loss_clip": 0.0107172, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.03091419, + "balance_loss_mlp": 1.02110934, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.6981413955525924, + "language_loss": 0.74662262, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76765132, + "num_input_tokens_seen": 313286805, + "step": 14531, + "time_per_iteration": 2.856654644012451 + }, + { + "auxiliary_loss_clip": 0.01082284, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.03070748, + "balance_loss_mlp": 1.01832342, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 1.8866128505957902, + "language_loss": 0.62046885, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.64157671, + "num_input_tokens_seen": 313305415, + "step": 14532, + "time_per_iteration": 2.76768159866333 + }, + { + "auxiliary_loss_clip": 0.01013353, + "auxiliary_loss_mlp": 0.01000343, + "balance_loss_clip": 1.00355053, + "balance_loss_mlp": 0.99935311, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.8243048975718604, + "language_loss": 0.58709407, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60723102, + "num_input_tokens_seen": 313369940, + "step": 14533, + "time_per_iteration": 4.886747598648071 + }, + { + "auxiliary_loss_clip": 0.01066167, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.03140068, + "balance_loss_mlp": 1.01753128, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.4896465990632826, + "language_loss": 0.76888859, + "learning_rate": 1.646005846335954e-07, + "loss": 0.78983271, + "num_input_tokens_seen": 313390965, + "step": 14534, + "time_per_iteration": 2.9377100467681885 + }, + { + "auxiliary_loss_clip": 0.01065347, + "auxiliary_loss_mlp": 0.01027752, + "balance_loss_clip": 1.0293107, + "balance_loss_mlp": 1.01636183, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 1.8645078409226108, + "language_loss": 0.74879628, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.76972723, + "num_input_tokens_seen": 313409680, + "step": 14535, + "time_per_iteration": 2.804520845413208 + }, + { + "auxiliary_loss_clip": 0.01094878, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.03165483, + "balance_loss_mlp": 1.01766515, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 2.338330420017704, + "language_loss": 0.74675143, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76798767, + "num_input_tokens_seen": 313431335, + "step": 14536, + "time_per_iteration": 2.76996111869812 + }, + { + "auxiliary_loss_clip": 0.01074522, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.0307349, + "balance_loss_mlp": 1.0164386, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 2.1291522630723883, + "language_loss": 0.63906616, + "learning_rate": 1.641367279482304e-07, + "loss": 0.66007626, + "num_input_tokens_seen": 313449225, + "step": 14537, + "time_per_iteration": 2.7781286239624023 + }, + { + "auxiliary_loss_clip": 0.01081429, + "auxiliary_loss_mlp": 0.01024952, + "balance_loss_clip": 1.03062546, + "balance_loss_mlp": 1.01360285, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 2.0422733136813633, + "language_loss": 0.58761716, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60868096, + "num_input_tokens_seen": 313467715, + "step": 14538, + "time_per_iteration": 2.7761940956115723 + }, + { + "auxiliary_loss_clip": 0.01085109, + "auxiliary_loss_mlp": 0.01023516, + "balance_loss_clip": 1.0342437, + "balance_loss_mlp": 1.01252484, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 2.7249344065786394, + "language_loss": 0.68490833, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70599455, + "num_input_tokens_seen": 313486805, + "step": 14539, + "time_per_iteration": 2.8927693367004395 + }, + { + "auxiliary_loss_clip": 0.01088738, + "auxiliary_loss_mlp": 0.01027597, + "balance_loss_clip": 1.03222084, + "balance_loss_mlp": 1.01606989, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.0216426235605622, + "language_loss": 0.74179006, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76295346, + "num_input_tokens_seen": 313504880, + "step": 14540, + "time_per_iteration": 2.7171175479888916 + }, + { + "auxiliary_loss_clip": 0.01067688, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.03014255, + "balance_loss_mlp": 1.02349281, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 1.8512614323593537, + "language_loss": 0.79215479, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81318855, + "num_input_tokens_seen": 313524995, + "step": 14541, + "time_per_iteration": 2.8140664100646973 + }, + { + "auxiliary_loss_clip": 0.01054144, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.03082871, + "balance_loss_mlp": 1.01800871, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.176957257364291, + "language_loss": 0.66993171, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.69078326, + "num_input_tokens_seen": 313541740, + "step": 14542, + "time_per_iteration": 3.1830103397369385 + }, + { + "auxiliary_loss_clip": 0.01022263, + "auxiliary_loss_mlp": 0.01003134, + "balance_loss_clip": 1.00256753, + "balance_loss_mlp": 1.00220442, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.787759998538159, + "language_loss": 0.54441261, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56466657, + "num_input_tokens_seen": 313593445, + "step": 14543, + "time_per_iteration": 3.059898853302002 + }, + { + "auxiliary_loss_clip": 0.01071608, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.03374827, + "balance_loss_mlp": 1.02047217, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 2.1515710816281257, + "language_loss": 0.69761443, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71864927, + "num_input_tokens_seen": 313615640, + "step": 14544, + "time_per_iteration": 2.8071086406707764 + }, + { + "auxiliary_loss_clip": 0.01054028, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.03445256, + "balance_loss_mlp": 1.01646256, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.823598470719301, + "language_loss": 0.7586875, + "learning_rate": 1.62902840325714e-07, + "loss": 0.7794919, + "num_input_tokens_seen": 313635550, + "step": 14545, + "time_per_iteration": 4.48663854598999 + }, + { + "auxiliary_loss_clip": 0.01083156, + "auxiliary_loss_mlp": 0.00749556, + "balance_loss_clip": 1.03147721, + "balance_loss_mlp": 1.00021052, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.6626624921854243, + "language_loss": 0.66211969, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.6804468, + "num_input_tokens_seen": 313659275, + "step": 14546, + "time_per_iteration": 2.9588677883148193 + }, + { + "auxiliary_loss_clip": 0.0109579, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.03300524, + "balance_loss_mlp": 1.0180614, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.5912880270261138, + "language_loss": 0.72828913, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.7495352, + "num_input_tokens_seen": 313680595, + "step": 14547, + "time_per_iteration": 2.8157501220703125 + }, + { + "auxiliary_loss_clip": 0.01103462, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.03446746, + "balance_loss_mlp": 1.02227759, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.577952757954595, + "language_loss": 0.69453061, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71590769, + "num_input_tokens_seen": 313699730, + "step": 14548, + "time_per_iteration": 2.9032788276672363 + }, + { + "auxiliary_loss_clip": 0.01078884, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.03335166, + "balance_loss_mlp": 1.01872766, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 2.192645682740787, + "language_loss": 0.70214033, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.7232306, + "num_input_tokens_seen": 313720090, + "step": 14549, + "time_per_iteration": 4.455063343048096 + }, + { + "auxiliary_loss_clip": 0.01085241, + "auxiliary_loss_mlp": 0.00749582, + "balance_loss_clip": 1.03119504, + "balance_loss_mlp": 1.00024414, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.5064156088458573, + "language_loss": 0.84088981, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85923803, + "num_input_tokens_seen": 313736795, + "step": 14550, + "time_per_iteration": 2.8387510776519775 + }, + { + "auxiliary_loss_clip": 0.01089491, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.03465557, + "balance_loss_mlp": 1.02437305, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 1.8590322709198506, + "language_loss": 0.71641856, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.73766261, + "num_input_tokens_seen": 313754820, + "step": 14551, + "time_per_iteration": 2.7232444286346436 + }, + { + "auxiliary_loss_clip": 0.01087017, + "auxiliary_loss_mlp": 0.00749278, + "balance_loss_clip": 1.03426123, + "balance_loss_mlp": 1.00023866, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 1.906753456052296, + "language_loss": 0.64169002, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.6600529, + "num_input_tokens_seen": 313775830, + "step": 14552, + "time_per_iteration": 2.8232126235961914 + }, + { + "auxiliary_loss_clip": 0.01064789, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.03129745, + "balance_loss_mlp": 1.01638126, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 1.6648823459361415, + "language_loss": 0.79653037, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81746459, + "num_input_tokens_seen": 313795745, + "step": 14553, + "time_per_iteration": 2.9018514156341553 + }, + { + "auxiliary_loss_clip": 0.01078457, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.02993536, + "balance_loss_mlp": 1.02266598, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 1.778893454824468, + "language_loss": 0.70106113, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72217882, + "num_input_tokens_seen": 313813895, + "step": 14554, + "time_per_iteration": 2.778398275375366 + }, + { + "auxiliary_loss_clip": 0.01077874, + "auxiliary_loss_mlp": 0.00749323, + "balance_loss_clip": 1.0347929, + "balance_loss_mlp": 1.00023007, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.3888825462838064, + "language_loss": 0.83875448, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85702646, + "num_input_tokens_seen": 313834225, + "step": 14555, + "time_per_iteration": 2.8308844566345215 + }, + { + "auxiliary_loss_clip": 0.01084732, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.03219163, + "balance_loss_mlp": 1.01674151, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.717346593660196, + "language_loss": 0.71093696, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73206604, + "num_input_tokens_seen": 313854430, + "step": 14556, + "time_per_iteration": 2.78444766998291 + }, + { + "auxiliary_loss_clip": 0.01079002, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.03143251, + "balance_loss_mlp": 1.01674724, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 2.4176534734816792, + "language_loss": 0.7679221, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78900373, + "num_input_tokens_seen": 313871600, + "step": 14557, + "time_per_iteration": 2.723588466644287 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.03257501, + "balance_loss_mlp": 1.02536821, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 1.6716563653912908, + "language_loss": 0.82820183, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.84915674, + "num_input_tokens_seen": 313891570, + "step": 14558, + "time_per_iteration": 4.471888303756714 + }, + { + "auxiliary_loss_clip": 0.01022752, + "auxiliary_loss_mlp": 0.01002595, + "balance_loss_clip": 1.0028758, + "balance_loss_mlp": 1.00162935, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8061541872085768, + "language_loss": 0.56091225, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58116567, + "num_input_tokens_seen": 313951290, + "step": 14559, + "time_per_iteration": 3.217630386352539 + }, + { + "auxiliary_loss_clip": 0.0108597, + "auxiliary_loss_mlp": 0.01031762, + "balance_loss_clip": 1.03334713, + "balance_loss_mlp": 1.02099133, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 1.9375097314308034, + "language_loss": 0.66118377, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68236113, + "num_input_tokens_seen": 313968645, + "step": 14560, + "time_per_iteration": 2.817202568054199 + }, + { + "auxiliary_loss_clip": 0.0109369, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.03191781, + "balance_loss_mlp": 1.01530313, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.881042345501695, + "language_loss": 0.78964603, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.81083912, + "num_input_tokens_seen": 313987580, + "step": 14561, + "time_per_iteration": 2.78794002532959 + }, + { + "auxiliary_loss_clip": 0.01099072, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.03292966, + "balance_loss_mlp": 1.01805449, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 1.9675562266018418, + "language_loss": 0.77291226, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.79420322, + "num_input_tokens_seen": 314004460, + "step": 14562, + "time_per_iteration": 2.706805944442749 + }, + { + "auxiliary_loss_clip": 0.01092053, + "auxiliary_loss_mlp": 0.01024827, + "balance_loss_clip": 1.03223872, + "balance_loss_mlp": 1.01492715, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.787164013548358, + "language_loss": 0.71579361, + "learning_rate": 1.601428988367981e-07, + "loss": 0.73696244, + "num_input_tokens_seen": 314026855, + "step": 14563, + "time_per_iteration": 2.8505849838256836 + }, + { + "auxiliary_loss_clip": 0.01101397, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.03592539, + "balance_loss_mlp": 1.01989317, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.150186867524384, + "language_loss": 0.65335053, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67467451, + "num_input_tokens_seen": 314042830, + "step": 14564, + "time_per_iteration": 2.6807899475097656 + }, + { + "auxiliary_loss_clip": 0.0108363, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.03219843, + "balance_loss_mlp": 1.02243614, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.7263981097679406, + "language_loss": 0.70489979, + "learning_rate": 1.598376334037408e-07, + "loss": 0.7260654, + "num_input_tokens_seen": 314062225, + "step": 14565, + "time_per_iteration": 2.7556865215301514 + }, + { + "auxiliary_loss_clip": 0.01081649, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.03459334, + "balance_loss_mlp": 1.01977718, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.8976771187389563, + "language_loss": 0.7781319, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79926991, + "num_input_tokens_seen": 314082325, + "step": 14566, + "time_per_iteration": 2.7684552669525146 + }, + { + "auxiliary_loss_clip": 0.0107853, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.03781736, + "balance_loss_mlp": 1.02138209, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.5164826561243028, + "language_loss": 0.71235365, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73345989, + "num_input_tokens_seen": 314100310, + "step": 14567, + "time_per_iteration": 2.791724681854248 + }, + { + "auxiliary_loss_clip": 0.01076397, + "auxiliary_loss_mlp": 0.00749469, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.0002563, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.8365716176125357, + "language_loss": 0.74151385, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.75977254, + "num_input_tokens_seen": 314121330, + "step": 14568, + "time_per_iteration": 2.8368399143218994 + }, + { + "auxiliary_loss_clip": 0.01055769, + "auxiliary_loss_mlp": 0.01029986, + "balance_loss_clip": 1.02935088, + "balance_loss_mlp": 1.01959062, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 2.0478640229814196, + "language_loss": 0.87063706, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.89149463, + "num_input_tokens_seen": 314139875, + "step": 14569, + "time_per_iteration": 2.841388702392578 + }, + { + "auxiliary_loss_clip": 0.0106144, + "auxiliary_loss_mlp": 0.01028759, + "balance_loss_clip": 1.03195858, + "balance_loss_mlp": 1.01822114, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 4.241245428046948, + "language_loss": 0.74195492, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76285696, + "num_input_tokens_seen": 314157850, + "step": 14570, + "time_per_iteration": 2.8143608570098877 + }, + { + "auxiliary_loss_clip": 0.01089364, + "auxiliary_loss_mlp": 0.00749403, + "balance_loss_clip": 1.03384805, + "balance_loss_mlp": 1.00024152, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.6934527628584752, + "language_loss": 0.680116, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69850367, + "num_input_tokens_seen": 314176720, + "step": 14571, + "time_per_iteration": 2.847689628601074 + }, + { + "auxiliary_loss_clip": 0.01058201, + "auxiliary_loss_mlp": 0.01028695, + "balance_loss_clip": 1.03245211, + "balance_loss_mlp": 1.01843739, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 1.8851683505139505, + "language_loss": 0.62431812, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64518714, + "num_input_tokens_seen": 314196645, + "step": 14572, + "time_per_iteration": 2.847642421722412 + }, + { + "auxiliary_loss_clip": 0.01084037, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.03316092, + "balance_loss_mlp": 1.01609087, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.7084483146861424, + "language_loss": 0.73704696, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75814635, + "num_input_tokens_seen": 314217430, + "step": 14573, + "time_per_iteration": 4.321120977401733 + }, + { + "auxiliary_loss_clip": 0.01048562, + "auxiliary_loss_mlp": 0.00749099, + "balance_loss_clip": 1.03172839, + "balance_loss_mlp": 1.00016046, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 2.173571892043367, + "language_loss": 0.72700977, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74498647, + "num_input_tokens_seen": 314235310, + "step": 14574, + "time_per_iteration": 2.8529844284057617 + }, + { + "auxiliary_loss_clip": 0.01082907, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.03154731, + "balance_loss_mlp": 1.01931882, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 3.2469136237529854, + "language_loss": 0.75894225, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.78007424, + "num_input_tokens_seen": 314252355, + "step": 14575, + "time_per_iteration": 2.761566162109375 + }, + { + "auxiliary_loss_clip": 0.01068318, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.03366852, + "balance_loss_mlp": 1.02499604, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.7505358168893577, + "language_loss": 0.66931367, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.69035041, + "num_input_tokens_seen": 314272755, + "step": 14576, + "time_per_iteration": 2.855498790740967 + }, + { + "auxiliary_loss_clip": 0.01063749, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.02889752, + "balance_loss_mlp": 1.01773787, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 2.113416622623735, + "language_loss": 0.66787893, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68879616, + "num_input_tokens_seen": 314291365, + "step": 14577, + "time_per_iteration": 2.8338396549224854 + }, + { + "auxiliary_loss_clip": 0.01087677, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.03488863, + "balance_loss_mlp": 1.01878881, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 2.91937311108099, + "language_loss": 0.71162224, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73280168, + "num_input_tokens_seen": 314310075, + "step": 14578, + "time_per_iteration": 2.8085808753967285 + }, + { + "auxiliary_loss_clip": 0.01098185, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.03300619, + "balance_loss_mlp": 1.01962161, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 2.4845580182048423, + "language_loss": 0.71185148, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73314035, + "num_input_tokens_seen": 314325695, + "step": 14579, + "time_per_iteration": 2.701359748840332 + }, + { + "auxiliary_loss_clip": 0.01060105, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.02836514, + "balance_loss_mlp": 1.02203572, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.9838496319841419, + "language_loss": 0.70418358, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72512263, + "num_input_tokens_seen": 314343605, + "step": 14580, + "time_per_iteration": 2.750444173812866 + }, + { + "auxiliary_loss_clip": 0.01095997, + "auxiliary_loss_mlp": 0.00749287, + "balance_loss_clip": 1.03482747, + "balance_loss_mlp": 1.00028217, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.6231037515664488, + "language_loss": 0.65436697, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67281985, + "num_input_tokens_seen": 314364275, + "step": 14581, + "time_per_iteration": 2.694795608520508 + }, + { + "auxiliary_loss_clip": 0.01069581, + "auxiliary_loss_mlp": 0.01026902, + "balance_loss_clip": 1.03214335, + "balance_loss_mlp": 1.01675129, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.9669883844835, + "language_loss": 0.736727, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75769186, + "num_input_tokens_seen": 314385140, + "step": 14582, + "time_per_iteration": 2.766960859298706 + }, + { + "auxiliary_loss_clip": 0.01093777, + "auxiliary_loss_mlp": 0.00749315, + "balance_loss_clip": 1.03131557, + "balance_loss_mlp": 1.00021839, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.208480097277747, + "language_loss": 0.67391372, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.69234467, + "num_input_tokens_seen": 314403715, + "step": 14583, + "time_per_iteration": 2.68483304977417 + }, + { + "auxiliary_loss_clip": 0.01089293, + "auxiliary_loss_mlp": 0.00749312, + "balance_loss_clip": 1.03389955, + "balance_loss_mlp": 1.00024855, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.5969342252473173, + "language_loss": 0.79195476, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81034076, + "num_input_tokens_seen": 314421880, + "step": 14584, + "time_per_iteration": 2.7347021102905273 + }, + { + "auxiliary_loss_clip": 0.01066268, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.03222537, + "balance_loss_mlp": 1.01891589, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.615765831949759, + "language_loss": 0.72351348, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74447381, + "num_input_tokens_seen": 314441585, + "step": 14585, + "time_per_iteration": 4.4135353565216064 + }, + { + "auxiliary_loss_clip": 0.01072888, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.03058386, + "balance_loss_mlp": 1.01549649, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.7547876736876016, + "language_loss": 0.74013066, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76112771, + "num_input_tokens_seen": 314459020, + "step": 14586, + "time_per_iteration": 2.823411226272583 + }, + { + "auxiliary_loss_clip": 0.01095037, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.03181219, + "balance_loss_mlp": 1.01842988, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.878187747438904, + "language_loss": 0.78752732, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80877435, + "num_input_tokens_seen": 314478935, + "step": 14587, + "time_per_iteration": 2.6078126430511475 + }, + { + "auxiliary_loss_clip": 0.01086664, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.03410125, + "balance_loss_mlp": 1.01716757, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.750480009403196, + "language_loss": 0.73658884, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.75774789, + "num_input_tokens_seen": 314497635, + "step": 14588, + "time_per_iteration": 4.26780366897583 + }, + { + "auxiliary_loss_clip": 0.01030654, + "auxiliary_loss_mlp": 0.00749319, + "balance_loss_clip": 1.02973282, + "balance_loss_mlp": 1.00022268, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.9528121575061999, + "language_loss": 0.66481745, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68261725, + "num_input_tokens_seen": 314515445, + "step": 14589, + "time_per_iteration": 2.886685609817505 + }, + { + "auxiliary_loss_clip": 0.01087669, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.03479743, + "balance_loss_mlp": 1.02024698, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.8223531225274217, + "language_loss": 0.70530319, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.72648805, + "num_input_tokens_seen": 314533040, + "step": 14590, + "time_per_iteration": 2.7354133129119873 + }, + { + "auxiliary_loss_clip": 0.01071867, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.03317189, + "balance_loss_mlp": 1.02603447, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 2.0228885800924514, + "language_loss": 0.74900115, + "learning_rate": 1.558945991776086e-07, + "loss": 0.77009875, + "num_input_tokens_seen": 314548280, + "step": 14591, + "time_per_iteration": 2.752450704574585 + }, + { + "auxiliary_loss_clip": 0.01091434, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.03231955, + "balance_loss_mlp": 1.01506734, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.6819776783011622, + "language_loss": 0.79875433, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.81991625, + "num_input_tokens_seen": 314565345, + "step": 14592, + "time_per_iteration": 2.6947126388549805 + }, + { + "auxiliary_loss_clip": 0.01091919, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.03200269, + "balance_loss_mlp": 1.01717901, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.6266592493058454, + "language_loss": 0.82389534, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84508568, + "num_input_tokens_seen": 314584190, + "step": 14593, + "time_per_iteration": 2.7091190814971924 + }, + { + "auxiliary_loss_clip": 0.01076924, + "auxiliary_loss_mlp": 0.01023027, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.0124886, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 1.7839199426670667, + "language_loss": 0.7615571, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78255659, + "num_input_tokens_seen": 314605625, + "step": 14594, + "time_per_iteration": 2.8581740856170654 + }, + { + "auxiliary_loss_clip": 0.0104128, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.02717173, + "balance_loss_mlp": 1.02249742, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.4771438604154867, + "language_loss": 0.77852476, + "learning_rate": 1.552921717241651e-07, + "loss": 0.79929268, + "num_input_tokens_seen": 314622630, + "step": 14595, + "time_per_iteration": 2.8241231441497803 + }, + { + "auxiliary_loss_clip": 0.01064756, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03266811, + "balance_loss_mlp": 1.01884139, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.338070946531402, + "language_loss": 0.70619625, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72714043, + "num_input_tokens_seen": 314642460, + "step": 14596, + "time_per_iteration": 2.8416051864624023 + }, + { + "auxiliary_loss_clip": 0.01062746, + "auxiliary_loss_mlp": 0.01021386, + "balance_loss_clip": 1.03248453, + "balance_loss_mlp": 1.01123524, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.8112081755122578, + "language_loss": 0.86081946, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88166076, + "num_input_tokens_seen": 314659875, + "step": 14597, + "time_per_iteration": 2.8295676708221436 + }, + { + "auxiliary_loss_clip": 0.0108472, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.03328133, + "balance_loss_mlp": 1.01927364, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.7589151207230633, + "language_loss": 0.73061532, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.75175828, + "num_input_tokens_seen": 314680260, + "step": 14598, + "time_per_iteration": 4.218869924545288 + }, + { + "auxiliary_loss_clip": 0.01078429, + "auxiliary_loss_mlp": 0.00749232, + "balance_loss_clip": 1.03162313, + "balance_loss_mlp": 1.00013232, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.467264933783073, + "language_loss": 0.77758181, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.7958585, + "num_input_tokens_seen": 314696260, + "step": 14599, + "time_per_iteration": 2.7403459548950195 + }, + { + "auxiliary_loss_clip": 0.01064355, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.03122616, + "balance_loss_mlp": 1.01651287, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.235945743394874, + "language_loss": 0.67495012, + "learning_rate": 1.545407113589332e-07, + "loss": 0.69586653, + "num_input_tokens_seen": 314714215, + "step": 14600, + "time_per_iteration": 2.823370933532715 + }, + { + "auxiliary_loss_clip": 0.01085368, + "auxiliary_loss_mlp": 0.01035673, + "balance_loss_clip": 1.03198981, + "balance_loss_mlp": 1.0245862, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 1.7687334523233713, + "language_loss": 0.69577336, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71698374, + "num_input_tokens_seen": 314735700, + "step": 14601, + "time_per_iteration": 2.9569783210754395 + }, + { + "auxiliary_loss_clip": 0.01090687, + "auxiliary_loss_mlp": 0.01025675, + "balance_loss_clip": 1.03447711, + "balance_loss_mlp": 1.01511323, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 2.0299474398140385, + "language_loss": 0.73145694, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75262052, + "num_input_tokens_seen": 314753335, + "step": 14602, + "time_per_iteration": 2.7751083374023438 + }, + { + "auxiliary_loss_clip": 0.01095496, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.03291333, + "balance_loss_mlp": 1.01823437, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 2.0559205033086205, + "language_loss": 0.70979655, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73103803, + "num_input_tokens_seen": 314770800, + "step": 14603, + "time_per_iteration": 2.733630657196045 + }, + { + "auxiliary_loss_clip": 0.01005988, + "auxiliary_loss_mlp": 0.01008147, + "balance_loss_clip": 1.00597548, + "balance_loss_mlp": 1.0073061, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7746987148968175, + "language_loss": 0.54129684, + "learning_rate": 1.539408026725344e-07, + "loss": 0.5614382, + "num_input_tokens_seen": 314837275, + "step": 14604, + "time_per_iteration": 3.3162641525268555 + }, + { + "auxiliary_loss_clip": 0.00992782, + "auxiliary_loss_mlp": 0.01009258, + "balance_loss_clip": 1.00283682, + "balance_loss_mlp": 1.00801229, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7050402194328953, + "language_loss": 0.59417129, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61419165, + "num_input_tokens_seen": 314902220, + "step": 14605, + "time_per_iteration": 3.299149990081787 + }, + { + "auxiliary_loss_clip": 0.0105696, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.03386998, + "balance_loss_mlp": 1.01975441, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.8395307356781079, + "language_loss": 0.85185194, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87273276, + "num_input_tokens_seen": 314921645, + "step": 14606, + "time_per_iteration": 3.012761354446411 + }, + { + "auxiliary_loss_clip": 0.01098593, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.0340507, + "balance_loss_mlp": 1.01756382, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 3.410821268331083, + "language_loss": 0.70641428, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72769487, + "num_input_tokens_seen": 314939390, + "step": 14607, + "time_per_iteration": 2.7277214527130127 + }, + { + "auxiliary_loss_clip": 0.01073343, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.03180444, + "balance_loss_mlp": 1.0226264, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 2.146799540866835, + "language_loss": 0.7206285, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74168634, + "num_input_tokens_seen": 314959205, + "step": 14608, + "time_per_iteration": 2.853182077407837 + }, + { + "auxiliary_loss_clip": 0.01084711, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.0315516, + "balance_loss_mlp": 1.02371693, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 2.086915420008482, + "language_loss": 0.87284917, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89404368, + "num_input_tokens_seen": 314977485, + "step": 14609, + "time_per_iteration": 2.7494189739227295 + }, + { + "auxiliary_loss_clip": 0.01051156, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.03509235, + "balance_loss_mlp": 1.01855958, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.6917475521808236, + "language_loss": 0.70251155, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72331834, + "num_input_tokens_seen": 314997830, + "step": 14610, + "time_per_iteration": 2.9761786460876465 + }, + { + "auxiliary_loss_clip": 0.0107916, + "auxiliary_loss_mlp": 0.00749235, + "balance_loss_clip": 1.0343709, + "balance_loss_mlp": 1.00018251, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 2.219984740630385, + "language_loss": 0.80319577, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82147974, + "num_input_tokens_seen": 315016480, + "step": 14611, + "time_per_iteration": 2.8463077545166016 + }, + { + "auxiliary_loss_clip": 0.01097462, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.03394389, + "balance_loss_mlp": 1.0218358, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.4809649883071485, + "language_loss": 0.76292503, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78422505, + "num_input_tokens_seen": 315036135, + "step": 14612, + "time_per_iteration": 2.7212820053100586 + }, + { + "auxiliary_loss_clip": 0.01044946, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.03232503, + "balance_loss_mlp": 1.02420139, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.4780690617765018, + "language_loss": 0.72353041, + "learning_rate": 1.525951038422002e-07, + "loss": 0.744326, + "num_input_tokens_seen": 315057995, + "step": 14613, + "time_per_iteration": 4.520952463150024 + }, + { + "auxiliary_loss_clip": 0.00996447, + "auxiliary_loss_mlp": 0.01000316, + "balance_loss_clip": 1.00606894, + "balance_loss_mlp": 0.99929714, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.0406780323233324, + "language_loss": 0.64716196, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66712964, + "num_input_tokens_seen": 315104010, + "step": 14614, + "time_per_iteration": 3.069279909133911 + }, + { + "auxiliary_loss_clip": 0.01022582, + "auxiliary_loss_mlp": 0.00999408, + "balance_loss_clip": 1.00280738, + "balance_loss_mlp": 0.99837703, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6627735346745184, + "language_loss": 0.58555734, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60577726, + "num_input_tokens_seen": 315174550, + "step": 14615, + "time_per_iteration": 3.2580301761627197 + }, + { + "auxiliary_loss_clip": 0.01048512, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.02986789, + "balance_loss_mlp": 1.01679468, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 1.9447264630824, + "language_loss": 0.72637594, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.74713409, + "num_input_tokens_seen": 315191825, + "step": 14616, + "time_per_iteration": 2.903433084487915 + }, + { + "auxiliary_loss_clip": 0.01022542, + "auxiliary_loss_mlp": 0.01001578, + "balance_loss_clip": 1.00285769, + "balance_loss_mlp": 1.00052917, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8041781095363423, + "language_loss": 0.57960677, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.59984803, + "num_input_tokens_seen": 315255075, + "step": 14617, + "time_per_iteration": 3.300477981567383 + }, + { + "auxiliary_loss_clip": 0.01067878, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.03083301, + "balance_loss_mlp": 1.02135944, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.7032114002133545, + "language_loss": 0.83653092, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85753995, + "num_input_tokens_seen": 315273995, + "step": 14618, + "time_per_iteration": 3.006436824798584 + }, + { + "auxiliary_loss_clip": 0.01070066, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.0331645, + "balance_loss_mlp": 1.01714587, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 1.6409217188555993, + "language_loss": 0.69186211, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71283495, + "num_input_tokens_seen": 315294485, + "step": 14619, + "time_per_iteration": 2.850338935852051 + }, + { + "auxiliary_loss_clip": 0.01053805, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.03063881, + "balance_loss_mlp": 1.02108383, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 1.795700354953153, + "language_loss": 0.77437985, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79523402, + "num_input_tokens_seen": 315310420, + "step": 14620, + "time_per_iteration": 2.8380770683288574 + }, + { + "auxiliary_loss_clip": 0.01073436, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.03465509, + "balance_loss_mlp": 1.01826334, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.6126677536171878, + "language_loss": 0.79260439, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81363857, + "num_input_tokens_seen": 315330110, + "step": 14621, + "time_per_iteration": 2.9231760501861572 + }, + { + "auxiliary_loss_clip": 0.01076727, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.03226113, + "balance_loss_mlp": 1.02001953, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 1.8441918248797267, + "language_loss": 0.66746998, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.6885494, + "num_input_tokens_seen": 315350080, + "step": 14622, + "time_per_iteration": 2.8068456649780273 + }, + { + "auxiliary_loss_clip": 0.01070655, + "auxiliary_loss_mlp": 0.01032071, + "balance_loss_clip": 1.03402352, + "balance_loss_mlp": 1.02187872, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.950123141861385, + "language_loss": 0.73250639, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75353372, + "num_input_tokens_seen": 315366360, + "step": 14623, + "time_per_iteration": 2.8201849460601807 + }, + { + "auxiliary_loss_clip": 0.01040843, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.02747416, + "balance_loss_mlp": 1.0191319, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 1.617246823555053, + "language_loss": 0.78621566, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80692351, + "num_input_tokens_seen": 315385890, + "step": 14624, + "time_per_iteration": 2.905097484588623 + }, + { + "auxiliary_loss_clip": 0.01084501, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.03104854, + "balance_loss_mlp": 1.01892555, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.895351816992799, + "language_loss": 0.79777193, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.81892073, + "num_input_tokens_seen": 315403400, + "step": 14625, + "time_per_iteration": 4.339824676513672 + }, + { + "auxiliary_loss_clip": 0.01072629, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.03254235, + "balance_loss_mlp": 1.02202523, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 2.0570108495650725, + "language_loss": 0.74222976, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76328349, + "num_input_tokens_seen": 315423670, + "step": 14626, + "time_per_iteration": 2.852424383163452 + }, + { + "auxiliary_loss_clip": 0.01086022, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.03306389, + "balance_loss_mlp": 1.01991749, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.6232131237974954, + "language_loss": 0.70969564, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73086542, + "num_input_tokens_seen": 315446265, + "step": 14627, + "time_per_iteration": 2.9267489910125732 + }, + { + "auxiliary_loss_clip": 0.01067704, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.03634894, + "balance_loss_mlp": 1.01549888, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.9072716665130023, + "language_loss": 0.72004724, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74099213, + "num_input_tokens_seen": 315464655, + "step": 14628, + "time_per_iteration": 4.395111083984375 + }, + { + "auxiliary_loss_clip": 0.01061892, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.0298543, + "balance_loss_mlp": 1.01891804, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.810579892314501, + "language_loss": 0.68947673, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71040201, + "num_input_tokens_seen": 315481090, + "step": 14629, + "time_per_iteration": 2.8487703800201416 + }, + { + "auxiliary_loss_clip": 0.01070427, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.03007722, + "balance_loss_mlp": 1.02044547, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.5067743004839975, + "language_loss": 0.68714261, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70814431, + "num_input_tokens_seen": 315502010, + "step": 14630, + "time_per_iteration": 2.859794855117798 + }, + { + "auxiliary_loss_clip": 0.01070897, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.03183007, + "balance_loss_mlp": 1.02071857, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.6634436624783058, + "language_loss": 0.74047697, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76150382, + "num_input_tokens_seen": 315523040, + "step": 14631, + "time_per_iteration": 2.8483400344848633 + }, + { + "auxiliary_loss_clip": 0.01073515, + "auxiliary_loss_mlp": 0.00749133, + "balance_loss_clip": 1.03305387, + "balance_loss_mlp": 1.00014842, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 2.113869697396277, + "language_loss": 0.69160438, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.70983088, + "num_input_tokens_seen": 315541865, + "step": 14632, + "time_per_iteration": 2.9097349643707275 + }, + { + "auxiliary_loss_clip": 0.01075386, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.03314912, + "balance_loss_mlp": 1.01725769, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.90045001649323, + "language_loss": 0.6493448, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.6703707, + "num_input_tokens_seen": 315561470, + "step": 14633, + "time_per_iteration": 2.7876360416412354 + }, + { + "auxiliary_loss_clip": 0.01061463, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.03085768, + "balance_loss_mlp": 1.01970768, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.4062093645171512, + "language_loss": 0.84042567, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86134511, + "num_input_tokens_seen": 315583140, + "step": 14634, + "time_per_iteration": 2.917534589767456 + }, + { + "auxiliary_loss_clip": 0.01073921, + "auxiliary_loss_mlp": 0.00749296, + "balance_loss_clip": 1.03282225, + "balance_loss_mlp": 1.00022912, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.6856690088706934, + "language_loss": 0.79984605, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.81807822, + "num_input_tokens_seen": 315601935, + "step": 14635, + "time_per_iteration": 2.833425998687744 + }, + { + "auxiliary_loss_clip": 0.01068335, + "auxiliary_loss_mlp": 0.01022894, + "balance_loss_clip": 1.03308845, + "balance_loss_mlp": 1.01251698, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 2.4057792763189307, + "language_loss": 0.65409982, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67501211, + "num_input_tokens_seen": 315619995, + "step": 14636, + "time_per_iteration": 2.8347225189208984 + }, + { + "auxiliary_loss_clip": 0.0106807, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.03123724, + "balance_loss_mlp": 1.02225244, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 3.2168794522728574, + "language_loss": 0.70328826, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72430748, + "num_input_tokens_seen": 315637895, + "step": 14637, + "time_per_iteration": 2.767489433288574 + }, + { + "auxiliary_loss_clip": 0.01074452, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.03505325, + "balance_loss_mlp": 1.01770675, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 1.92991890245419, + "language_loss": 0.66059029, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.6816172, + "num_input_tokens_seen": 315655520, + "step": 14638, + "time_per_iteration": 4.55507755279541 + }, + { + "auxiliary_loss_clip": 0.01086706, + "auxiliary_loss_mlp": 0.01027733, + "balance_loss_clip": 1.03332496, + "balance_loss_mlp": 1.01710582, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 3.139450837910474, + "language_loss": 0.57978928, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60093367, + "num_input_tokens_seen": 315678955, + "step": 14639, + "time_per_iteration": 2.8758928775787354 + }, + { + "auxiliary_loss_clip": 0.01065595, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.03032708, + "balance_loss_mlp": 1.02170181, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 3.996276378153884, + "language_loss": 0.74241149, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76339412, + "num_input_tokens_seen": 315700360, + "step": 14640, + "time_per_iteration": 2.864820957183838 + }, + { + "auxiliary_loss_clip": 0.01069814, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_clip": 1.03069246, + "balance_loss_mlp": 1.02558362, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 1.776772721628758, + "language_loss": 0.69488978, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71595716, + "num_input_tokens_seen": 315719270, + "step": 14641, + "time_per_iteration": 2.789695978164673 + }, + { + "auxiliary_loss_clip": 0.01083443, + "auxiliary_loss_mlp": 0.01028005, + "balance_loss_clip": 1.03217816, + "balance_loss_mlp": 1.01603091, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 1.9202885738933066, + "language_loss": 0.84985614, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.87097067, + "num_input_tokens_seen": 315737425, + "step": 14642, + "time_per_iteration": 2.7386510372161865 + }, + { + "auxiliary_loss_clip": 0.01059459, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.03837395, + "balance_loss_mlp": 1.01971662, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 4.958668773173069, + "language_loss": 0.78523862, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.80614746, + "num_input_tokens_seen": 315755725, + "step": 14643, + "time_per_iteration": 2.864617347717285 + }, + { + "auxiliary_loss_clip": 0.01079418, + "auxiliary_loss_mlp": 0.01021966, + "balance_loss_clip": 1.02954698, + "balance_loss_mlp": 1.01202989, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.5669824026522623, + "language_loss": 0.73036796, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75138187, + "num_input_tokens_seen": 315773835, + "step": 14644, + "time_per_iteration": 2.724435806274414 + }, + { + "auxiliary_loss_clip": 0.01098678, + "auxiliary_loss_mlp": 0.00749375, + "balance_loss_clip": 1.03373885, + "balance_loss_mlp": 1.00027919, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 1.9060778111230705, + "language_loss": 0.79566002, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81414056, + "num_input_tokens_seen": 315790615, + "step": 14645, + "time_per_iteration": 2.6960723400115967 + }, + { + "auxiliary_loss_clip": 0.01079252, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.03053474, + "balance_loss_mlp": 1.02079058, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 2.042287481590555, + "language_loss": 0.64186692, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66297591, + "num_input_tokens_seen": 315811010, + "step": 14646, + "time_per_iteration": 2.723066806793213 + }, + { + "auxiliary_loss_clip": 0.01066775, + "auxiliary_loss_mlp": 0.01030655, + "balance_loss_clip": 1.0343498, + "balance_loss_mlp": 1.01904953, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.137993118014194, + "language_loss": 0.77407098, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79504526, + "num_input_tokens_seen": 315828130, + "step": 14647, + "time_per_iteration": 2.792618751525879 + }, + { + "auxiliary_loss_clip": 0.01094041, + "auxiliary_loss_mlp": 0.01028252, + "balance_loss_clip": 1.03303075, + "balance_loss_mlp": 1.0179168, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 1.952054443116244, + "language_loss": 0.75582612, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77704906, + "num_input_tokens_seen": 315844900, + "step": 14648, + "time_per_iteration": 2.774667501449585 + }, + { + "auxiliary_loss_clip": 0.01074386, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.03000581, + "balance_loss_mlp": 1.01720834, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 2.077836765926061, + "language_loss": 0.65482241, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67584574, + "num_input_tokens_seen": 315863745, + "step": 14649, + "time_per_iteration": 2.7930986881256104 + }, + { + "auxiliary_loss_clip": 0.01059431, + "auxiliary_loss_mlp": 0.01027608, + "balance_loss_clip": 1.0329051, + "balance_loss_mlp": 1.01651537, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.2692416775457704, + "language_loss": 0.62740099, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64827144, + "num_input_tokens_seen": 315885765, + "step": 14650, + "time_per_iteration": 2.8537964820861816 + }, + { + "auxiliary_loss_clip": 0.01068624, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.03140378, + "balance_loss_mlp": 1.01751924, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.458528710441327, + "language_loss": 0.72924197, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.75020486, + "num_input_tokens_seen": 315907340, + "step": 14651, + "time_per_iteration": 2.7676138877868652 + }, + { + "auxiliary_loss_clip": 0.01088766, + "auxiliary_loss_mlp": 0.01027956, + "balance_loss_clip": 1.03389883, + "balance_loss_mlp": 1.01595759, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.8704193882624591, + "language_loss": 0.7222442, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74341142, + "num_input_tokens_seen": 315924935, + "step": 14652, + "time_per_iteration": 2.7269015312194824 + }, + { + "auxiliary_loss_clip": 0.01075284, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.03112221, + "balance_loss_mlp": 1.0211879, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 6.440504547714996, + "language_loss": 0.74479866, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.76587051, + "num_input_tokens_seen": 315943165, + "step": 14653, + "time_per_iteration": 4.24505615234375 + }, + { + "auxiliary_loss_clip": 0.01099942, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.03398848, + "balance_loss_mlp": 1.01914799, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 1.6828513648370473, + "language_loss": 0.71129572, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73259926, + "num_input_tokens_seen": 315961340, + "step": 14654, + "time_per_iteration": 2.6889758110046387 + }, + { + "auxiliary_loss_clip": 0.01049232, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.03043127, + "balance_loss_mlp": 1.01767242, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 1.834662960711539, + "language_loss": 0.71339023, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73417705, + "num_input_tokens_seen": 315981335, + "step": 14655, + "time_per_iteration": 2.9054486751556396 + }, + { + "auxiliary_loss_clip": 0.01046386, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.03009319, + "balance_loss_mlp": 1.01780319, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.7746339116898378, + "language_loss": 0.81253517, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83329844, + "num_input_tokens_seen": 316001325, + "step": 14656, + "time_per_iteration": 2.865051507949829 + }, + { + "auxiliary_loss_clip": 0.01073134, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.03548026, + "balance_loss_mlp": 1.01728451, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.8681421823165023, + "language_loss": 0.68537986, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70638764, + "num_input_tokens_seen": 316022540, + "step": 14657, + "time_per_iteration": 2.8117921352386475 + }, + { + "auxiliary_loss_clip": 0.01074485, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.03582418, + "balance_loss_mlp": 1.02159035, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 2.0926614015069127, + "language_loss": 0.83875614, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.85982597, + "num_input_tokens_seen": 316037735, + "step": 14658, + "time_per_iteration": 2.877751111984253 + }, + { + "auxiliary_loss_clip": 0.01080475, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.03396916, + "balance_loss_mlp": 1.0204798, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 1.7084497233716087, + "language_loss": 0.7730934, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79422021, + "num_input_tokens_seen": 316058105, + "step": 14659, + "time_per_iteration": 2.834092140197754 + }, + { + "auxiliary_loss_clip": 0.0107414, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.03262913, + "balance_loss_mlp": 1.02180529, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 2.57592701644134, + "language_loss": 0.60234326, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62340969, + "num_input_tokens_seen": 316074415, + "step": 14660, + "time_per_iteration": 2.841801166534424 + }, + { + "auxiliary_loss_clip": 0.0108181, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.03585911, + "balance_loss_mlp": 1.02121544, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.7352275637097603, + "language_loss": 0.7799083, + "learning_rate": 1.455139770123972e-07, + "loss": 0.8010639, + "num_input_tokens_seen": 316094405, + "step": 14661, + "time_per_iteration": 3.009276866912842 + }, + { + "auxiliary_loss_clip": 0.01055546, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.03489244, + "balance_loss_mlp": 1.02793574, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.9235525900843669, + "language_loss": 0.77070737, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79165459, + "num_input_tokens_seen": 316113390, + "step": 14662, + "time_per_iteration": 2.7688891887664795 + }, + { + "auxiliary_loss_clip": 0.01044438, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.02937722, + "balance_loss_mlp": 1.01678073, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 8.2199973007194, + "language_loss": 0.74137604, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.76208699, + "num_input_tokens_seen": 316131085, + "step": 14663, + "time_per_iteration": 2.911759376525879 + }, + { + "auxiliary_loss_clip": 0.0108792, + "auxiliary_loss_mlp": 0.00749207, + "balance_loss_clip": 1.03459859, + "balance_loss_mlp": 1.00021446, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.6033329502014735, + "language_loss": 0.69792891, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71630013, + "num_input_tokens_seen": 316151440, + "step": 14664, + "time_per_iteration": 2.780169725418091 + }, + { + "auxiliary_loss_clip": 0.01015281, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.02774739, + "balance_loss_mlp": 1.02312851, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.478319791454668, + "language_loss": 0.81065941, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83115244, + "num_input_tokens_seen": 316170750, + "step": 14665, + "time_per_iteration": 4.921172380447388 + }, + { + "auxiliary_loss_clip": 0.01071099, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.03230488, + "balance_loss_mlp": 1.02330518, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 2.359093264803708, + "language_loss": 0.58447289, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60552013, + "num_input_tokens_seen": 316187265, + "step": 14666, + "time_per_iteration": 3.2852492332458496 + }, + { + "auxiliary_loss_clip": 0.01088788, + "auxiliary_loss_mlp": 0.01032104, + "balance_loss_clip": 1.03532946, + "balance_loss_mlp": 1.02045774, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 2.135954278366092, + "language_loss": 0.8357724, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85698128, + "num_input_tokens_seen": 316206555, + "step": 14667, + "time_per_iteration": 2.7834832668304443 + }, + { + "auxiliary_loss_clip": 0.0109662, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.03360677, + "balance_loss_mlp": 1.01995969, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.9087830727576534, + "language_loss": 0.61980873, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64108783, + "num_input_tokens_seen": 316225210, + "step": 14668, + "time_per_iteration": 4.266758680343628 + }, + { + "auxiliary_loss_clip": 0.01083418, + "auxiliary_loss_mlp": 0.01026107, + "balance_loss_clip": 1.03342175, + "balance_loss_mlp": 1.0166539, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.2860234067648095, + "language_loss": 0.56704068, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.5881359, + "num_input_tokens_seen": 316242685, + "step": 14669, + "time_per_iteration": 2.7660205364227295 + }, + { + "auxiliary_loss_clip": 0.01095025, + "auxiliary_loss_mlp": 0.01028428, + "balance_loss_clip": 1.03229141, + "balance_loss_mlp": 1.01787806, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 6.176938319230521, + "language_loss": 0.71648586, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73772043, + "num_input_tokens_seen": 316260935, + "step": 14670, + "time_per_iteration": 2.696254014968872 + }, + { + "auxiliary_loss_clip": 0.0107574, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.02963233, + "balance_loss_mlp": 1.01680672, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 2.1779232308638847, + "language_loss": 0.74107236, + "learning_rate": 1.44059115283929e-07, + "loss": 0.76210499, + "num_input_tokens_seen": 316281190, + "step": 14671, + "time_per_iteration": 2.7467472553253174 + }, + { + "auxiliary_loss_clip": 0.01076752, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.03151274, + "balance_loss_mlp": 1.01701856, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 2.1357763102777483, + "language_loss": 0.84688169, + "learning_rate": 1.43914016096218e-07, + "loss": 0.86793768, + "num_input_tokens_seen": 316297115, + "step": 14672, + "time_per_iteration": 2.7146785259246826 + }, + { + "auxiliary_loss_clip": 0.01056953, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.02929091, + "balance_loss_mlp": 1.01589465, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5248201753934283, + "language_loss": 0.72393829, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74477458, + "num_input_tokens_seen": 316318235, + "step": 14673, + "time_per_iteration": 2.792872905731201 + }, + { + "auxiliary_loss_clip": 0.01002761, + "auxiliary_loss_mlp": 0.01002805, + "balance_loss_clip": 1.00294185, + "balance_loss_mlp": 1.00191116, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.7982356326734206, + "language_loss": 0.49410409, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51415974, + "num_input_tokens_seen": 316384705, + "step": 14674, + "time_per_iteration": 3.316185235977173 + }, + { + "auxiliary_loss_clip": 0.01069639, + "auxiliary_loss_mlp": 0.0074934, + "balance_loss_clip": 1.03081417, + "balance_loss_mlp": 1.00022793, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 2.3741121126948337, + "language_loss": 0.76019138, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.77838111, + "num_input_tokens_seen": 316401165, + "step": 14675, + "time_per_iteration": 2.6943507194519043 + }, + { + "auxiliary_loss_clip": 0.01070622, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.03233957, + "balance_loss_mlp": 1.02001655, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 1.8102698818829024, + "language_loss": 0.79568684, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81670332, + "num_input_tokens_seen": 316418780, + "step": 14676, + "time_per_iteration": 2.715120315551758 + }, + { + "auxiliary_loss_clip": 0.00995955, + "auxiliary_loss_mlp": 0.00998755, + "balance_loss_clip": 1.01081038, + "balance_loss_mlp": 0.99767572, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6900126504417943, + "language_loss": 0.54732609, + "learning_rate": 1.431895760121109e-07, + "loss": 0.5672732, + "num_input_tokens_seen": 316482030, + "step": 14677, + "time_per_iteration": 3.3691718578338623 + }, + { + "auxiliary_loss_clip": 0.01093284, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.03179455, + "balance_loss_mlp": 1.01604843, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 4.250632041533301, + "language_loss": 0.65112996, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67232955, + "num_input_tokens_seen": 316499175, + "step": 14678, + "time_per_iteration": 2.592259407043457 + }, + { + "auxiliary_loss_clip": 0.01071124, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.02995181, + "balance_loss_mlp": 1.01878095, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 2.123343376494301, + "language_loss": 0.70814824, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.72915828, + "num_input_tokens_seen": 316519495, + "step": 14679, + "time_per_iteration": 4.342397212982178 + }, + { + "auxiliary_loss_clip": 0.01070554, + "auxiliary_loss_mlp": 0.01027753, + "balance_loss_clip": 1.03136158, + "balance_loss_mlp": 1.01844287, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.601568885336667, + "language_loss": 0.64078563, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66176873, + "num_input_tokens_seen": 316538180, + "step": 14680, + "time_per_iteration": 2.771594762802124 + }, + { + "auxiliary_loss_clip": 0.01095293, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.03347194, + "balance_loss_mlp": 1.0220629, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.431961892215522, + "language_loss": 0.76982117, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79109859, + "num_input_tokens_seen": 316551750, + "step": 14681, + "time_per_iteration": 2.5954813957214355 + }, + { + "auxiliary_loss_clip": 0.01070621, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.03022945, + "balance_loss_mlp": 1.02001524, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.6614081039562343, + "language_loss": 0.72884429, + "learning_rate": 1.424668961888047e-07, + "loss": 0.74986607, + "num_input_tokens_seen": 316570680, + "step": 14682, + "time_per_iteration": 2.7228646278381348 + }, + { + "auxiliary_loss_clip": 0.01056017, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.03626919, + "balance_loss_mlp": 1.0178721, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.8797468855934039, + "language_loss": 0.74643838, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76729751, + "num_input_tokens_seen": 316588635, + "step": 14683, + "time_per_iteration": 2.8007445335388184 + }, + { + "auxiliary_loss_clip": 0.0106254, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.03218079, + "balance_loss_mlp": 1.0185225, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.9483360090093145, + "language_loss": 0.65755546, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67847687, + "num_input_tokens_seen": 316607550, + "step": 14684, + "time_per_iteration": 2.7969446182250977 + }, + { + "auxiliary_loss_clip": 0.01080783, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.03187823, + "balance_loss_mlp": 1.01630068, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.6821662706170508, + "language_loss": 0.69476026, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.715832, + "num_input_tokens_seen": 316624460, + "step": 14685, + "time_per_iteration": 2.714348077774048 + }, + { + "auxiliary_loss_clip": 0.01041927, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.03172016, + "balance_loss_mlp": 1.02228117, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 2.101272614423035, + "language_loss": 0.7436918, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76446521, + "num_input_tokens_seen": 316640765, + "step": 14686, + "time_per_iteration": 2.8392343521118164 + }, + { + "auxiliary_loss_clip": 0.01042391, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.02927637, + "balance_loss_mlp": 1.01697433, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 1.8119647440763043, + "language_loss": 0.62797523, + "learning_rate": 1.417459773114007e-07, + "loss": 0.64867818, + "num_input_tokens_seen": 316656120, + "step": 14687, + "time_per_iteration": 2.837313175201416 + }, + { + "auxiliary_loss_clip": 0.01088148, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.03374577, + "balance_loss_mlp": 1.02100134, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 2.019652624158236, + "language_loss": 0.68645972, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.70765728, + "num_input_tokens_seen": 316676095, + "step": 14688, + "time_per_iteration": 2.768284797668457 + }, + { + "auxiliary_loss_clip": 0.01083679, + "auxiliary_loss_mlp": 0.01025397, + "balance_loss_clip": 1.03323269, + "balance_loss_mlp": 1.01454878, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.6374228018420733, + "language_loss": 0.67028332, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69137406, + "num_input_tokens_seen": 316696235, + "step": 14689, + "time_per_iteration": 2.807143211364746 + }, + { + "auxiliary_loss_clip": 0.01069159, + "auxiliary_loss_mlp": 0.01026941, + "balance_loss_clip": 1.03527796, + "balance_loss_mlp": 1.01680803, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.355926278739295, + "language_loss": 0.74504405, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76600504, + "num_input_tokens_seen": 316719680, + "step": 14690, + "time_per_iteration": 2.811765193939209 + }, + { + "auxiliary_loss_clip": 0.01070222, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.0309639, + "balance_loss_mlp": 1.0209837, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4062193061576858, + "language_loss": 0.72578967, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.74681985, + "num_input_tokens_seen": 316739830, + "step": 14691, + "time_per_iteration": 2.687939405441284 + }, + { + "auxiliary_loss_clip": 0.01065497, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.03350675, + "balance_loss_mlp": 1.01396644, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 1.6741439864880236, + "language_loss": 0.51442778, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.53533649, + "num_input_tokens_seen": 316758105, + "step": 14692, + "time_per_iteration": 2.6358158588409424 + }, + { + "auxiliary_loss_clip": 0.01061131, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.03349972, + "balance_loss_mlp": 1.01948047, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 1.975043953782265, + "language_loss": 0.6032244, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62413883, + "num_input_tokens_seen": 316777455, + "step": 14693, + "time_per_iteration": 4.095088958740234 + }, + { + "auxiliary_loss_clip": 0.01094555, + "auxiliary_loss_mlp": 0.01023446, + "balance_loss_clip": 1.03480339, + "balance_loss_mlp": 1.01350403, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 2.7272427194784097, + "language_loss": 0.75372404, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77490401, + "num_input_tokens_seen": 316796300, + "step": 14694, + "time_per_iteration": 2.5307960510253906 + }, + { + "auxiliary_loss_clip": 0.0107856, + "auxiliary_loss_mlp": 0.0102922, + "balance_loss_clip": 1.03088689, + "balance_loss_mlp": 1.01884913, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 2.660759146519985, + "language_loss": 0.73059547, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.75167328, + "num_input_tokens_seen": 316819090, + "step": 14695, + "time_per_iteration": 2.5974268913269043 + }, + { + "auxiliary_loss_clip": 0.01079616, + "auxiliary_loss_mlp": 0.010257, + "balance_loss_clip": 1.03152907, + "balance_loss_mlp": 1.01576972, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.5871220775754293, + "language_loss": 0.8017562, + "learning_rate": 1.404527630961998e-07, + "loss": 0.8228094, + "num_input_tokens_seen": 316839250, + "step": 14696, + "time_per_iteration": 2.576113700866699 + }, + { + "auxiliary_loss_clip": 0.01056012, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03269517, + "balance_loss_mlp": 1.0207566, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 2.118740914298157, + "language_loss": 0.74856269, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76943415, + "num_input_tokens_seen": 316861315, + "step": 14697, + "time_per_iteration": 2.701951026916504 + }, + { + "auxiliary_loss_clip": 0.01075566, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.03372455, + "balance_loss_mlp": 1.01982605, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 1.9218909631468666, + "language_loss": 0.71751302, + "learning_rate": 1.401661576761779e-07, + "loss": 0.73857331, + "num_input_tokens_seen": 316879325, + "step": 14698, + "time_per_iteration": 2.684347629547119 + }, + { + "auxiliary_loss_clip": 0.01012109, + "auxiliary_loss_mlp": 0.01001291, + "balance_loss_clip": 1.0025115, + "balance_loss_mlp": 1.00044465, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.803097442634315, + "language_loss": 0.53696322, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55709732, + "num_input_tokens_seen": 316936425, + "step": 14699, + "time_per_iteration": 3.153076648712158 + }, + { + "auxiliary_loss_clip": 0.01073646, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.03180456, + "balance_loss_mlp": 1.01638055, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.5818336870454814, + "language_loss": 0.76645446, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.78746992, + "num_input_tokens_seen": 316956360, + "step": 14700, + "time_per_iteration": 2.6589691638946533 + }, + { + "auxiliary_loss_clip": 0.010627, + "auxiliary_loss_mlp": 0.010279, + "balance_loss_clip": 1.03218651, + "balance_loss_mlp": 1.01736164, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 5.438840504471579, + "language_loss": 0.73095906, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75186509, + "num_input_tokens_seen": 316975295, + "step": 14701, + "time_per_iteration": 2.663505792617798 + }, + { + "auxiliary_loss_clip": 0.01070838, + "auxiliary_loss_mlp": 0.01036054, + "balance_loss_clip": 1.03079367, + "balance_loss_mlp": 1.02343583, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 4.991500905273372, + "language_loss": 0.70515931, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.72622824, + "num_input_tokens_seen": 316994520, + "step": 14702, + "time_per_iteration": 2.6933698654174805 + }, + { + "auxiliary_loss_clip": 0.01056586, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.03083563, + "balance_loss_mlp": 1.0250566, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 1.573513098294073, + "language_loss": 0.71457583, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73552144, + "num_input_tokens_seen": 317018095, + "step": 14703, + "time_per_iteration": 2.9540419578552246 + }, + { + "auxiliary_loss_clip": 0.01052294, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.03315353, + "balance_loss_mlp": 1.01942837, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 2.372485746837716, + "language_loss": 0.66673374, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68755007, + "num_input_tokens_seen": 317035755, + "step": 14704, + "time_per_iteration": 2.6741180419921875 + }, + { + "auxiliary_loss_clip": 0.01074203, + "auxiliary_loss_mlp": 0.01026669, + "balance_loss_clip": 1.03106821, + "balance_loss_mlp": 1.01683426, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 2.3708810428217757, + "language_loss": 0.7084167, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72942543, + "num_input_tokens_seen": 317055765, + "step": 14705, + "time_per_iteration": 4.025746583938599 + }, + { + "auxiliary_loss_clip": 0.01074174, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.03192639, + "balance_loss_mlp": 1.02244592, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.5384429758423315, + "language_loss": 0.70959759, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73065889, + "num_input_tokens_seen": 317077955, + "step": 14706, + "time_per_iteration": 2.651235818862915 + }, + { + "auxiliary_loss_clip": 0.01085526, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.03223228, + "balance_loss_mlp": 1.01881254, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.7665439798966913, + "language_loss": 0.74473596, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76588202, + "num_input_tokens_seen": 317095825, + "step": 14707, + "time_per_iteration": 2.5583455562591553 + }, + { + "auxiliary_loss_clip": 0.0099727, + "auxiliary_loss_mlp": 0.01001642, + "balance_loss_clip": 1.00680566, + "balance_loss_mlp": 1.00059342, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7975471613684099, + "language_loss": 0.60408944, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62407857, + "num_input_tokens_seen": 317152875, + "step": 14708, + "time_per_iteration": 4.54711127281189 + }, + { + "auxiliary_loss_clip": 0.01062532, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.02880394, + "balance_loss_mlp": 1.01843429, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 1.8239750491676963, + "language_loss": 0.67661518, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.697523, + "num_input_tokens_seen": 317176725, + "step": 14709, + "time_per_iteration": 2.7784311771392822 + }, + { + "auxiliary_loss_clip": 0.01079244, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.0334444, + "balance_loss_mlp": 1.02040863, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.634387995849144, + "language_loss": 0.62349737, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64461577, + "num_input_tokens_seen": 317206880, + "step": 14710, + "time_per_iteration": 2.8392913341522217 + }, + { + "auxiliary_loss_clip": 0.01062612, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.03171241, + "balance_loss_mlp": 1.01753592, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.179762158730358, + "language_loss": 0.63461739, + "learning_rate": 1.38310100580431e-07, + "loss": 0.65551704, + "num_input_tokens_seen": 317224135, + "step": 14711, + "time_per_iteration": 2.636876344680786 + }, + { + "auxiliary_loss_clip": 0.01057871, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.03027129, + "balance_loss_mlp": 1.01720679, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 2.038238056233733, + "language_loss": 0.76341069, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78427196, + "num_input_tokens_seen": 317244505, + "step": 14712, + "time_per_iteration": 2.7326982021331787 + }, + { + "auxiliary_loss_clip": 0.01031077, + "auxiliary_loss_mlp": 0.0102805, + "balance_loss_clip": 1.02645361, + "balance_loss_mlp": 1.01674914, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 2.399432058004727, + "language_loss": 0.80762219, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.82821345, + "num_input_tokens_seen": 317257830, + "step": 14713, + "time_per_iteration": 2.674286365509033 + }, + { + "auxiliary_loss_clip": 0.01073125, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.03156841, + "balance_loss_mlp": 1.01667678, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.4459967016969233, + "language_loss": 0.55342025, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57442915, + "num_input_tokens_seen": 317278430, + "step": 14714, + "time_per_iteration": 2.6539700031280518 + }, + { + "auxiliary_loss_clip": 0.01039461, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.03365326, + "balance_loss_mlp": 1.02310634, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.9790797264764395, + "language_loss": 0.73624557, + "learning_rate": 1.377414057838755e-07, + "loss": 0.75699008, + "num_input_tokens_seen": 317295970, + "step": 14715, + "time_per_iteration": 2.7837891578674316 + }, + { + "auxiliary_loss_clip": 0.01086031, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.03221583, + "balance_loss_mlp": 1.0193646, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 1.5211667813796412, + "language_loss": 0.75249684, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77365285, + "num_input_tokens_seen": 317316185, + "step": 14716, + "time_per_iteration": 2.582092046737671 + }, + { + "auxiliary_loss_clip": 0.01061783, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.03241515, + "balance_loss_mlp": 1.02337956, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 2.110378308130191, + "language_loss": 0.71277797, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73373485, + "num_input_tokens_seen": 317333275, + "step": 14717, + "time_per_iteration": 2.7252931594848633 + }, + { + "auxiliary_loss_clip": 0.01081535, + "auxiliary_loss_mlp": 0.0102844, + "balance_loss_clip": 1.03247011, + "balance_loss_mlp": 1.01834869, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 2.1212513116567595, + "language_loss": 0.74273372, + "learning_rate": 1.373156261464208e-07, + "loss": 0.7638334, + "num_input_tokens_seen": 317351245, + "step": 14718, + "time_per_iteration": 2.687211036682129 + }, + { + "auxiliary_loss_clip": 0.01045235, + "auxiliary_loss_mlp": 0.01024971, + "balance_loss_clip": 1.03287196, + "balance_loss_mlp": 1.01378918, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 2.0111786334570683, + "language_loss": 0.78621632, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80691838, + "num_input_tokens_seen": 317370740, + "step": 14719, + "time_per_iteration": 4.366718530654907 + }, + { + "auxiliary_loss_clip": 0.01097041, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.03323102, + "balance_loss_mlp": 1.01506519, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.8615907108024732, + "language_loss": 0.71940708, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74063373, + "num_input_tokens_seen": 317388370, + "step": 14720, + "time_per_iteration": 2.5424187183380127 + }, + { + "auxiliary_loss_clip": 0.01075529, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.03242862, + "balance_loss_mlp": 1.01806164, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 2.237914065436522, + "language_loss": 0.82587147, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84691882, + "num_input_tokens_seen": 317407390, + "step": 14721, + "time_per_iteration": 2.6184163093566895 + }, + { + "auxiliary_loss_clip": 0.01074215, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.03027511, + "balance_loss_mlp": 1.01979983, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.5828184550152662, + "language_loss": 0.62227732, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.6433326, + "num_input_tokens_seen": 317430825, + "step": 14722, + "time_per_iteration": 2.8971645832061768 + }, + { + "auxiliary_loss_clip": 0.01086631, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.03244901, + "balance_loss_mlp": 1.01811862, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 1.9417800101894402, + "language_loss": 0.68764758, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.70880145, + "num_input_tokens_seen": 317451905, + "step": 14723, + "time_per_iteration": 2.7682600021362305 + }, + { + "auxiliary_loss_clip": 0.01056544, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.02858603, + "balance_loss_mlp": 1.02153707, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.6175047967997451, + "language_loss": 0.77981901, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80071062, + "num_input_tokens_seen": 317470030, + "step": 14724, + "time_per_iteration": 2.7562415599823 + }, + { + "auxiliary_loss_clip": 0.01013179, + "auxiliary_loss_mlp": 0.0099935, + "balance_loss_clip": 1.00344026, + "balance_loss_mlp": 0.99828303, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.8000532029910746, + "language_loss": 0.58935702, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60948229, + "num_input_tokens_seen": 317527460, + "step": 14725, + "time_per_iteration": 3.093061685562134 + }, + { + "auxiliary_loss_clip": 0.01072448, + "auxiliary_loss_mlp": 0.00749609, + "balance_loss_clip": 1.03058529, + "balance_loss_mlp": 1.00025797, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 3.485536654559485, + "language_loss": 0.69171524, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.70993578, + "num_input_tokens_seen": 317544070, + "step": 14726, + "time_per_iteration": 2.6765451431274414 + }, + { + "auxiliary_loss_clip": 0.01080617, + "auxiliary_loss_mlp": 0.0074925, + "balance_loss_clip": 1.03159189, + "balance_loss_mlp": 1.00021279, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.3226055370155485, + "language_loss": 0.69545954, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71375817, + "num_input_tokens_seen": 317570275, + "step": 14727, + "time_per_iteration": 2.7873055934906006 + }, + { + "auxiliary_loss_clip": 0.01077479, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.03722262, + "balance_loss_mlp": 1.02346146, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 2.1264342137590315, + "language_loss": 0.6980654, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.71918786, + "num_input_tokens_seen": 317590160, + "step": 14728, + "time_per_iteration": 2.6777122020721436 + }, + { + "auxiliary_loss_clip": 0.01063005, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.0320344, + "balance_loss_mlp": 1.01790965, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.9828931697060805, + "language_loss": 0.66328549, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68419504, + "num_input_tokens_seen": 317608340, + "step": 14729, + "time_per_iteration": 2.696845054626465 + }, + { + "auxiliary_loss_clip": 0.01066474, + "auxiliary_loss_mlp": 0.01032364, + "balance_loss_clip": 1.03372145, + "balance_loss_mlp": 1.02250552, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.7847747131949803, + "language_loss": 0.62933481, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65032321, + "num_input_tokens_seen": 317629910, + "step": 14730, + "time_per_iteration": 2.7575626373291016 + }, + { + "auxiliary_loss_clip": 0.01059859, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.0313468, + "balance_loss_mlp": 1.01785731, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.609806290718879, + "language_loss": 0.79351842, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81439805, + "num_input_tokens_seen": 317650265, + "step": 14731, + "time_per_iteration": 2.7283830642700195 + }, + { + "auxiliary_loss_clip": 0.01058981, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.0288043, + "balance_loss_mlp": 1.02092791, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.6401373331843192, + "language_loss": 0.83193189, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85283816, + "num_input_tokens_seen": 317669045, + "step": 14732, + "time_per_iteration": 4.155628442764282 + }, + { + "auxiliary_loss_clip": 0.01005557, + "auxiliary_loss_mlp": 0.01004616, + "balance_loss_clip": 1.00586462, + "balance_loss_mlp": 1.00366843, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.897764663889021, + "language_loss": 0.59965873, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61976045, + "num_input_tokens_seen": 317728065, + "step": 14733, + "time_per_iteration": 3.2160632610321045 + }, + { + "auxiliary_loss_clip": 0.01097128, + "auxiliary_loss_mlp": 0.00749272, + "balance_loss_clip": 1.03421962, + "balance_loss_mlp": 1.00024426, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.8820899764806989, + "language_loss": 0.6638363, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68230033, + "num_input_tokens_seen": 317746120, + "step": 14734, + "time_per_iteration": 2.7521097660064697 + }, + { + "auxiliary_loss_clip": 0.01067603, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.03349531, + "balance_loss_mlp": 1.02565265, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 2.032781655644467, + "language_loss": 0.75533408, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77637035, + "num_input_tokens_seen": 317762280, + "step": 14735, + "time_per_iteration": 2.5360724925994873 + }, + { + "auxiliary_loss_clip": 0.01056846, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.03139436, + "balance_loss_mlp": 1.02057791, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.7535878486547118, + "language_loss": 0.70217979, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72306633, + "num_input_tokens_seen": 317780615, + "step": 14736, + "time_per_iteration": 2.707101345062256 + }, + { + "auxiliary_loss_clip": 0.01064836, + "auxiliary_loss_mlp": 0.0102661, + "balance_loss_clip": 1.0339011, + "balance_loss_mlp": 1.01567256, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 2.2827613443687254, + "language_loss": 0.84354341, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86445785, + "num_input_tokens_seen": 317798830, + "step": 14737, + "time_per_iteration": 2.595555067062378 + }, + { + "auxiliary_loss_clip": 0.01070069, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.03316569, + "balance_loss_mlp": 1.02141356, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 2.04134060374156, + "language_loss": 0.68516034, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70619118, + "num_input_tokens_seen": 317819235, + "step": 14738, + "time_per_iteration": 2.8403589725494385 + }, + { + "auxiliary_loss_clip": 0.01090048, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.03446603, + "balance_loss_mlp": 1.01739478, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.6165008992257908, + "language_loss": 0.74909973, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77028942, + "num_input_tokens_seen": 317836785, + "step": 14739, + "time_per_iteration": 2.6482653617858887 + }, + { + "auxiliary_loss_clip": 0.0108318, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.03307593, + "balance_loss_mlp": 1.01900864, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.8545968796568504, + "language_loss": 0.87406063, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89518225, + "num_input_tokens_seen": 317854225, + "step": 14740, + "time_per_iteration": 2.6353824138641357 + }, + { + "auxiliary_loss_clip": 0.01046687, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.0308857, + "balance_loss_mlp": 1.01923823, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 2.5042975257794016, + "language_loss": 0.63591045, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65668166, + "num_input_tokens_seen": 317874865, + "step": 14741, + "time_per_iteration": 2.764965057373047 + }, + { + "auxiliary_loss_clip": 0.01096151, + "auxiliary_loss_mlp": 0.01025604, + "balance_loss_clip": 1.03331578, + "balance_loss_mlp": 1.01516104, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 1.696179008429235, + "language_loss": 0.72964191, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.75085944, + "num_input_tokens_seen": 317892830, + "step": 14742, + "time_per_iteration": 2.54915189743042 + }, + { + "auxiliary_loss_clip": 0.01080525, + "auxiliary_loss_mlp": 0.00749289, + "balance_loss_clip": 1.03066969, + "balance_loss_mlp": 1.00027037, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.948385293391686, + "language_loss": 0.59180874, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61010695, + "num_input_tokens_seen": 317911780, + "step": 14743, + "time_per_iteration": 2.689279317855835 + }, + { + "auxiliary_loss_clip": 0.0106689, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.03231645, + "balance_loss_mlp": 1.018291, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 4.115342100655592, + "language_loss": 0.60053766, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62150931, + "num_input_tokens_seen": 317932855, + "step": 14744, + "time_per_iteration": 2.7030892372131348 + }, + { + "auxiliary_loss_clip": 0.01081332, + "auxiliary_loss_mlp": 0.00749377, + "balance_loss_clip": 1.03479397, + "balance_loss_mlp": 1.00022078, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.8627253803210102, + "language_loss": 0.76844025, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78674734, + "num_input_tokens_seen": 317952090, + "step": 14745, + "time_per_iteration": 4.1717634201049805 + }, + { + "auxiliary_loss_clip": 0.01094944, + "auxiliary_loss_mlp": 0.00749277, + "balance_loss_clip": 1.03367877, + "balance_loss_mlp": 1.00022972, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.7443434656760555, + "language_loss": 0.76957524, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.78801751, + "num_input_tokens_seen": 317970370, + "step": 14746, + "time_per_iteration": 2.5276782512664795 + }, + { + "auxiliary_loss_clip": 0.01073934, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.03533244, + "balance_loss_mlp": 1.02263331, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 1.835184269723658, + "language_loss": 0.76542413, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78650749, + "num_input_tokens_seen": 317989125, + "step": 14747, + "time_per_iteration": 2.603766679763794 + }, + { + "auxiliary_loss_clip": 0.01069995, + "auxiliary_loss_mlp": 0.00749143, + "balance_loss_clip": 1.03008115, + "balance_loss_mlp": 1.00022018, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.628145975944361, + "language_loss": 0.82875884, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84695017, + "num_input_tokens_seen": 318007820, + "step": 14748, + "time_per_iteration": 4.234054803848267 + }, + { + "auxiliary_loss_clip": 0.01087463, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.03451085, + "balance_loss_mlp": 1.01980591, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 2.8049518509460585, + "language_loss": 0.77541852, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79660332, + "num_input_tokens_seen": 318030435, + "step": 14749, + "time_per_iteration": 2.889357328414917 + }, + { + "auxiliary_loss_clip": 0.01032256, + "auxiliary_loss_mlp": 0.00749412, + "balance_loss_clip": 1.03240252, + "balance_loss_mlp": 1.00033331, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 4.50200170808836, + "language_loss": 0.69579494, + "learning_rate": 1.328135602550451e-07, + "loss": 0.7136116, + "num_input_tokens_seen": 318049465, + "step": 14750, + "time_per_iteration": 2.767787456512451 + }, + { + "auxiliary_loss_clip": 0.01085572, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.03338897, + "balance_loss_mlp": 1.01628947, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 4.694729966256252, + "language_loss": 0.59102231, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61214286, + "num_input_tokens_seen": 318067760, + "step": 14751, + "time_per_iteration": 2.5680673122406006 + }, + { + "auxiliary_loss_clip": 0.01096629, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.03352749, + "balance_loss_mlp": 1.0208807, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.8801405935308, + "language_loss": 0.81690586, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83819336, + "num_input_tokens_seen": 318082785, + "step": 14752, + "time_per_iteration": 2.614400863647461 + }, + { + "auxiliary_loss_clip": 0.01078231, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.03542328, + "balance_loss_mlp": 1.02015615, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 1.867555649944097, + "language_loss": 0.80515409, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82625353, + "num_input_tokens_seen": 318101925, + "step": 14753, + "time_per_iteration": 2.630810260772705 + }, + { + "auxiliary_loss_clip": 0.01095158, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.03295767, + "balance_loss_mlp": 1.01992881, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.7404300330499973, + "language_loss": 0.65091115, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67216802, + "num_input_tokens_seen": 318119945, + "step": 14754, + "time_per_iteration": 2.508758544921875 + }, + { + "auxiliary_loss_clip": 0.01098422, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.03455722, + "balance_loss_mlp": 1.01922584, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 5.502844405575041, + "language_loss": 0.74648333, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76776803, + "num_input_tokens_seen": 318139685, + "step": 14755, + "time_per_iteration": 2.5413403511047363 + }, + { + "auxiliary_loss_clip": 0.01067633, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.02844095, + "balance_loss_mlp": 1.0202142, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.4706296519055995, + "language_loss": 0.78016067, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.80115747, + "num_input_tokens_seen": 318160375, + "step": 14756, + "time_per_iteration": 2.6036767959594727 + }, + { + "auxiliary_loss_clip": 0.01077326, + "auxiliary_loss_mlp": 0.01034405, + "balance_loss_clip": 1.03395522, + "balance_loss_mlp": 1.02360415, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 2.994604281342035, + "language_loss": 0.76578796, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78690529, + "num_input_tokens_seen": 318177995, + "step": 14757, + "time_per_iteration": 2.6186063289642334 + }, + { + "auxiliary_loss_clip": 0.01031318, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.02864039, + "balance_loss_mlp": 1.02217424, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 4.2728676676250705, + "language_loss": 0.68087143, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70151794, + "num_input_tokens_seen": 318197030, + "step": 14758, + "time_per_iteration": 2.755514621734619 + }, + { + "auxiliary_loss_clip": 0.01096879, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.03358555, + "balance_loss_mlp": 1.01938987, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.7123587629787655, + "language_loss": 0.69144595, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71272802, + "num_input_tokens_seen": 318221780, + "step": 14759, + "time_per_iteration": 2.7281839847564697 + }, + { + "auxiliary_loss_clip": 0.01093245, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.03124523, + "balance_loss_mlp": 1.01786709, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 1.8509027749286138, + "language_loss": 0.74892008, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.77013594, + "num_input_tokens_seen": 318239710, + "step": 14760, + "time_per_iteration": 3.9601247310638428 + }, + { + "auxiliary_loss_clip": 0.01078629, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.03433371, + "balance_loss_mlp": 1.01758075, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 4.049728834610488, + "language_loss": 0.76395744, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.78503019, + "num_input_tokens_seen": 318257425, + "step": 14761, + "time_per_iteration": 2.5383803844451904 + }, + { + "auxiliary_loss_clip": 0.01098312, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.0332396, + "balance_loss_mlp": 1.02132392, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 1.5603633959391732, + "language_loss": 0.61139953, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63270259, + "num_input_tokens_seen": 318278485, + "step": 14762, + "time_per_iteration": 2.5280613899230957 + }, + { + "auxiliary_loss_clip": 0.01080963, + "auxiliary_loss_mlp": 0.0102898, + "balance_loss_clip": 1.03144193, + "balance_loss_mlp": 1.01744032, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 2.902510810190777, + "language_loss": 0.63904595, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.6601454, + "num_input_tokens_seen": 318297560, + "step": 14763, + "time_per_iteration": 2.521641492843628 + }, + { + "auxiliary_loss_clip": 0.01072001, + "auxiliary_loss_mlp": 0.00749298, + "balance_loss_clip": 1.03371871, + "balance_loss_mlp": 1.00022864, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.925574555155933, + "language_loss": 0.71605849, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.73427153, + "num_input_tokens_seen": 318313060, + "step": 14764, + "time_per_iteration": 2.533043384552002 + }, + { + "auxiliary_loss_clip": 0.01101001, + "auxiliary_loss_mlp": 0.01038488, + "balance_loss_clip": 1.03453469, + "balance_loss_mlp": 1.02712774, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.1033113934140015, + "language_loss": 0.65969706, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68109196, + "num_input_tokens_seen": 318332030, + "step": 14765, + "time_per_iteration": 2.5255165100097656 + }, + { + "auxiliary_loss_clip": 0.01056353, + "auxiliary_loss_mlp": 0.01025751, + "balance_loss_clip": 1.03254437, + "balance_loss_mlp": 1.01603556, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.6768989968724115, + "language_loss": 0.76256245, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78338349, + "num_input_tokens_seen": 318351090, + "step": 14766, + "time_per_iteration": 2.6111161708831787 + }, + { + "auxiliary_loss_clip": 0.01063856, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.0312413, + "balance_loss_mlp": 1.02109253, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 1.8919372900372318, + "language_loss": 0.72934645, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75029969, + "num_input_tokens_seen": 318372000, + "step": 14767, + "time_per_iteration": 2.595715284347534 + }, + { + "auxiliary_loss_clip": 0.01093909, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.03318584, + "balance_loss_mlp": 1.01774991, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 2.128631350544637, + "language_loss": 0.70886278, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73008132, + "num_input_tokens_seen": 318391530, + "step": 14768, + "time_per_iteration": 2.5977752208709717 + }, + { + "auxiliary_loss_clip": 0.01079005, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.0308547, + "balance_loss_mlp": 1.01942158, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.6996054230585917, + "language_loss": 0.70354301, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72463334, + "num_input_tokens_seen": 318410690, + "step": 14769, + "time_per_iteration": 2.5361666679382324 + }, + { + "auxiliary_loss_clip": 0.01067382, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.03170419, + "balance_loss_mlp": 1.02363515, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 1.8867676678018919, + "language_loss": 0.67161369, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69262356, + "num_input_tokens_seen": 318427380, + "step": 14770, + "time_per_iteration": 2.548762798309326 + }, + { + "auxiliary_loss_clip": 0.01081896, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.03241146, + "balance_loss_mlp": 1.01864052, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.922565059748801, + "language_loss": 0.65318143, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67429447, + "num_input_tokens_seen": 318448530, + "step": 14771, + "time_per_iteration": 2.6111600399017334 + }, + { + "auxiliary_loss_clip": 0.01071018, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.030756, + "balance_loss_mlp": 1.0154494, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.5826607086193745, + "language_loss": 0.82301474, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84398985, + "num_input_tokens_seen": 318468655, + "step": 14772, + "time_per_iteration": 4.198916912078857 + }, + { + "auxiliary_loss_clip": 0.01066211, + "auxiliary_loss_mlp": 0.01021752, + "balance_loss_clip": 1.02869856, + "balance_loss_mlp": 1.01195908, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.5364853174783544, + "language_loss": 0.76247394, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78335357, + "num_input_tokens_seen": 318488740, + "step": 14773, + "time_per_iteration": 2.5886263847351074 + }, + { + "auxiliary_loss_clip": 0.01070649, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.03123701, + "balance_loss_mlp": 1.01584387, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 2.9287014593614975, + "language_loss": 0.75214362, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77310777, + "num_input_tokens_seen": 318508810, + "step": 14774, + "time_per_iteration": 2.6107239723205566 + }, + { + "auxiliary_loss_clip": 0.01050876, + "auxiliary_loss_mlp": 0.00749218, + "balance_loss_clip": 1.03236985, + "balance_loss_mlp": 1.00020647, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 4.438532118923555, + "language_loss": 0.72445083, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.74245173, + "num_input_tokens_seen": 318526860, + "step": 14775, + "time_per_iteration": 2.606814384460449 + }, + { + "auxiliary_loss_clip": 0.01094891, + "auxiliary_loss_mlp": 0.01025964, + "balance_loss_clip": 1.03271985, + "balance_loss_mlp": 1.0154916, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.5955239949551239, + "language_loss": 0.80312711, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82433563, + "num_input_tokens_seen": 318545180, + "step": 14776, + "time_per_iteration": 2.4467968940734863 + }, + { + "auxiliary_loss_clip": 0.01089421, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.03175795, + "balance_loss_mlp": 1.02203035, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 1.9512606366843102, + "language_loss": 0.69734502, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71857125, + "num_input_tokens_seen": 318564350, + "step": 14777, + "time_per_iteration": 2.554830551147461 + }, + { + "auxiliary_loss_clip": 0.01078066, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.02875376, + "balance_loss_mlp": 1.0234133, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 1.80637586192894, + "language_loss": 0.70774859, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72887325, + "num_input_tokens_seen": 318582275, + "step": 14778, + "time_per_iteration": 2.487755060195923 + }, + { + "auxiliary_loss_clip": 0.01097546, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.03413486, + "balance_loss_mlp": 1.02325273, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.7905948034318622, + "language_loss": 0.77356482, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79487938, + "num_input_tokens_seen": 318601230, + "step": 14779, + "time_per_iteration": 2.492642402648926 + }, + { + "auxiliary_loss_clip": 0.01004581, + "auxiliary_loss_mlp": 0.01001767, + "balance_loss_clip": 1.00741148, + "balance_loss_mlp": 1.00071812, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8736140212102518, + "language_loss": 0.56791747, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58798099, + "num_input_tokens_seen": 318645595, + "step": 14780, + "time_per_iteration": 2.9224655628204346 + }, + { + "auxiliary_loss_clip": 0.01023164, + "auxiliary_loss_mlp": 0.00998748, + "balance_loss_clip": 1.00330591, + "balance_loss_mlp": 0.99774086, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7985868116607461, + "language_loss": 0.62419516, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64441425, + "num_input_tokens_seen": 318707850, + "step": 14781, + "time_per_iteration": 3.099404811859131 + }, + { + "auxiliary_loss_clip": 0.00959883, + "auxiliary_loss_mlp": 0.010017, + "balance_loss_clip": 1.00988996, + "balance_loss_mlp": 1.00080574, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7882366137978241, + "language_loss": 0.58173299, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60134888, + "num_input_tokens_seen": 318764915, + "step": 14782, + "time_per_iteration": 3.2765982151031494 + }, + { + "auxiliary_loss_clip": 0.01095838, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.03489447, + "balance_loss_mlp": 1.01829934, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.699915019686732, + "language_loss": 0.65885389, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.68009424, + "num_input_tokens_seen": 318785660, + "step": 14783, + "time_per_iteration": 3.085683584213257 + }, + { + "auxiliary_loss_clip": 0.01100251, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.03447926, + "balance_loss_mlp": 1.01951742, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.5076812124144734, + "language_loss": 0.77334642, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79465413, + "num_input_tokens_seen": 318806080, + "step": 14784, + "time_per_iteration": 2.5106923580169678 + }, + { + "auxiliary_loss_clip": 0.01067605, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.03164768, + "balance_loss_mlp": 1.01951396, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.423959099342896, + "language_loss": 0.60206437, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62305063, + "num_input_tokens_seen": 318826445, + "step": 14785, + "time_per_iteration": 4.700080156326294 + }, + { + "auxiliary_loss_clip": 0.01058871, + "auxiliary_loss_mlp": 0.01029693, + "balance_loss_clip": 1.03039169, + "balance_loss_mlp": 1.01893425, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.7202542692453529, + "language_loss": 0.65027249, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67115808, + "num_input_tokens_seen": 318843915, + "step": 14786, + "time_per_iteration": 2.5967955589294434 + }, + { + "auxiliary_loss_clip": 0.01064173, + "auxiliary_loss_mlp": 0.00749519, + "balance_loss_clip": 1.03161979, + "balance_loss_mlp": 1.00023615, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 3.0401285793025883, + "language_loss": 0.85621327, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87435019, + "num_input_tokens_seen": 318859670, + "step": 14787, + "time_per_iteration": 2.585827112197876 + }, + { + "auxiliary_loss_clip": 0.01071121, + "auxiliary_loss_mlp": 0.01028212, + "balance_loss_clip": 1.03180408, + "balance_loss_mlp": 1.01745963, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 1.9604982947977072, + "language_loss": 0.70997256, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73096585, + "num_input_tokens_seen": 318877855, + "step": 14788, + "time_per_iteration": 2.5565245151519775 + }, + { + "auxiliary_loss_clip": 0.01094035, + "auxiliary_loss_mlp": 0.0102793, + "balance_loss_clip": 1.03379273, + "balance_loss_mlp": 1.01776099, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.6529145783602623, + "language_loss": 0.69927782, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72049743, + "num_input_tokens_seen": 318896045, + "step": 14789, + "time_per_iteration": 3.990177631378174 + }, + { + "auxiliary_loss_clip": 0.01088033, + "auxiliary_loss_mlp": 0.01022484, + "balance_loss_clip": 1.03454542, + "balance_loss_mlp": 1.01148105, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.7768180224586623, + "language_loss": 0.70460004, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72570521, + "num_input_tokens_seen": 318915515, + "step": 14790, + "time_per_iteration": 2.515690803527832 + }, + { + "auxiliary_loss_clip": 0.01077704, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.0344224, + "balance_loss_mlp": 1.01598334, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.8439925293208443, + "language_loss": 0.7312668, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75230342, + "num_input_tokens_seen": 318934305, + "step": 14791, + "time_per_iteration": 2.5433638095855713 + }, + { + "auxiliary_loss_clip": 0.01060719, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.03167748, + "balance_loss_mlp": 1.02298069, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.5041864931573905, + "language_loss": 0.7396974, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.7606467, + "num_input_tokens_seen": 318953880, + "step": 14792, + "time_per_iteration": 2.6065337657928467 + }, + { + "auxiliary_loss_clip": 0.01024354, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.02928805, + "balance_loss_mlp": 1.0189693, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 4.93859688314116, + "language_loss": 0.65940058, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.6799531, + "num_input_tokens_seen": 318971395, + "step": 14793, + "time_per_iteration": 2.7224650382995605 + }, + { + "auxiliary_loss_clip": 0.01066644, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.03232241, + "balance_loss_mlp": 1.0179925, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.7005505708107427, + "language_loss": 0.71813285, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73909396, + "num_input_tokens_seen": 318990580, + "step": 14794, + "time_per_iteration": 2.6278128623962402 + }, + { + "auxiliary_loss_clip": 0.01086699, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.03402591, + "balance_loss_mlp": 1.01631439, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 1.5817564129565307, + "language_loss": 0.7520076, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77315402, + "num_input_tokens_seen": 319010040, + "step": 14795, + "time_per_iteration": 2.6159162521362305 + }, + { + "auxiliary_loss_clip": 0.01001555, + "auxiliary_loss_mlp": 0.01002991, + "balance_loss_clip": 1.01074636, + "balance_loss_mlp": 1.00198364, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7728735909928289, + "language_loss": 0.56152552, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58157098, + "num_input_tokens_seen": 319063860, + "step": 14796, + "time_per_iteration": 3.0557541847229004 + }, + { + "auxiliary_loss_clip": 0.01098832, + "auxiliary_loss_mlp": 0.01027882, + "balance_loss_clip": 1.03404522, + "balance_loss_mlp": 1.01600289, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 1.9077175172263758, + "language_loss": 0.70140064, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72266781, + "num_input_tokens_seen": 319082335, + "step": 14797, + "time_per_iteration": 2.5037083625793457 + }, + { + "auxiliary_loss_clip": 0.01006848, + "auxiliary_loss_mlp": 0.01001563, + "balance_loss_clip": 1.00572395, + "balance_loss_mlp": 1.00054955, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7453909401747619, + "language_loss": 0.57984555, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.59992963, + "num_input_tokens_seen": 319147075, + "step": 14798, + "time_per_iteration": 3.1767067909240723 + }, + { + "auxiliary_loss_clip": 0.01084216, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.03174829, + "balance_loss_mlp": 1.01536942, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.820432440113801, + "language_loss": 0.79361778, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81472611, + "num_input_tokens_seen": 319166630, + "step": 14799, + "time_per_iteration": 2.5701584815979004 + }, + { + "auxiliary_loss_clip": 0.01012457, + "auxiliary_loss_mlp": 0.01001148, + "balance_loss_clip": 1.00282073, + "balance_loss_mlp": 1.00014675, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.9083143888353352, + "language_loss": 0.58122861, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60136473, + "num_input_tokens_seen": 319221865, + "step": 14800, + "time_per_iteration": 4.486688613891602 + }, + { + "auxiliary_loss_clip": 0.01086753, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.03440607, + "balance_loss_mlp": 1.02016783, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.4488066303727773, + "language_loss": 0.6626271, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68379825, + "num_input_tokens_seen": 319240710, + "step": 14801, + "time_per_iteration": 2.6239359378814697 + }, + { + "auxiliary_loss_clip": 0.0105814, + "auxiliary_loss_mlp": 0.01035686, + "balance_loss_clip": 1.03294587, + "balance_loss_mlp": 1.02306151, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.3502289001511554, + "language_loss": 0.75686324, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77780151, + "num_input_tokens_seen": 319256495, + "step": 14802, + "time_per_iteration": 2.5849485397338867 + }, + { + "auxiliary_loss_clip": 0.0108228, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.03454661, + "balance_loss_mlp": 1.01817918, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 2.094366686429793, + "language_loss": 0.73222661, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75333333, + "num_input_tokens_seen": 319273620, + "step": 14803, + "time_per_iteration": 2.5126328468322754 + }, + { + "auxiliary_loss_clip": 0.01067256, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.03186297, + "balance_loss_mlp": 1.02044535, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 1.802284738852848, + "language_loss": 0.71664727, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73763245, + "num_input_tokens_seen": 319291720, + "step": 14804, + "time_per_iteration": 2.6418845653533936 + }, + { + "auxiliary_loss_clip": 0.01082357, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.03223205, + "balance_loss_mlp": 1.01952195, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.7774572945057134, + "language_loss": 0.81256044, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83368772, + "num_input_tokens_seen": 319310380, + "step": 14805, + "time_per_iteration": 2.581512212753296 + }, + { + "auxiliary_loss_clip": 0.0108876, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.03368592, + "balance_loss_mlp": 1.02078819, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 3.523493677501772, + "language_loss": 0.67209506, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69330174, + "num_input_tokens_seen": 319331765, + "step": 14806, + "time_per_iteration": 2.5753629207611084 + }, + { + "auxiliary_loss_clip": 0.01069541, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.02987909, + "balance_loss_mlp": 1.01878452, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 2.022105855241874, + "language_loss": 0.67685086, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.697846, + "num_input_tokens_seen": 319349135, + "step": 14807, + "time_per_iteration": 2.565305709838867 + }, + { + "auxiliary_loss_clip": 0.01070683, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.03070569, + "balance_loss_mlp": 1.01697314, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.8587859485671734, + "language_loss": 0.75457752, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77555609, + "num_input_tokens_seen": 319368410, + "step": 14808, + "time_per_iteration": 2.609316825866699 + }, + { + "auxiliary_loss_clip": 0.01063284, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.0309813, + "balance_loss_mlp": 1.01939881, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 2.613774069547725, + "language_loss": 0.81392306, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83485079, + "num_input_tokens_seen": 319387535, + "step": 14809, + "time_per_iteration": 2.5906989574432373 + }, + { + "auxiliary_loss_clip": 0.01086209, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.03230762, + "balance_loss_mlp": 1.01741672, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.8131844921468347, + "language_loss": 0.68359292, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70473254, + "num_input_tokens_seen": 319407210, + "step": 14810, + "time_per_iteration": 2.6005964279174805 + }, + { + "auxiliary_loss_clip": 0.01060597, + "auxiliary_loss_mlp": 0.01025643, + "balance_loss_clip": 1.02895331, + "balance_loss_mlp": 1.01393652, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 6.20759799892488, + "language_loss": 0.70207298, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72293544, + "num_input_tokens_seen": 319425340, + "step": 14811, + "time_per_iteration": 2.6120593547821045 + }, + { + "auxiliary_loss_clip": 0.01061335, + "auxiliary_loss_mlp": 0.00749398, + "balance_loss_clip": 1.03115368, + "balance_loss_mlp": 1.00030339, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 1.904789555779886, + "language_loss": 0.65652275, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67463005, + "num_input_tokens_seen": 319448150, + "step": 14812, + "time_per_iteration": 2.91027569770813 + }, + { + "auxiliary_loss_clip": 0.0105078, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.0329721, + "balance_loss_mlp": 1.01991081, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.9274792130339373, + "language_loss": 0.68652952, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70733643, + "num_input_tokens_seen": 319466115, + "step": 14813, + "time_per_iteration": 4.150950908660889 + }, + { + "auxiliary_loss_clip": 0.01066809, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.0282917, + "balance_loss_mlp": 1.02327716, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 2.0494801315690494, + "language_loss": 0.7515431, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77259624, + "num_input_tokens_seen": 319485255, + "step": 14814, + "time_per_iteration": 2.655270576477051 + }, + { + "auxiliary_loss_clip": 0.01088056, + "auxiliary_loss_mlp": 0.01027483, + "balance_loss_clip": 1.03297424, + "balance_loss_mlp": 1.01592529, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 2.806541450549402, + "language_loss": 0.74355376, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76470912, + "num_input_tokens_seen": 319501800, + "step": 14815, + "time_per_iteration": 2.5584237575531006 + }, + { + "auxiliary_loss_clip": 0.01054433, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.02971768, + "balance_loss_mlp": 1.01851702, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 1.7954261054998253, + "language_loss": 0.75338215, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77422416, + "num_input_tokens_seen": 319520415, + "step": 14816, + "time_per_iteration": 2.7146811485290527 + }, + { + "auxiliary_loss_clip": 0.01075357, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.03132594, + "balance_loss_mlp": 1.01758933, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 1.8033229986909989, + "language_loss": 0.77597213, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.79701078, + "num_input_tokens_seen": 319538410, + "step": 14817, + "time_per_iteration": 2.596313953399658 + }, + { + "auxiliary_loss_clip": 0.0099555, + "auxiliary_loss_mlp": 0.0099984, + "balance_loss_clip": 1.00566578, + "balance_loss_mlp": 0.99888009, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7697238894555439, + "language_loss": 0.56491178, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58486569, + "num_input_tokens_seen": 319602565, + "step": 14818, + "time_per_iteration": 3.1914286613464355 + }, + { + "auxiliary_loss_clip": 0.01047909, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.03199053, + "balance_loss_mlp": 1.01988292, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.798279947490571, + "language_loss": 0.6434055, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66418791, + "num_input_tokens_seen": 319624645, + "step": 14819, + "time_per_iteration": 2.6716113090515137 + }, + { + "auxiliary_loss_clip": 0.0108649, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.03293753, + "balance_loss_mlp": 1.01799667, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 1.6220916878860339, + "language_loss": 0.78370231, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80486286, + "num_input_tokens_seen": 319644040, + "step": 14820, + "time_per_iteration": 2.603982925415039 + }, + { + "auxiliary_loss_clip": 0.01056203, + "auxiliary_loss_mlp": 0.00749148, + "balance_loss_clip": 1.03141308, + "balance_loss_mlp": 1.00015903, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.7693829511331205, + "language_loss": 0.76394248, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78199595, + "num_input_tokens_seen": 319663930, + "step": 14821, + "time_per_iteration": 2.638669013977051 + }, + { + "auxiliary_loss_clip": 0.01015202, + "auxiliary_loss_mlp": 0.00746589, + "balance_loss_clip": 1.00813067, + "balance_loss_mlp": 0.99983132, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7908662162471514, + "language_loss": 0.592327, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.60994494, + "num_input_tokens_seen": 319721245, + "step": 14822, + "time_per_iteration": 2.9907212257385254 + }, + { + "auxiliary_loss_clip": 0.01081466, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.03348124, + "balance_loss_mlp": 1.01744735, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 1.8575470737653021, + "language_loss": 0.69071084, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71180964, + "num_input_tokens_seen": 319741200, + "step": 14823, + "time_per_iteration": 2.5528836250305176 + }, + { + "auxiliary_loss_clip": 0.01076262, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.03108037, + "balance_loss_mlp": 1.01871896, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.644575722656606, + "language_loss": 0.68766403, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.7087301, + "num_input_tokens_seen": 319759265, + "step": 14824, + "time_per_iteration": 2.5682640075683594 + }, + { + "auxiliary_loss_clip": 0.01042782, + "auxiliary_loss_mlp": 0.0103726, + "balance_loss_clip": 1.02917635, + "balance_loss_mlp": 1.02445674, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 2.3146446194773707, + "language_loss": 0.70925152, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.73005193, + "num_input_tokens_seen": 319777560, + "step": 14825, + "time_per_iteration": 4.179324626922607 + }, + { + "auxiliary_loss_clip": 0.01072498, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03070557, + "balance_loss_mlp": 1.01995635, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 2.3957727798588633, + "language_loss": 0.70936096, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73039854, + "num_input_tokens_seen": 319794125, + "step": 14826, + "time_per_iteration": 2.5523476600646973 + }, + { + "auxiliary_loss_clip": 0.01085718, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.03389215, + "balance_loss_mlp": 1.01664484, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 1.9883901967211235, + "language_loss": 0.75066382, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.771788, + "num_input_tokens_seen": 319810310, + "step": 14827, + "time_per_iteration": 2.5983073711395264 + }, + { + "auxiliary_loss_clip": 0.01087989, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.03327632, + "balance_loss_mlp": 1.01790559, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.8304204577498777, + "language_loss": 0.78061372, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80178249, + "num_input_tokens_seen": 319828505, + "step": 14828, + "time_per_iteration": 4.186205863952637 + }, + { + "auxiliary_loss_clip": 0.01055771, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.03171849, + "balance_loss_mlp": 1.01938069, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 2.438859165976149, + "language_loss": 0.75374597, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77460992, + "num_input_tokens_seen": 319848680, + "step": 14829, + "time_per_iteration": 2.67280912399292 + }, + { + "auxiliary_loss_clip": 0.01096357, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.03289485, + "balance_loss_mlp": 1.02245641, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.5119386497831875, + "language_loss": 0.84638631, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86767197, + "num_input_tokens_seen": 319868835, + "step": 14830, + "time_per_iteration": 2.575455904006958 + }, + { + "auxiliary_loss_clip": 0.01081141, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.03121102, + "balance_loss_mlp": 1.01788688, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.531050867761043, + "language_loss": 0.75071913, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.77180797, + "num_input_tokens_seen": 319891585, + "step": 14831, + "time_per_iteration": 2.59812331199646 + }, + { + "auxiliary_loss_clip": 0.01087894, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.03189516, + "balance_loss_mlp": 1.01697969, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.7691413395832418, + "language_loss": 0.72871268, + "learning_rate": 1.216083607088847e-07, + "loss": 0.74987245, + "num_input_tokens_seen": 319910315, + "step": 14832, + "time_per_iteration": 2.599120855331421 + }, + { + "auxiliary_loss_clip": 0.01030616, + "auxiliary_loss_mlp": 0.00749532, + "balance_loss_clip": 1.02834439, + "balance_loss_mlp": 1.00028229, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 1.8268582787355718, + "language_loss": 0.66710126, + "learning_rate": 1.214746621848355e-07, + "loss": 0.68490273, + "num_input_tokens_seen": 319932275, + "step": 14833, + "time_per_iteration": 2.744523286819458 + }, + { + "auxiliary_loss_clip": 0.01093032, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.0358932, + "balance_loss_mlp": 1.02171862, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 2.7317465109835615, + "language_loss": 0.73821855, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.75948399, + "num_input_tokens_seen": 319955335, + "step": 14834, + "time_per_iteration": 2.666011333465576 + }, + { + "auxiliary_loss_clip": 0.01048777, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.03067446, + "balance_loss_mlp": 1.01888847, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 1.7678664741702363, + "language_loss": 0.78879851, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.80958378, + "num_input_tokens_seen": 319973990, + "step": 14835, + "time_per_iteration": 2.671698570251465 + }, + { + "auxiliary_loss_clip": 0.01092395, + "auxiliary_loss_mlp": 0.01025334, + "balance_loss_clip": 1.03143096, + "balance_loss_mlp": 1.01533246, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.5640489985415187, + "language_loss": 0.74152845, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76270568, + "num_input_tokens_seen": 319995555, + "step": 14836, + "time_per_iteration": 2.5793981552124023 + }, + { + "auxiliary_loss_clip": 0.010739, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.03023696, + "balance_loss_mlp": 1.02001238, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.175009663893799, + "language_loss": 0.68639624, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.70744658, + "num_input_tokens_seen": 320012385, + "step": 14837, + "time_per_iteration": 2.5447654724121094 + }, + { + "auxiliary_loss_clip": 0.01035923, + "auxiliary_loss_mlp": 0.01031773, + "balance_loss_clip": 1.0287025, + "balance_loss_mlp": 1.02018023, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.8786923327781566, + "language_loss": 0.67734641, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69802338, + "num_input_tokens_seen": 320032390, + "step": 14838, + "time_per_iteration": 2.7030467987060547 + }, + { + "auxiliary_loss_clip": 0.01083204, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.03050363, + "balance_loss_mlp": 1.01637101, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 2.160135893155369, + "language_loss": 0.76088703, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.7820031, + "num_input_tokens_seen": 320052885, + "step": 14839, + "time_per_iteration": 4.046815395355225 + }, + { + "auxiliary_loss_clip": 0.00997133, + "auxiliary_loss_mlp": 0.00746489, + "balance_loss_clip": 1.00655174, + "balance_loss_mlp": 0.9997347, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6861792057695382, + "language_loss": 0.49480209, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51223832, + "num_input_tokens_seen": 320113685, + "step": 14840, + "time_per_iteration": 3.1752846240997314 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.03372347, + "balance_loss_mlp": 1.02090073, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.6720450206144624, + "language_loss": 0.64054191, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66189539, + "num_input_tokens_seen": 320130810, + "step": 14841, + "time_per_iteration": 2.640059232711792 + }, + { + "auxiliary_loss_clip": 0.01059046, + "auxiliary_loss_mlp": 0.00749154, + "balance_loss_clip": 1.03043365, + "balance_loss_mlp": 1.00026321, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 4.455964314831457, + "language_loss": 0.68205708, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70013911, + "num_input_tokens_seen": 320152170, + "step": 14842, + "time_per_iteration": 2.7126126289367676 + }, + { + "auxiliary_loss_clip": 0.01094849, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.03347647, + "balance_loss_mlp": 1.02112007, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 2.2413383769023048, + "language_loss": 0.80218196, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.82343942, + "num_input_tokens_seen": 320172360, + "step": 14843, + "time_per_iteration": 2.6432294845581055 + }, + { + "auxiliary_loss_clip": 0.01070422, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.03051722, + "balance_loss_mlp": 1.01989269, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 4.112872193570577, + "language_loss": 0.68680775, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70783055, + "num_input_tokens_seen": 320192130, + "step": 14844, + "time_per_iteration": 2.5952847003936768 + }, + { + "auxiliary_loss_clip": 0.01046932, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.0301187, + "balance_loss_mlp": 1.02102995, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 2.4561089624234334, + "language_loss": 0.91119075, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93198025, + "num_input_tokens_seen": 320207760, + "step": 14845, + "time_per_iteration": 2.581282615661621 + }, + { + "auxiliary_loss_clip": 0.01076575, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.03207326, + "balance_loss_mlp": 1.01670945, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 2.2061871539247533, + "language_loss": 0.72641468, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74744987, + "num_input_tokens_seen": 320225325, + "step": 14846, + "time_per_iteration": 2.556561231613159 + }, + { + "auxiliary_loss_clip": 0.0107084, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.03500819, + "balance_loss_mlp": 1.0201993, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.7651638616889227, + "language_loss": 0.56843805, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.58946186, + "num_input_tokens_seen": 320247645, + "step": 14847, + "time_per_iteration": 2.802147626876831 + }, + { + "auxiliary_loss_clip": 0.01057735, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.0317502, + "balance_loss_mlp": 1.01902723, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 1.9347096207857764, + "language_loss": 0.76280487, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78367519, + "num_input_tokens_seen": 320266005, + "step": 14848, + "time_per_iteration": 2.6692757606506348 + }, + { + "auxiliary_loss_clip": 0.01023808, + "auxiliary_loss_mlp": 0.01042157, + "balance_loss_clip": 1.0258131, + "balance_loss_mlp": 1.02935374, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 2.053016346768646, + "language_loss": 0.68946767, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71012729, + "num_input_tokens_seen": 320285555, + "step": 14849, + "time_per_iteration": 2.742558240890503 + }, + { + "auxiliary_loss_clip": 0.0108772, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.03488088, + "balance_loss_mlp": 1.02322221, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.8542673367013025, + "language_loss": 0.80682594, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.82803899, + "num_input_tokens_seen": 320305395, + "step": 14850, + "time_per_iteration": 2.563265085220337 + }, + { + "auxiliary_loss_clip": 0.01074815, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.03236854, + "balance_loss_mlp": 1.02234578, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.66777446211582, + "language_loss": 0.74894947, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.7700274, + "num_input_tokens_seen": 320324220, + "step": 14851, + "time_per_iteration": 2.600905418395996 + }, + { + "auxiliary_loss_clip": 0.01072198, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.03193069, + "balance_loss_mlp": 1.0185957, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.5411064985595446, + "language_loss": 0.78218085, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.8032003, + "num_input_tokens_seen": 320347195, + "step": 14852, + "time_per_iteration": 2.6819324493408203 + }, + { + "auxiliary_loss_clip": 0.01079916, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.03468668, + "balance_loss_mlp": 1.02133274, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.3538977365827813, + "language_loss": 0.69244105, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71355844, + "num_input_tokens_seen": 320366850, + "step": 14853, + "time_per_iteration": 4.235818386077881 + }, + { + "auxiliary_loss_clip": 0.01049118, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.03350604, + "balance_loss_mlp": 1.02228665, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.9375323976756613, + "language_loss": 0.67380792, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69463211, + "num_input_tokens_seen": 320388895, + "step": 14854, + "time_per_iteration": 2.7253379821777344 + }, + { + "auxiliary_loss_clip": 0.01065546, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.02996635, + "balance_loss_mlp": 1.02307677, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.9577083332026053, + "language_loss": 0.74555582, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76655519, + "num_input_tokens_seen": 320408520, + "step": 14855, + "time_per_iteration": 2.675262451171875 + }, + { + "auxiliary_loss_clip": 0.01074297, + "auxiliary_loss_mlp": 0.01028097, + "balance_loss_clip": 1.03263521, + "balance_loss_mlp": 1.01757121, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.8672240571112613, + "language_loss": 0.64574689, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66677082, + "num_input_tokens_seen": 320427400, + "step": 14856, + "time_per_iteration": 2.6189916133880615 + }, + { + "auxiliary_loss_clip": 0.01095681, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.03261614, + "balance_loss_mlp": 1.01726103, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.586905806842973, + "language_loss": 0.66550028, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68673247, + "num_input_tokens_seen": 320447570, + "step": 14857, + "time_per_iteration": 2.5732200145721436 + }, + { + "auxiliary_loss_clip": 0.01051988, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.03403306, + "balance_loss_mlp": 1.02013493, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.453686755183749, + "language_loss": 0.75216687, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77300227, + "num_input_tokens_seen": 320464405, + "step": 14858, + "time_per_iteration": 2.7141940593719482 + }, + { + "auxiliary_loss_clip": 0.01085498, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.03338742, + "balance_loss_mlp": 1.01770043, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.7498349040320071, + "language_loss": 0.69316477, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.7143082, + "num_input_tokens_seen": 320485525, + "step": 14859, + "time_per_iteration": 2.632446765899658 + }, + { + "auxiliary_loss_clip": 0.01044298, + "auxiliary_loss_mlp": 0.01027395, + "balance_loss_clip": 1.02984858, + "balance_loss_mlp": 1.01772726, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.485321772119528, + "language_loss": 0.75497413, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77569103, + "num_input_tokens_seen": 320506725, + "step": 14860, + "time_per_iteration": 2.6675455570220947 + }, + { + "auxiliary_loss_clip": 0.01072293, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.03211987, + "balance_loss_mlp": 1.01581073, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 1.9997557641436678, + "language_loss": 0.57437712, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59537524, + "num_input_tokens_seen": 320525425, + "step": 14861, + "time_per_iteration": 2.5968661308288574 + }, + { + "auxiliary_loss_clip": 0.01063804, + "auxiliary_loss_mlp": 0.01027187, + "balance_loss_clip": 1.02851379, + "balance_loss_mlp": 1.0164758, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 2.136417692164156, + "language_loss": 0.63636696, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65727687, + "num_input_tokens_seen": 320543010, + "step": 14862, + "time_per_iteration": 2.6117544174194336 + }, + { + "auxiliary_loss_clip": 0.01082137, + "auxiliary_loss_mlp": 0.01024903, + "balance_loss_clip": 1.03128088, + "balance_loss_mlp": 1.01422143, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 2.1537521508464934, + "language_loss": 0.77888238, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.79995281, + "num_input_tokens_seen": 320562180, + "step": 14863, + "time_per_iteration": 2.5722579956054688 + }, + { + "auxiliary_loss_clip": 0.01065876, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.02939272, + "balance_loss_mlp": 1.02355862, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 1.9501997149678436, + "language_loss": 0.70847631, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.72948122, + "num_input_tokens_seen": 320580395, + "step": 14864, + "time_per_iteration": 4.038920879364014 + }, + { + "auxiliary_loss_clip": 0.0108772, + "auxiliary_loss_mlp": 0.01036193, + "balance_loss_clip": 1.0336535, + "balance_loss_mlp": 1.02432013, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 2.221290357244633, + "language_loss": 0.75690711, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.77814627, + "num_input_tokens_seen": 320599505, + "step": 14865, + "time_per_iteration": 2.5068156719207764 + }, + { + "auxiliary_loss_clip": 0.01058235, + "auxiliary_loss_mlp": 0.01027675, + "balance_loss_clip": 1.0319674, + "balance_loss_mlp": 1.01796579, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.5792623482985377, + "language_loss": 0.7181288, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73898792, + "num_input_tokens_seen": 320619825, + "step": 14866, + "time_per_iteration": 2.659320116043091 + }, + { + "auxiliary_loss_clip": 0.0108966, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.03466427, + "balance_loss_mlp": 1.01628852, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 1.6446660328253353, + "language_loss": 0.83967257, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86085165, + "num_input_tokens_seen": 320638515, + "step": 14867, + "time_per_iteration": 2.5529253482818604 + }, + { + "auxiliary_loss_clip": 0.01085862, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.03210485, + "balance_loss_mlp": 1.01890695, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.590778778899297, + "language_loss": 0.8080914, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82923645, + "num_input_tokens_seen": 320659430, + "step": 14868, + "time_per_iteration": 2.590196371078491 + }, + { + "auxiliary_loss_clip": 0.01070735, + "auxiliary_loss_mlp": 0.01028069, + "balance_loss_clip": 1.03430009, + "balance_loss_mlp": 1.01693463, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 1.9202706076488778, + "language_loss": 0.77457398, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79556209, + "num_input_tokens_seen": 320679295, + "step": 14869, + "time_per_iteration": 4.120209455490112 + }, + { + "auxiliary_loss_clip": 0.01085101, + "auxiliary_loss_mlp": 0.00749132, + "balance_loss_clip": 1.03357577, + "balance_loss_mlp": 1.00019836, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.472869589175318, + "language_loss": 0.6533041, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67164642, + "num_input_tokens_seen": 320697535, + "step": 14870, + "time_per_iteration": 2.564204216003418 + }, + { + "auxiliary_loss_clip": 0.00983004, + "auxiliary_loss_mlp": 0.00999472, + "balance_loss_clip": 1.00351202, + "balance_loss_mlp": 0.99814332, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7981882961519446, + "language_loss": 0.55923271, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57905751, + "num_input_tokens_seen": 320758635, + "step": 14871, + "time_per_iteration": 3.260091543197632 + }, + { + "auxiliary_loss_clip": 0.01081553, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.03376675, + "balance_loss_mlp": 1.01929533, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 2.6788729101110693, + "language_loss": 0.7719211, + "learning_rate": 1.16316031981331e-07, + "loss": 0.79303253, + "num_input_tokens_seen": 320777175, + "step": 14872, + "time_per_iteration": 2.576042652130127 + }, + { + "auxiliary_loss_clip": 0.01080565, + "auxiliary_loss_mlp": 0.01025907, + "balance_loss_clip": 1.03146958, + "balance_loss_mlp": 1.01650095, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.6696043080199785, + "language_loss": 0.66846263, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.68952733, + "num_input_tokens_seen": 320797670, + "step": 14873, + "time_per_iteration": 2.6013243198394775 + }, + { + "auxiliary_loss_clip": 0.01093265, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.03233504, + "balance_loss_mlp": 1.02254474, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.8448793131614933, + "language_loss": 0.59850168, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61977005, + "num_input_tokens_seen": 320817410, + "step": 14874, + "time_per_iteration": 2.499277353286743 + }, + { + "auxiliary_loss_clip": 0.010687, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.03556335, + "balance_loss_mlp": 1.02014351, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 1.8785384895899624, + "language_loss": 0.75725049, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77825558, + "num_input_tokens_seen": 320836745, + "step": 14875, + "time_per_iteration": 2.6717164516448975 + }, + { + "auxiliary_loss_clip": 0.01064079, + "auxiliary_loss_mlp": 0.01029693, + "balance_loss_clip": 1.0316081, + "balance_loss_mlp": 1.01713455, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 2.193708433270643, + "language_loss": 0.77101797, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79195565, + "num_input_tokens_seen": 320853305, + "step": 14876, + "time_per_iteration": 2.6319305896759033 + }, + { + "auxiliary_loss_clip": 0.01083836, + "auxiliary_loss_mlp": 0.01026672, + "balance_loss_clip": 1.03242278, + "balance_loss_mlp": 1.01641965, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 1.7035993263487408, + "language_loss": 0.78946459, + "learning_rate": 1.156625201573287e-07, + "loss": 0.8105697, + "num_input_tokens_seen": 320872885, + "step": 14877, + "time_per_iteration": 2.562950611114502 + }, + { + "auxiliary_loss_clip": 0.01041685, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.0281508, + "balance_loss_mlp": 1.0200882, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 2.3817004981254666, + "language_loss": 0.74954569, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77029026, + "num_input_tokens_seen": 320889755, + "step": 14878, + "time_per_iteration": 2.6371355056762695 + }, + { + "auxiliary_loss_clip": 0.010794, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.03070545, + "balance_loss_mlp": 1.01582038, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.7123974353352547, + "language_loss": 0.75961488, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.7806949, + "num_input_tokens_seen": 320907860, + "step": 14879, + "time_per_iteration": 4.017613172531128 + }, + { + "auxiliary_loss_clip": 0.01067396, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.03432393, + "balance_loss_mlp": 1.01916814, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.7923465094296172, + "language_loss": 0.74671346, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76768708, + "num_input_tokens_seen": 320925825, + "step": 14880, + "time_per_iteration": 2.6571907997131348 + }, + { + "auxiliary_loss_clip": 0.01076802, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.03083062, + "balance_loss_mlp": 1.01859546, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.6840567586436839, + "language_loss": 0.82750046, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.84856927, + "num_input_tokens_seen": 320946165, + "step": 14881, + "time_per_iteration": 2.609402656555176 + }, + { + "auxiliary_loss_clip": 0.01054546, + "auxiliary_loss_mlp": 0.00749456, + "balance_loss_clip": 1.03040946, + "balance_loss_mlp": 1.00028253, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 2.336980778093553, + "language_loss": 0.67618054, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69422054, + "num_input_tokens_seen": 320969330, + "step": 14882, + "time_per_iteration": 2.700885057449341 + }, + { + "auxiliary_loss_clip": 0.01068685, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.02987945, + "balance_loss_mlp": 1.02017045, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 1.9895029830852222, + "language_loss": 0.75143665, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77245104, + "num_input_tokens_seen": 320985055, + "step": 14883, + "time_per_iteration": 2.601877212524414 + }, + { + "auxiliary_loss_clip": 0.01070309, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.03155148, + "balance_loss_mlp": 1.02147734, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.5761094827246769, + "language_loss": 0.72399569, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74501646, + "num_input_tokens_seen": 321004720, + "step": 14884, + "time_per_iteration": 2.656294584274292 + }, + { + "auxiliary_loss_clip": 0.01062039, + "auxiliary_loss_mlp": 0.01026144, + "balance_loss_clip": 1.02818561, + "balance_loss_mlp": 1.01542115, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 2.5500355293090293, + "language_loss": 0.75818217, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77906394, + "num_input_tokens_seen": 321022350, + "step": 14885, + "time_per_iteration": 2.5971672534942627 + }, + { + "auxiliary_loss_clip": 0.01069948, + "auxiliary_loss_mlp": 0.01028033, + "balance_loss_clip": 1.03279996, + "balance_loss_mlp": 1.01708949, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 2.6190575120619415, + "language_loss": 0.82106173, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.84204149, + "num_input_tokens_seen": 321040450, + "step": 14886, + "time_per_iteration": 2.6236023902893066 + }, + { + "auxiliary_loss_clip": 0.01032619, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.0293901, + "balance_loss_mlp": 1.01506495, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.6144428981296914, + "language_loss": 0.6342026, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.6547817, + "num_input_tokens_seen": 321063970, + "step": 14887, + "time_per_iteration": 3.027088165283203 + }, + { + "auxiliary_loss_clip": 0.01072968, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.03087687, + "balance_loss_mlp": 1.02345872, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.6368507333365465, + "language_loss": 0.61101925, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63209355, + "num_input_tokens_seen": 321083840, + "step": 14888, + "time_per_iteration": 2.738783836364746 + }, + { + "auxiliary_loss_clip": 0.01097977, + "auxiliary_loss_mlp": 0.01022639, + "balance_loss_clip": 1.03287578, + "balance_loss_mlp": 1.01233363, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 1.9315698617057697, + "language_loss": 0.69914156, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.7203477, + "num_input_tokens_seen": 321104165, + "step": 14889, + "time_per_iteration": 2.625145435333252 + }, + { + "auxiliary_loss_clip": 0.01085346, + "auxiliary_loss_mlp": 0.00749475, + "balance_loss_clip": 1.03367019, + "balance_loss_mlp": 1.00026298, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.422309307370776, + "language_loss": 0.71154618, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.7298944, + "num_input_tokens_seen": 321117290, + "step": 14890, + "time_per_iteration": 2.595635175704956 + }, + { + "auxiliary_loss_clip": 0.01007503, + "auxiliary_loss_mlp": 0.00749621, + "balance_loss_clip": 1.02523053, + "balance_loss_mlp": 1.00028002, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.5360018414194019, + "language_loss": 0.75776917, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77534038, + "num_input_tokens_seen": 321137115, + "step": 14891, + "time_per_iteration": 2.925813913345337 + }, + { + "auxiliary_loss_clip": 0.01037656, + "auxiliary_loss_mlp": 0.01028006, + "balance_loss_clip": 1.028566, + "balance_loss_mlp": 1.01685965, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.759029448917643, + "language_loss": 0.76625967, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78691626, + "num_input_tokens_seen": 321154490, + "step": 14892, + "time_per_iteration": 2.9957306385040283 + }, + { + "auxiliary_loss_clip": 0.010822, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.03572512, + "balance_loss_mlp": 1.01946974, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.39062836379093, + "language_loss": 0.81626332, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.8373853, + "num_input_tokens_seen": 321175625, + "step": 14893, + "time_per_iteration": 4.153652191162109 + }, + { + "auxiliary_loss_clip": 0.01046438, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.03031015, + "balance_loss_mlp": 1.01882243, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.7034115094961537, + "language_loss": 0.74655724, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.76731443, + "num_input_tokens_seen": 321193895, + "step": 14894, + "time_per_iteration": 2.7178595066070557 + }, + { + "auxiliary_loss_clip": 0.01087167, + "auxiliary_loss_mlp": 0.01028428, + "balance_loss_clip": 1.03636217, + "balance_loss_mlp": 1.01728165, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 2.868417903549679, + "language_loss": 0.66521907, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68637502, + "num_input_tokens_seen": 321211610, + "step": 14895, + "time_per_iteration": 2.5679140090942383 + }, + { + "auxiliary_loss_clip": 0.01087762, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.03425479, + "balance_loss_mlp": 1.01577663, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.5127960319683658, + "language_loss": 0.67102969, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69218194, + "num_input_tokens_seen": 321229805, + "step": 14896, + "time_per_iteration": 2.5448689460754395 + }, + { + "auxiliary_loss_clip": 0.01086155, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.0330925, + "balance_loss_mlp": 1.01921701, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 1.6661888575961523, + "language_loss": 0.76126039, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.78242588, + "num_input_tokens_seen": 321247165, + "step": 14897, + "time_per_iteration": 2.523557424545288 + }, + { + "auxiliary_loss_clip": 0.00987619, + "auxiliary_loss_mlp": 0.0074657, + "balance_loss_clip": 1.00688922, + "balance_loss_mlp": 0.99970371, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7598628825905394, + "language_loss": 0.55342484, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57076669, + "num_input_tokens_seen": 321308425, + "step": 14898, + "time_per_iteration": 3.2444279193878174 + }, + { + "auxiliary_loss_clip": 0.0109845, + "auxiliary_loss_mlp": 0.00749467, + "balance_loss_clip": 1.03428364, + "balance_loss_mlp": 1.00023389, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.520218603819741, + "language_loss": 0.7038877, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72236693, + "num_input_tokens_seen": 321329295, + "step": 14899, + "time_per_iteration": 2.5630409717559814 + }, + { + "auxiliary_loss_clip": 0.01041412, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.03367507, + "balance_loss_mlp": 1.0209223, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 1.7009794581592848, + "language_loss": 0.73850334, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.75925004, + "num_input_tokens_seen": 321347580, + "step": 14900, + "time_per_iteration": 2.709169387817383 + }, + { + "auxiliary_loss_clip": 0.0098723, + "auxiliary_loss_mlp": 0.01000759, + "balance_loss_clip": 1.00631452, + "balance_loss_mlp": 0.99985868, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7789226226703178, + "language_loss": 0.61796534, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63784528, + "num_input_tokens_seen": 321407820, + "step": 14901, + "time_per_iteration": 3.2173409461975098 + }, + { + "auxiliary_loss_clip": 0.01085844, + "auxiliary_loss_mlp": 0.01029267, + "balance_loss_clip": 1.03193045, + "balance_loss_mlp": 1.01841879, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.9044664702505294, + "language_loss": 0.7070294, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72818053, + "num_input_tokens_seen": 321426745, + "step": 14902, + "time_per_iteration": 2.5863685607910156 + }, + { + "auxiliary_loss_clip": 0.01069997, + "auxiliary_loss_mlp": 0.01026491, + "balance_loss_clip": 1.03183055, + "balance_loss_mlp": 1.01647186, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.7509956256814228, + "language_loss": 0.77739763, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.79836249, + "num_input_tokens_seen": 321446165, + "step": 14903, + "time_per_iteration": 2.636000394821167 + }, + { + "auxiliary_loss_clip": 0.01070889, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.03235126, + "balance_loss_mlp": 1.01960981, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 2.1456608721997656, + "language_loss": 0.73219311, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75321728, + "num_input_tokens_seen": 321465285, + "step": 14904, + "time_per_iteration": 4.093727350234985 + }, + { + "auxiliary_loss_clip": 0.01086617, + "auxiliary_loss_mlp": 0.01025129, + "balance_loss_clip": 1.03338325, + "balance_loss_mlp": 1.01344681, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 3.3754359619400223, + "language_loss": 0.74875516, + "learning_rate": 1.12035883275166e-07, + "loss": 0.76987267, + "num_input_tokens_seen": 321483670, + "step": 14905, + "time_per_iteration": 2.5276589393615723 + }, + { + "auxiliary_loss_clip": 0.0108349, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.03104806, + "balance_loss_mlp": 1.01814508, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 2.6677400181460826, + "language_loss": 0.76380533, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78493226, + "num_input_tokens_seen": 321501190, + "step": 14906, + "time_per_iteration": 2.540806293487549 + }, + { + "auxiliary_loss_clip": 0.01088167, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.03496981, + "balance_loss_mlp": 1.02088571, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.619219791644586, + "language_loss": 0.74427986, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76548195, + "num_input_tokens_seen": 321518540, + "step": 14907, + "time_per_iteration": 2.5523736476898193 + }, + { + "auxiliary_loss_clip": 0.01087576, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.03498793, + "balance_loss_mlp": 1.02340627, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 1.7828950428348385, + "language_loss": 0.82672691, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.84794581, + "num_input_tokens_seen": 321536555, + "step": 14908, + "time_per_iteration": 2.5607430934906006 + }, + { + "auxiliary_loss_clip": 0.01074454, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.03270054, + "balance_loss_mlp": 1.01691484, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 2.485094263192701, + "language_loss": 0.70171881, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72275043, + "num_input_tokens_seen": 321557655, + "step": 14909, + "time_per_iteration": 4.096464157104492 + }, + { + "auxiliary_loss_clip": 0.01064442, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.03398132, + "balance_loss_mlp": 1.01834047, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 2.017696137023505, + "language_loss": 0.72434896, + "learning_rate": 1.113941727737877e-07, + "loss": 0.7452904, + "num_input_tokens_seen": 321576160, + "step": 14910, + "time_per_iteration": 2.619535207748413 + }, + { + "auxiliary_loss_clip": 0.01083972, + "auxiliary_loss_mlp": 0.01025438, + "balance_loss_clip": 1.03146243, + "balance_loss_mlp": 1.01498318, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 2.2064229477805277, + "language_loss": 0.63217878, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65327287, + "num_input_tokens_seen": 321596205, + "step": 14911, + "time_per_iteration": 2.608560085296631 + }, + { + "auxiliary_loss_clip": 0.01067459, + "auxiliary_loss_mlp": 0.00749474, + "balance_loss_clip": 1.03326094, + "balance_loss_mlp": 1.00028467, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.6669619610131527, + "language_loss": 0.74991566, + "learning_rate": 1.111379898520437e-07, + "loss": 0.768085, + "num_input_tokens_seen": 321614800, + "step": 14912, + "time_per_iteration": 2.6026065349578857 + }, + { + "auxiliary_loss_clip": 0.01067284, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.02981901, + "balance_loss_mlp": 1.01993322, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.8374184769055606, + "language_loss": 0.82016391, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.84115052, + "num_input_tokens_seen": 321633445, + "step": 14913, + "time_per_iteration": 2.6740283966064453 + }, + { + "auxiliary_loss_clip": 0.01088771, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.03414786, + "balance_loss_mlp": 1.01915538, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 4.685186535863599, + "language_loss": 0.61375785, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63495374, + "num_input_tokens_seen": 321650890, + "step": 14914, + "time_per_iteration": 2.5846941471099854 + }, + { + "auxiliary_loss_clip": 0.01006215, + "auxiliary_loss_mlp": 0.01003589, + "balance_loss_clip": 1.00829256, + "balance_loss_mlp": 1.00262916, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.7119788229378229, + "language_loss": 0.55078685, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57088482, + "num_input_tokens_seen": 321710960, + "step": 14915, + "time_per_iteration": 3.1602730751037598 + }, + { + "auxiliary_loss_clip": 0.01057812, + "auxiliary_loss_mlp": 0.01027248, + "balance_loss_clip": 1.03078806, + "balance_loss_mlp": 1.01739526, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.4071850512499464, + "language_loss": 0.71409684, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73494744, + "num_input_tokens_seen": 321733290, + "step": 14916, + "time_per_iteration": 2.644822835922241 + }, + { + "auxiliary_loss_clip": 0.01075507, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.03284788, + "balance_loss_mlp": 1.01593494, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 3.4063491342125407, + "language_loss": 0.77873403, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.79975373, + "num_input_tokens_seen": 321753120, + "step": 14917, + "time_per_iteration": 2.6552560329437256 + }, + { + "auxiliary_loss_clip": 0.01090852, + "auxiliary_loss_mlp": 0.01035195, + "balance_loss_clip": 1.03514934, + "balance_loss_mlp": 1.02332222, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 4.523021705575687, + "language_loss": 0.68848741, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70974791, + "num_input_tokens_seen": 321772840, + "step": 14918, + "time_per_iteration": 2.5663201808929443 + }, + { + "auxiliary_loss_clip": 0.01047093, + "auxiliary_loss_mlp": 0.01026628, + "balance_loss_clip": 1.03090024, + "balance_loss_mlp": 1.01638246, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.997420465236533, + "language_loss": 0.83402586, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85476309, + "num_input_tokens_seen": 321791020, + "step": 14919, + "time_per_iteration": 4.199926137924194 + }, + { + "auxiliary_loss_clip": 0.0109816, + "auxiliary_loss_mlp": 0.00749604, + "balance_loss_clip": 1.03334284, + "balance_loss_mlp": 1.00022125, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 2.540330463352455, + "language_loss": 0.71917433, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.73765194, + "num_input_tokens_seen": 321810075, + "step": 14920, + "time_per_iteration": 2.5136187076568604 + }, + { + "auxiliary_loss_clip": 0.01081803, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.03151786, + "balance_loss_mlp": 1.01632833, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.481709550953671, + "language_loss": 0.90975261, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93085003, + "num_input_tokens_seen": 321822635, + "step": 14921, + "time_per_iteration": 2.5082130432128906 + }, + { + "auxiliary_loss_clip": 0.01035096, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.02836633, + "balance_loss_mlp": 1.02229846, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 2.1712960038328313, + "language_loss": 0.73709893, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.75779819, + "num_input_tokens_seen": 321841130, + "step": 14922, + "time_per_iteration": 2.737776279449463 + }, + { + "auxiliary_loss_clip": 0.01044455, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.02748942, + "balance_loss_mlp": 1.01849604, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 1.6398007644809238, + "language_loss": 0.70235032, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72310436, + "num_input_tokens_seen": 321859855, + "step": 14923, + "time_per_iteration": 2.6723451614379883 + }, + { + "auxiliary_loss_clip": 0.01074603, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.0314945, + "balance_loss_mlp": 1.01574707, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.384994870570663, + "language_loss": 0.70870966, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72973049, + "num_input_tokens_seen": 321877990, + "step": 14924, + "time_per_iteration": 2.5772523880004883 + }, + { + "auxiliary_loss_clip": 0.01080629, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.03058648, + "balance_loss_mlp": 1.02086401, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.484559050446764, + "language_loss": 0.72168601, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74280119, + "num_input_tokens_seen": 321898120, + "step": 14925, + "time_per_iteration": 2.6375975608825684 + }, + { + "auxiliary_loss_clip": 0.01072952, + "auxiliary_loss_mlp": 0.00749654, + "balance_loss_clip": 1.0328778, + "balance_loss_mlp": 1.0002346, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.7271284182192526, + "language_loss": 0.82392979, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84215581, + "num_input_tokens_seen": 321918140, + "step": 14926, + "time_per_iteration": 2.6144959926605225 + }, + { + "auxiliary_loss_clip": 0.01041031, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.02947438, + "balance_loss_mlp": 1.01884222, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.6792230545007356, + "language_loss": 0.79319066, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81389415, + "num_input_tokens_seen": 321938580, + "step": 14927, + "time_per_iteration": 2.671488046646118 + }, + { + "auxiliary_loss_clip": 0.01068067, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.03085244, + "balance_loss_mlp": 1.01693749, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.5327918344342248, + "language_loss": 0.66216362, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68311554, + "num_input_tokens_seen": 321961135, + "step": 14928, + "time_per_iteration": 2.7085437774658203 + }, + { + "auxiliary_loss_clip": 0.01076674, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.0333277, + "balance_loss_mlp": 1.02792943, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 2.2144291322398097, + "language_loss": 0.70962214, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.73080409, + "num_input_tokens_seen": 321980945, + "step": 14929, + "time_per_iteration": 2.6685335636138916 + }, + { + "auxiliary_loss_clip": 0.01078006, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.03542542, + "balance_loss_mlp": 1.02079809, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 2.1216373356686615, + "language_loss": 0.67885518, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.69994414, + "num_input_tokens_seen": 322000350, + "step": 14930, + "time_per_iteration": 2.612203359603882 + }, + { + "auxiliary_loss_clip": 0.0106633, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.03033173, + "balance_loss_mlp": 1.01972175, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.7908889057355184, + "language_loss": 0.75017691, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.77115011, + "num_input_tokens_seen": 322018980, + "step": 14931, + "time_per_iteration": 2.569507598876953 + }, + { + "auxiliary_loss_clip": 0.01087283, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.03496981, + "balance_loss_mlp": 1.0189867, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 1.6285580625673832, + "language_loss": 0.62725341, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.64841342, + "num_input_tokens_seen": 322037675, + "step": 14932, + "time_per_iteration": 2.5972228050231934 + }, + { + "auxiliary_loss_clip": 0.01081437, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.03215516, + "balance_loss_mlp": 1.02058792, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.6673820575350877, + "language_loss": 0.71490204, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73602259, + "num_input_tokens_seen": 322055130, + "step": 14933, + "time_per_iteration": 2.503842353820801 + }, + { + "auxiliary_loss_clip": 0.0104724, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.02929747, + "balance_loss_mlp": 1.01895773, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.4005939226536142, + "language_loss": 0.748882, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76965868, + "num_input_tokens_seen": 322074850, + "step": 14934, + "time_per_iteration": 4.232425689697266 + }, + { + "auxiliary_loss_clip": 0.01046844, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.02785683, + "balance_loss_mlp": 1.01912415, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.678722469584023, + "language_loss": 0.60408056, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62486458, + "num_input_tokens_seen": 322093315, + "step": 14935, + "time_per_iteration": 2.662750005722046 + }, + { + "auxiliary_loss_clip": 0.01059016, + "auxiliary_loss_mlp": 0.01023405, + "balance_loss_clip": 1.03210855, + "balance_loss_mlp": 1.01243746, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 2.037289188251034, + "language_loss": 0.7678144, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78863859, + "num_input_tokens_seen": 322112555, + "step": 14936, + "time_per_iteration": 2.631070852279663 + }, + { + "auxiliary_loss_clip": 0.01072838, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.03238034, + "balance_loss_mlp": 1.02234197, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.5635140356531725, + "language_loss": 0.73885316, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.75991356, + "num_input_tokens_seen": 322130440, + "step": 14937, + "time_per_iteration": 2.565399646759033 + }, + { + "auxiliary_loss_clip": 0.00994467, + "auxiliary_loss_mlp": 0.01002349, + "balance_loss_clip": 1.00413811, + "balance_loss_mlp": 1.0013535, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8530463642066642, + "language_loss": 0.63478184, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65474999, + "num_input_tokens_seen": 322187295, + "step": 14938, + "time_per_iteration": 3.0263009071350098 + }, + { + "auxiliary_loss_clip": 0.01074521, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.03289318, + "balance_loss_mlp": 1.01746213, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 2.0381863250396117, + "language_loss": 0.80312991, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82416034, + "num_input_tokens_seen": 322202965, + "step": 14939, + "time_per_iteration": 2.5341649055480957 + }, + { + "auxiliary_loss_clip": 0.00994089, + "auxiliary_loss_mlp": 0.00996593, + "balance_loss_clip": 1.00444937, + "balance_loss_mlp": 0.99565715, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.7191476478112366, + "language_loss": 0.52856922, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54847598, + "num_input_tokens_seen": 322269490, + "step": 14940, + "time_per_iteration": 3.2631802558898926 + }, + { + "auxiliary_loss_clip": 0.01098168, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.03403747, + "balance_loss_mlp": 1.01874089, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.7229352218861465, + "language_loss": 0.77659184, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.79787707, + "num_input_tokens_seen": 322288060, + "step": 14941, + "time_per_iteration": 2.5518150329589844 + }, + { + "auxiliary_loss_clip": 0.01086208, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.03239071, + "balance_loss_mlp": 1.0219146, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 2.3135236482642623, + "language_loss": 0.73611975, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75731468, + "num_input_tokens_seen": 322307930, + "step": 14942, + "time_per_iteration": 2.6201999187469482 + }, + { + "auxiliary_loss_clip": 0.01073678, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.03111768, + "balance_loss_mlp": 1.02471185, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.0097951906616096, + "language_loss": 0.79426491, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.81536365, + "num_input_tokens_seen": 322326155, + "step": 14943, + "time_per_iteration": 4.152792453765869 + }, + { + "auxiliary_loss_clip": 0.01076533, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03325963, + "balance_loss_mlp": 1.02082086, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.4578483837783536, + "language_loss": 0.71222234, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73331404, + "num_input_tokens_seen": 322345850, + "step": 14944, + "time_per_iteration": 2.5956006050109863 + }, + { + "auxiliary_loss_clip": 0.01050305, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.02855027, + "balance_loss_mlp": 1.01892602, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 2.7794677372085337, + "language_loss": 0.75725007, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.77806723, + "num_input_tokens_seen": 322364715, + "step": 14945, + "time_per_iteration": 2.5759193897247314 + }, + { + "auxiliary_loss_clip": 0.01103302, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.03471231, + "balance_loss_mlp": 1.02132154, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 2.9807933233731725, + "language_loss": 0.73560345, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75697017, + "num_input_tokens_seen": 322383570, + "step": 14946, + "time_per_iteration": 2.5272364616394043 + }, + { + "auxiliary_loss_clip": 0.01041368, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.02832055, + "balance_loss_mlp": 1.01500416, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 1.9634205884032785, + "language_loss": 0.64437711, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66505611, + "num_input_tokens_seen": 322401375, + "step": 14947, + "time_per_iteration": 2.6753108501434326 + }, + { + "auxiliary_loss_clip": 0.01076137, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.03358901, + "balance_loss_mlp": 1.02236104, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 2.5981461465921636, + "language_loss": 0.70009661, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.72118682, + "num_input_tokens_seen": 322421890, + "step": 14948, + "time_per_iteration": 2.5900135040283203 + }, + { + "auxiliary_loss_clip": 0.01063995, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.0314188, + "balance_loss_mlp": 1.01757348, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.8774776797605344, + "language_loss": 0.74634409, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76727003, + "num_input_tokens_seen": 322445730, + "step": 14949, + "time_per_iteration": 4.259083271026611 + }, + { + "auxiliary_loss_clip": 0.01055646, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.03175902, + "balance_loss_mlp": 1.02194893, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.971401662738147, + "language_loss": 0.75739747, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.77829885, + "num_input_tokens_seen": 322464595, + "step": 14950, + "time_per_iteration": 2.6678123474121094 + }, + { + "auxiliary_loss_clip": 0.01067455, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.03148973, + "balance_loss_mlp": 1.02026963, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.6239564419420216, + "language_loss": 0.66321325, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68419659, + "num_input_tokens_seen": 322483305, + "step": 14951, + "time_per_iteration": 2.7011725902557373 + }, + { + "auxiliary_loss_clip": 0.01098946, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.03181911, + "balance_loss_mlp": 1.01760888, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 1.99391492964544, + "language_loss": 0.73691356, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.75818479, + "num_input_tokens_seen": 322501905, + "step": 14952, + "time_per_iteration": 2.4688055515289307 + }, + { + "auxiliary_loss_clip": 0.01097453, + "auxiliary_loss_mlp": 0.01032919, + "balance_loss_clip": 1.03399885, + "balance_loss_mlp": 1.02221453, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.2134414233964215, + "language_loss": 0.56470275, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58600646, + "num_input_tokens_seen": 322518135, + "step": 14953, + "time_per_iteration": 2.4480082988739014 + }, + { + "auxiliary_loss_clip": 0.01072161, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.03167534, + "balance_loss_mlp": 1.02504969, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 1.9561917018736832, + "language_loss": 0.82182157, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.84290481, + "num_input_tokens_seen": 322537905, + "step": 14954, + "time_per_iteration": 2.5603995323181152 + }, + { + "auxiliary_loss_clip": 0.01096024, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.03425682, + "balance_loss_mlp": 1.02008343, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 1.8657440971035262, + "language_loss": 0.60255611, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.62382555, + "num_input_tokens_seen": 322557945, + "step": 14955, + "time_per_iteration": 2.562649965286255 + }, + { + "auxiliary_loss_clip": 0.01085876, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.03365624, + "balance_loss_mlp": 1.0216465, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 1.973955629157677, + "language_loss": 0.55092698, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.57210696, + "num_input_tokens_seen": 322575765, + "step": 14956, + "time_per_iteration": 2.521031141281128 + }, + { + "auxiliary_loss_clip": 0.01050261, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.03075802, + "balance_loss_mlp": 1.02090025, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.7705734193871774, + "language_loss": 0.7953763, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.81620002, + "num_input_tokens_seen": 322595665, + "step": 14957, + "time_per_iteration": 2.712236166000366 + }, + { + "auxiliary_loss_clip": 0.01100965, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.03486204, + "balance_loss_mlp": 1.02150106, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 2.227191882236208, + "language_loss": 0.78557605, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80691653, + "num_input_tokens_seen": 322614755, + "step": 14958, + "time_per_iteration": 2.4923369884490967 + }, + { + "auxiliary_loss_clip": 0.01024405, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.03478825, + "balance_loss_mlp": 1.01607966, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.497656492394638, + "language_loss": 0.74933815, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.76984966, + "num_input_tokens_seen": 322633425, + "step": 14959, + "time_per_iteration": 4.217623472213745 + }, + { + "auxiliary_loss_clip": 0.01092445, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.03150046, + "balance_loss_mlp": 1.01946831, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 1.9312215966059156, + "language_loss": 0.68924183, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.71046758, + "num_input_tokens_seen": 322652065, + "step": 14960, + "time_per_iteration": 2.506317615509033 + }, + { + "auxiliary_loss_clip": 0.01074954, + "auxiliary_loss_mlp": 0.01025815, + "balance_loss_clip": 1.03342438, + "balance_loss_mlp": 1.01549745, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.6868252131822055, + "language_loss": 0.65706336, + "learning_rate": 1.049510991294591e-07, + "loss": 0.67807108, + "num_input_tokens_seen": 322673275, + "step": 14961, + "time_per_iteration": 2.598255157470703 + }, + { + "auxiliary_loss_clip": 0.01072929, + "auxiliary_loss_mlp": 0.01025827, + "balance_loss_clip": 1.03084779, + "balance_loss_mlp": 1.01543188, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.5445457726814706, + "language_loss": 0.83154452, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85253209, + "num_input_tokens_seen": 322693375, + "step": 14962, + "time_per_iteration": 2.5971221923828125 + }, + { + "auxiliary_loss_clip": 0.01079075, + "auxiliary_loss_mlp": 0.0102879, + "balance_loss_clip": 1.03563643, + "balance_loss_mlp": 1.01690459, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 1.9407643332912266, + "language_loss": 0.76544023, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78651893, + "num_input_tokens_seen": 322712615, + "step": 14963, + "time_per_iteration": 2.618549108505249 + }, + { + "auxiliary_loss_clip": 0.0097737, + "auxiliary_loss_mlp": 0.01003465, + "balance_loss_clip": 1.00530291, + "balance_loss_mlp": 1.0022912, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7817006926120349, + "language_loss": 0.57581842, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59562683, + "num_input_tokens_seen": 322766855, + "step": 14964, + "time_per_iteration": 3.0542807579040527 + }, + { + "auxiliary_loss_clip": 0.01093103, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.0363059, + "balance_loss_mlp": 1.02094638, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 2.96427648464707, + "language_loss": 0.67648685, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69774383, + "num_input_tokens_seen": 322781130, + "step": 14965, + "time_per_iteration": 2.550692319869995 + }, + { + "auxiliary_loss_clip": 0.01099183, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.03448427, + "balance_loss_mlp": 1.01876068, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 5.563377150212919, + "language_loss": 0.71827853, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.73956305, + "num_input_tokens_seen": 322800310, + "step": 14966, + "time_per_iteration": 2.5125980377197266 + }, + { + "auxiliary_loss_clip": 0.01062174, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.03176129, + "balance_loss_mlp": 1.01697803, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 2.194990476067829, + "language_loss": 0.73185849, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75276458, + "num_input_tokens_seen": 322820955, + "step": 14967, + "time_per_iteration": 2.6876087188720703 + }, + { + "auxiliary_loss_clip": 0.01054544, + "auxiliary_loss_mlp": 0.00749423, + "balance_loss_clip": 1.03338146, + "balance_loss_mlp": 1.000247, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 2.1471694610845375, + "language_loss": 0.7182132, + "learning_rate": 1.040813291960323e-07, + "loss": 0.7362529, + "num_input_tokens_seen": 322838780, + "step": 14968, + "time_per_iteration": 2.62326717376709 + }, + { + "auxiliary_loss_clip": 0.01083384, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.03321862, + "balance_loss_mlp": 1.02333677, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 2.007066312137974, + "language_loss": 0.71506751, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.7362448, + "num_input_tokens_seen": 322856710, + "step": 14969, + "time_per_iteration": 2.5773158073425293 + }, + { + "auxiliary_loss_clip": 0.01101013, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.03614378, + "balance_loss_mlp": 1.01785588, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 1.9434284840038019, + "language_loss": 0.75880456, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78010517, + "num_input_tokens_seen": 322876070, + "step": 14970, + "time_per_iteration": 2.5584146976470947 + }, + { + "auxiliary_loss_clip": 0.01089496, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.03347862, + "balance_loss_mlp": 1.02059257, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.6780803939220479, + "language_loss": 0.7318939, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75309592, + "num_input_tokens_seen": 322895095, + "step": 14971, + "time_per_iteration": 2.553036689758301 + }, + { + "auxiliary_loss_clip": 0.01061804, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.03179812, + "balance_loss_mlp": 1.01790977, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 2.208320553254388, + "language_loss": 0.817307, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83821905, + "num_input_tokens_seen": 322911845, + "step": 14972, + "time_per_iteration": 2.6099655628204346 + }, + { + "auxiliary_loss_clip": 0.01065719, + "auxiliary_loss_mlp": 0.01029258, + "balance_loss_clip": 1.03053224, + "balance_loss_mlp": 1.01821351, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 2.251105111304163, + "language_loss": 0.81508982, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83603954, + "num_input_tokens_seen": 322928170, + "step": 14973, + "time_per_iteration": 4.093506336212158 + }, + { + "auxiliary_loss_clip": 0.01096039, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.03338373, + "balance_loss_mlp": 1.02246547, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.7037354834466605, + "language_loss": 0.58535099, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60665017, + "num_input_tokens_seen": 322948165, + "step": 14974, + "time_per_iteration": 2.5265908241271973 + }, + { + "auxiliary_loss_clip": 0.01101393, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.03739786, + "balance_loss_mlp": 1.02051842, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.7717159901887698, + "language_loss": 0.63433141, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65565753, + "num_input_tokens_seen": 322968880, + "step": 14975, + "time_per_iteration": 2.5028090476989746 + }, + { + "auxiliary_loss_clip": 0.0108658, + "auxiliary_loss_mlp": 0.01031877, + "balance_loss_clip": 1.03270638, + "balance_loss_mlp": 1.02046323, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.642548584286028, + "language_loss": 0.72780728, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.74899185, + "num_input_tokens_seen": 322989395, + "step": 14976, + "time_per_iteration": 2.5322060585021973 + }, + { + "auxiliary_loss_clip": 0.01080806, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.03482795, + "balance_loss_mlp": 1.01857471, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.7044973767090854, + "language_loss": 0.69414598, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71524525, + "num_input_tokens_seen": 323009060, + "step": 14977, + "time_per_iteration": 2.5846292972564697 + }, + { + "auxiliary_loss_clip": 0.0106925, + "auxiliary_loss_mlp": 0.00749531, + "balance_loss_clip": 1.03257632, + "balance_loss_mlp": 1.00021911, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.1766110765848685, + "language_loss": 0.65585995, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67404771, + "num_input_tokens_seen": 323027530, + "step": 14978, + "time_per_iteration": 2.539048910140991 + }, + { + "auxiliary_loss_clip": 0.01064322, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.03248978, + "balance_loss_mlp": 1.02262294, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 2.4644173449968423, + "language_loss": 0.78868759, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.80967784, + "num_input_tokens_seen": 323045370, + "step": 14979, + "time_per_iteration": 2.6067006587982178 + }, + { + "auxiliary_loss_clip": 0.01003657, + "auxiliary_loss_mlp": 0.01001771, + "balance_loss_clip": 1.00365829, + "balance_loss_mlp": 1.00084734, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7256682593839564, + "language_loss": 0.53591025, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55596447, + "num_input_tokens_seen": 323105660, + "step": 14980, + "time_per_iteration": 3.1683027744293213 + }, + { + "auxiliary_loss_clip": 0.01090522, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.03496957, + "balance_loss_mlp": 1.02435696, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.8230975957334588, + "language_loss": 0.82240343, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84366816, + "num_input_tokens_seen": 323126365, + "step": 14981, + "time_per_iteration": 2.58604097366333 + }, + { + "auxiliary_loss_clip": 0.01052038, + "auxiliary_loss_mlp": 0.01031787, + "balance_loss_clip": 1.03081465, + "balance_loss_mlp": 1.02084339, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 1.7187171376857486, + "language_loss": 0.81401527, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83485347, + "num_input_tokens_seen": 323145655, + "step": 14982, + "time_per_iteration": 2.7702853679656982 + }, + { + "auxiliary_loss_clip": 0.01059431, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02896535, + "balance_loss_mlp": 1.02486396, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.6380737350327894, + "language_loss": 0.7174437, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73840249, + "num_input_tokens_seen": 323164540, + "step": 14983, + "time_per_iteration": 4.155966520309448 + }, + { + "auxiliary_loss_clip": 0.01085075, + "auxiliary_loss_mlp": 0.01025958, + "balance_loss_clip": 1.03425336, + "balance_loss_mlp": 1.01639771, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.3692838608417786, + "language_loss": 0.75025988, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77137029, + "num_input_tokens_seen": 323186960, + "step": 14984, + "time_per_iteration": 2.562804698944092 + }, + { + "auxiliary_loss_clip": 0.01092546, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.03119802, + "balance_loss_mlp": 1.01715064, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.866903607013304, + "language_loss": 0.7009033, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72210824, + "num_input_tokens_seen": 323206135, + "step": 14985, + "time_per_iteration": 2.517230987548828 + }, + { + "auxiliary_loss_clip": 0.01077081, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.03280592, + "balance_loss_mlp": 1.01753497, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 1.8326292835537519, + "language_loss": 0.70941073, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.73047072, + "num_input_tokens_seen": 323225980, + "step": 14986, + "time_per_iteration": 2.587718963623047 + }, + { + "auxiliary_loss_clip": 0.01083228, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.03029585, + "balance_loss_mlp": 1.02306604, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.7690173082939882, + "language_loss": 0.76613867, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.78731358, + "num_input_tokens_seen": 323243700, + "step": 14987, + "time_per_iteration": 2.555349111557007 + }, + { + "auxiliary_loss_clip": 0.01083145, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.03417456, + "balance_loss_mlp": 1.02150774, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 1.9464972591551963, + "language_loss": 0.73344654, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75460553, + "num_input_tokens_seen": 323261535, + "step": 14988, + "time_per_iteration": 2.5633304119110107 + }, + { + "auxiliary_loss_clip": 0.01074793, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.03505313, + "balance_loss_mlp": 1.01902544, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 1.8524864782580992, + "language_loss": 0.69454283, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71559501, + "num_input_tokens_seen": 323281855, + "step": 14989, + "time_per_iteration": 4.072512626647949 + }, + { + "auxiliary_loss_clip": 0.01098311, + "auxiliary_loss_mlp": 0.010269, + "balance_loss_clip": 1.03424621, + "balance_loss_mlp": 1.01592088, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 1.971402580699098, + "language_loss": 0.80371714, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.82496923, + "num_input_tokens_seen": 323299505, + "step": 14990, + "time_per_iteration": 2.4473071098327637 + }, + { + "auxiliary_loss_clip": 0.01061488, + "auxiliary_loss_mlp": 0.01030389, + "balance_loss_clip": 1.03348064, + "balance_loss_mlp": 1.0186348, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 2.6010509714127736, + "language_loss": 0.77800477, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.79892349, + "num_input_tokens_seen": 323318365, + "step": 14991, + "time_per_iteration": 2.7460696697235107 + }, + { + "auxiliary_loss_clip": 0.00995504, + "auxiliary_loss_mlp": 0.00746605, + "balance_loss_clip": 1.0077002, + "balance_loss_mlp": 0.99979299, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7783830743418256, + "language_loss": 0.60206413, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.61948526, + "num_input_tokens_seen": 323371835, + "step": 14992, + "time_per_iteration": 3.0624425411224365 + }, + { + "auxiliary_loss_clip": 0.01083911, + "auxiliary_loss_mlp": 0.01026477, + "balance_loss_clip": 1.03291273, + "balance_loss_mlp": 1.01552796, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 2.0842744502931043, + "language_loss": 0.82827407, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.84937799, + "num_input_tokens_seen": 323388495, + "step": 14993, + "time_per_iteration": 2.532527208328247 + }, + { + "auxiliary_loss_clip": 0.01097325, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.03353167, + "balance_loss_mlp": 1.01825929, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 2.174915237498493, + "language_loss": 0.73238766, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75365484, + "num_input_tokens_seen": 323405280, + "step": 14994, + "time_per_iteration": 2.494267463684082 + }, + { + "auxiliary_loss_clip": 0.01082898, + "auxiliary_loss_mlp": 0.01023937, + "balance_loss_clip": 1.03243399, + "balance_loss_mlp": 1.01416218, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.7087867358863795, + "language_loss": 0.64831287, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.6693812, + "num_input_tokens_seen": 323425310, + "step": 14995, + "time_per_iteration": 2.5734055042266846 + }, + { + "auxiliary_loss_clip": 0.01051665, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.02693629, + "balance_loss_mlp": 1.02135611, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 1.7778678967790391, + "language_loss": 0.6659503, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68680179, + "num_input_tokens_seen": 323447805, + "step": 14996, + "time_per_iteration": 2.644249677658081 + }, + { + "auxiliary_loss_clip": 0.01082815, + "auxiliary_loss_mlp": 0.01025197, + "balance_loss_clip": 1.03039527, + "balance_loss_mlp": 1.01501107, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.6721734092863119, + "language_loss": 0.66121465, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68229473, + "num_input_tokens_seen": 323467150, + "step": 14997, + "time_per_iteration": 2.5398333072662354 + }, + { + "auxiliary_loss_clip": 0.01065486, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.02935255, + "balance_loss_mlp": 1.02292895, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 7.704835234294815, + "language_loss": 0.77845901, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.7994557, + "num_input_tokens_seen": 323484250, + "step": 14998, + "time_per_iteration": 2.5519092082977295 + }, + { + "auxiliary_loss_clip": 0.01096814, + "auxiliary_loss_mlp": 0.01028458, + "balance_loss_clip": 1.03286171, + "balance_loss_mlp": 1.01744282, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.6864416923172265, + "language_loss": 0.75157493, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77282768, + "num_input_tokens_seen": 323502910, + "step": 14999, + "time_per_iteration": 4.030239582061768 + }, + { + "auxiliary_loss_clip": 0.01032678, + "auxiliary_loss_mlp": 0.01024987, + "balance_loss_clip": 1.03277838, + "balance_loss_mlp": 1.01389432, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.6271336204425437, + "language_loss": 0.75957978, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.78015643, + "num_input_tokens_seen": 323521820, + "step": 15000, + "time_per_iteration": 2.6677727699279785 + }, + { + "auxiliary_loss_clip": 0.01078197, + "auxiliary_loss_mlp": 0.01024139, + "balance_loss_clip": 1.03229713, + "balance_loss_mlp": 1.01363707, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.12763806615069, + "language_loss": 0.81203723, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.83306062, + "num_input_tokens_seen": 323543200, + "step": 15001, + "time_per_iteration": 2.8263943195343018 + }, + { + "auxiliary_loss_clip": 0.01064705, + "auxiliary_loss_mlp": 0.01024777, + "balance_loss_clip": 1.03352869, + "balance_loss_mlp": 1.01488316, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.388718255621588, + "language_loss": 0.78428066, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80517554, + "num_input_tokens_seen": 323563075, + "step": 15002, + "time_per_iteration": 2.6442782878875732 + }, + { + "auxiliary_loss_clip": 0.01069314, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.03400993, + "balance_loss_mlp": 1.02272391, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 1.9929567006169806, + "language_loss": 0.68589425, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70694458, + "num_input_tokens_seen": 323579065, + "step": 15003, + "time_per_iteration": 2.5618910789489746 + }, + { + "auxiliary_loss_clip": 0.01070813, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.03091002, + "balance_loss_mlp": 1.02317405, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 1.6791035029566517, + "language_loss": 0.85987365, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88092661, + "num_input_tokens_seen": 323594835, + "step": 15004, + "time_per_iteration": 2.583446741104126 + }, + { + "auxiliary_loss_clip": 0.01061068, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.03217673, + "balance_loss_mlp": 1.02061307, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 3.085426277878096, + "language_loss": 0.72745788, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74838388, + "num_input_tokens_seen": 323611475, + "step": 15005, + "time_per_iteration": 2.6483166217803955 + }, + { + "auxiliary_loss_clip": 0.01086867, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.03020561, + "balance_loss_mlp": 1.01621687, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.8436816384053825, + "language_loss": 0.70977235, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73091412, + "num_input_tokens_seen": 323629730, + "step": 15006, + "time_per_iteration": 2.5535576343536377 + }, + { + "auxiliary_loss_clip": 0.01073853, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.03147554, + "balance_loss_mlp": 1.01760542, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 1.79432732727277, + "language_loss": 0.84294331, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86396587, + "num_input_tokens_seen": 323646000, + "step": 15007, + "time_per_iteration": 2.540701389312744 + }, + { + "auxiliary_loss_clip": 0.01065004, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.03021097, + "balance_loss_mlp": 1.01859951, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.6426117615045226, + "language_loss": 0.78423303, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80518413, + "num_input_tokens_seen": 323667250, + "step": 15008, + "time_per_iteration": 2.625974416732788 + }, + { + "auxiliary_loss_clip": 0.01051004, + "auxiliary_loss_mlp": 0.01030507, + "balance_loss_clip": 1.03235698, + "balance_loss_mlp": 1.02000439, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.8563038458454793, + "language_loss": 0.73368889, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75450397, + "num_input_tokens_seen": 323687150, + "step": 15009, + "time_per_iteration": 2.619251251220703 + }, + { + "auxiliary_loss_clip": 0.01095743, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.03274846, + "balance_loss_mlp": 1.02307379, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.956558075648999, + "language_loss": 0.72944039, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75074065, + "num_input_tokens_seen": 323703660, + "step": 15010, + "time_per_iteration": 2.485168933868408 + }, + { + "auxiliary_loss_clip": 0.01071223, + "auxiliary_loss_mlp": 0.01029252, + "balance_loss_clip": 1.03377223, + "balance_loss_mlp": 1.01777768, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 2.6839422258885457, + "language_loss": 0.74330652, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76431131, + "num_input_tokens_seen": 323722060, + "step": 15011, + "time_per_iteration": 2.5587315559387207 + }, + { + "auxiliary_loss_clip": 0.0108591, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.03152847, + "balance_loss_mlp": 1.01828408, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 5.8150769754225475, + "language_loss": 0.73069215, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75184774, + "num_input_tokens_seen": 323740645, + "step": 15012, + "time_per_iteration": 2.522167921066284 + }, + { + "auxiliary_loss_clip": 0.01063958, + "auxiliary_loss_mlp": 0.01033789, + "balance_loss_clip": 1.03222466, + "balance_loss_mlp": 1.02294064, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.853720096431462, + "language_loss": 0.69337898, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71435642, + "num_input_tokens_seen": 323758905, + "step": 15013, + "time_per_iteration": 4.079518795013428 + }, + { + "auxiliary_loss_clip": 0.01083005, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.03249264, + "balance_loss_mlp": 1.01953185, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.4589263971417077, + "language_loss": 0.73066866, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75179601, + "num_input_tokens_seen": 323780595, + "step": 15014, + "time_per_iteration": 2.588761329650879 + }, + { + "auxiliary_loss_clip": 0.0109583, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.03263211, + "balance_loss_mlp": 1.01803136, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 1.8795962650709983, + "language_loss": 0.72122693, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74247897, + "num_input_tokens_seen": 323798160, + "step": 15015, + "time_per_iteration": 2.4591941833496094 + }, + { + "auxiliary_loss_clip": 0.01071015, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.01785135, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 1.8900444727834949, + "language_loss": 0.68967533, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71067357, + "num_input_tokens_seen": 323816810, + "step": 15016, + "time_per_iteration": 2.6216278076171875 + }, + { + "auxiliary_loss_clip": 0.01095324, + "auxiliary_loss_mlp": 0.01025403, + "balance_loss_clip": 1.03299975, + "balance_loss_mlp": 1.01552045, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 2.746550142126488, + "language_loss": 0.70306444, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72427166, + "num_input_tokens_seen": 323836900, + "step": 15017, + "time_per_iteration": 2.5859150886535645 + }, + { + "auxiliary_loss_clip": 0.01075797, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.03711736, + "balance_loss_mlp": 1.02092767, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.6193015719637276, + "language_loss": 0.69581592, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71688807, + "num_input_tokens_seen": 323855325, + "step": 15018, + "time_per_iteration": 2.5513973236083984 + }, + { + "auxiliary_loss_clip": 0.01087316, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.03454638, + "balance_loss_mlp": 1.01912272, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 1.7632961605037376, + "language_loss": 0.69230598, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71348345, + "num_input_tokens_seen": 323875650, + "step": 15019, + "time_per_iteration": 2.5384724140167236 + }, + { + "auxiliary_loss_clip": 0.01094384, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.03272164, + "balance_loss_mlp": 1.02081728, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.8626746332831856, + "language_loss": 0.71728849, + "learning_rate": 9.773057299808951e-08, + "loss": 0.73854798, + "num_input_tokens_seen": 323892920, + "step": 15020, + "time_per_iteration": 2.431809663772583 + }, + { + "auxiliary_loss_clip": 0.01080966, + "auxiliary_loss_mlp": 0.01030747, + "balance_loss_clip": 1.03124869, + "balance_loss_mlp": 1.01941037, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5216890737140074, + "language_loss": 0.74492073, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76603794, + "num_input_tokens_seen": 323913835, + "step": 15021, + "time_per_iteration": 2.5510494709014893 + }, + { + "auxiliary_loss_clip": 0.01101702, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.03536201, + "balance_loss_mlp": 1.01805806, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 3.3050815374363625, + "language_loss": 0.7276696, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74898255, + "num_input_tokens_seen": 323933440, + "step": 15022, + "time_per_iteration": 2.5378918647766113 + }, + { + "auxiliary_loss_clip": 0.01054767, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.03030741, + "balance_loss_mlp": 1.01656246, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 2.8380316662484097, + "language_loss": 0.72905874, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74987566, + "num_input_tokens_seen": 323954090, + "step": 15023, + "time_per_iteration": 2.63443660736084 + }, + { + "auxiliary_loss_clip": 0.0108421, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.03446531, + "balance_loss_mlp": 1.01664674, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 2.034819184365762, + "language_loss": 0.82693994, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84805369, + "num_input_tokens_seen": 323974040, + "step": 15024, + "time_per_iteration": 3.9825479984283447 + }, + { + "auxiliary_loss_clip": 0.01087674, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.03312969, + "balance_loss_mlp": 1.0187093, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.81919067326989, + "language_loss": 0.69650829, + "learning_rate": 9.713019223328966e-08, + "loss": 0.7176773, + "num_input_tokens_seen": 323996125, + "step": 15025, + "time_per_iteration": 2.5156078338623047 + }, + { + "auxiliary_loss_clip": 0.01062223, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.03099108, + "balance_loss_mlp": 1.02026153, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 1.7303689841111394, + "language_loss": 0.76955855, + "learning_rate": 9.70103325331717e-08, + "loss": 0.79048777, + "num_input_tokens_seen": 324017645, + "step": 15026, + "time_per_iteration": 2.6301157474517822 + }, + { + "auxiliary_loss_clip": 0.01086315, + "auxiliary_loss_mlp": 0.01029261, + "balance_loss_clip": 1.03459477, + "balance_loss_mlp": 1.01900339, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 7.876896846916101, + "language_loss": 0.68431896, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70547473, + "num_input_tokens_seen": 324036875, + "step": 15027, + "time_per_iteration": 2.5123369693756104 + }, + { + "auxiliary_loss_clip": 0.0103658, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02866983, + "balance_loss_mlp": 1.02239704, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.5545492781503525, + "language_loss": 0.7575559, + "learning_rate": 9.677082962215477e-08, + "loss": 0.77826548, + "num_input_tokens_seen": 324057045, + "step": 15028, + "time_per_iteration": 2.6644914150238037 + }, + { + "auxiliary_loss_clip": 0.01040475, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.03011799, + "balance_loss_mlp": 1.02524722, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 2.116119480333872, + "language_loss": 0.69420826, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71497715, + "num_input_tokens_seen": 324079735, + "step": 15029, + "time_per_iteration": 4.318303346633911 + }, + { + "auxiliary_loss_clip": 0.01084701, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.03529561, + "balance_loss_mlp": 1.01776505, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 1.8742268532023216, + "language_loss": 0.73624748, + "learning_rate": 9.653161539369858e-08, + "loss": 0.75738847, + "num_input_tokens_seen": 324097785, + "step": 15030, + "time_per_iteration": 2.5039150714874268 + }, + { + "auxiliary_loss_clip": 0.01090736, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.03518164, + "balance_loss_mlp": 1.01730919, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 2.8888290194253545, + "language_loss": 0.6803211, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70151222, + "num_input_tokens_seen": 324121625, + "step": 15031, + "time_per_iteration": 2.728616237640381 + }, + { + "auxiliary_loss_clip": 0.01073025, + "auxiliary_loss_mlp": 0.01024224, + "balance_loss_clip": 1.03151584, + "balance_loss_mlp": 1.01359034, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.6548365725164573, + "language_loss": 0.76372117, + "learning_rate": 9.629268988408723e-08, + "loss": 0.7846936, + "num_input_tokens_seen": 324142535, + "step": 15032, + "time_per_iteration": 2.5618762969970703 + }, + { + "auxiliary_loss_clip": 0.01097765, + "auxiliary_loss_mlp": 0.01031216, + "balance_loss_clip": 1.03369296, + "balance_loss_mlp": 1.01993251, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 1.922897647247885, + "language_loss": 0.75284111, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77413094, + "num_input_tokens_seen": 324159610, + "step": 15033, + "time_per_iteration": 2.475003242492676 + }, + { + "auxiliary_loss_clip": 0.01057795, + "auxiliary_loss_mlp": 0.01033489, + "balance_loss_clip": 1.02884436, + "balance_loss_mlp": 1.02178907, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 2.531537195744258, + "language_loss": 0.73797786, + "learning_rate": 9.605405312956105e-08, + "loss": 0.75889063, + "num_input_tokens_seen": 324182510, + "step": 15034, + "time_per_iteration": 2.6727795600891113 + }, + { + "auxiliary_loss_clip": 0.01060678, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.03205967, + "balance_loss_mlp": 1.02149081, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.5782431129955083, + "language_loss": 0.63752103, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65845275, + "num_input_tokens_seen": 324200555, + "step": 15035, + "time_per_iteration": 2.5891127586364746 + }, + { + "auxiliary_loss_clip": 0.01097605, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.034688, + "balance_loss_mlp": 1.0205518, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 2.434792046604902, + "language_loss": 0.61409861, + "learning_rate": 9.581570516631643e-08, + "loss": 0.63539696, + "num_input_tokens_seen": 324220255, + "step": 15036, + "time_per_iteration": 2.5785109996795654 + }, + { + "auxiliary_loss_clip": 0.01043526, + "auxiliary_loss_mlp": 0.01024073, + "balance_loss_clip": 1.03193438, + "balance_loss_mlp": 1.013749, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.6147978691355067, + "language_loss": 0.82284307, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84351909, + "num_input_tokens_seen": 324237855, + "step": 15037, + "time_per_iteration": 2.641470193862915 + }, + { + "auxiliary_loss_clip": 0.01099217, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.03462291, + "balance_loss_mlp": 1.0159924, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 1.8090160070847279, + "language_loss": 0.67637855, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69764328, + "num_input_tokens_seen": 324257050, + "step": 15038, + "time_per_iteration": 2.4928359985351562 + }, + { + "auxiliary_loss_clip": 0.01072571, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.03062987, + "balance_loss_mlp": 1.02268124, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 2.0030803082921884, + "language_loss": 0.75521702, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77628094, + "num_input_tokens_seen": 324275510, + "step": 15039, + "time_per_iteration": 4.127467393875122 + }, + { + "auxiliary_loss_clip": 0.01071067, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.03305435, + "balance_loss_mlp": 1.01809406, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.697803887399249, + "language_loss": 0.69921935, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72021651, + "num_input_tokens_seen": 324295150, + "step": 15040, + "time_per_iteration": 2.566504955291748 + }, + { + "auxiliary_loss_clip": 0.01061976, + "auxiliary_loss_mlp": 0.01025065, + "balance_loss_clip": 1.03141761, + "balance_loss_mlp": 1.01472938, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.6972252452070795, + "language_loss": 0.67742312, + "learning_rate": 9.522109895720709e-08, + "loss": 0.69829357, + "num_input_tokens_seen": 324313855, + "step": 15041, + "time_per_iteration": 2.550572156906128 + }, + { + "auxiliary_loss_clip": 0.0108487, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.03226769, + "balance_loss_mlp": 1.01778913, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 1.6322866486549115, + "language_loss": 0.57464445, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59578097, + "num_input_tokens_seen": 324338465, + "step": 15042, + "time_per_iteration": 2.6481447219848633 + }, + { + "auxiliary_loss_clip": 0.01003383, + "auxiliary_loss_mlp": 0.00746629, + "balance_loss_clip": 1.00345349, + "balance_loss_mlp": 0.99987966, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7746875389702086, + "language_loss": 0.56933802, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58683813, + "num_input_tokens_seen": 324398740, + "step": 15043, + "time_per_iteration": 3.1037135124206543 + }, + { + "auxiliary_loss_clip": 0.01076633, + "auxiliary_loss_mlp": 0.01028057, + "balance_loss_clip": 1.03321385, + "balance_loss_mlp": 1.01610637, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 2.730923402190087, + "language_loss": 0.70011878, + "learning_rate": 9.486520194855274e-08, + "loss": 0.72116566, + "num_input_tokens_seen": 324417335, + "step": 15044, + "time_per_iteration": 2.5334043502807617 + }, + { + "auxiliary_loss_clip": 0.01078933, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.03456879, + "balance_loss_mlp": 1.02379382, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.143629484913233, + "language_loss": 0.69784379, + "learning_rate": 9.474671409214407e-08, + "loss": 0.71898937, + "num_input_tokens_seen": 324433240, + "step": 15045, + "time_per_iteration": 2.507197380065918 + }, + { + "auxiliary_loss_clip": 0.01064779, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.03295314, + "balance_loss_mlp": 1.02178907, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 1.7689910444860235, + "language_loss": 0.6519016, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67288041, + "num_input_tokens_seen": 324452675, + "step": 15046, + "time_per_iteration": 2.627535343170166 + }, + { + "auxiliary_loss_clip": 0.01064917, + "auxiliary_loss_mlp": 0.01033199, + "balance_loss_clip": 1.03101301, + "balance_loss_mlp": 1.02213025, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 3.4457135260725957, + "language_loss": 0.61714929, + "learning_rate": 9.450995512600379e-08, + "loss": 0.63813043, + "num_input_tokens_seen": 324467865, + "step": 15047, + "time_per_iteration": 2.5228381156921387 + }, + { + "auxiliary_loss_clip": 0.01097534, + "auxiliary_loss_mlp": 0.00749348, + "balance_loss_clip": 1.03519988, + "balance_loss_mlp": 1.00022221, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.4801327475542285, + "language_loss": 0.71363258, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73210144, + "num_input_tokens_seen": 324490430, + "step": 15048, + "time_per_iteration": 2.5303754806518555 + }, + { + "auxiliary_loss_clip": 0.01086025, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.03098404, + "balance_loss_mlp": 1.01877022, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.7203016245923353, + "language_loss": 0.75561184, + "learning_rate": 9.427348518535483e-08, + "loss": 0.77677488, + "num_input_tokens_seen": 324506620, + "step": 15049, + "time_per_iteration": 2.485408306121826 + }, + { + "auxiliary_loss_clip": 0.01085853, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.03660953, + "balance_loss_mlp": 1.01724815, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.659023020558645, + "language_loss": 0.75707912, + "learning_rate": 9.415535861079993e-08, + "loss": 0.7782203, + "num_input_tokens_seen": 324525505, + "step": 15050, + "time_per_iteration": 2.5695252418518066 + }, + { + "auxiliary_loss_clip": 0.01097012, + "auxiliary_loss_mlp": 0.00749314, + "balance_loss_clip": 1.03330517, + "balance_loss_mlp": 1.0002234, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.832160885630997, + "language_loss": 0.82125938, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83972263, + "num_input_tokens_seen": 324544415, + "step": 15051, + "time_per_iteration": 2.5272316932678223 + }, + { + "auxiliary_loss_clip": 0.0108556, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.03285408, + "balance_loss_mlp": 1.01723099, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.0145960298374694, + "language_loss": 0.8923341, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91346312, + "num_input_tokens_seen": 324562555, + "step": 15052, + "time_per_iteration": 2.485363721847534 + }, + { + "auxiliary_loss_clip": 0.01087863, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.03356946, + "balance_loss_mlp": 1.02279329, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 1.8790666431138754, + "language_loss": 0.7725932, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79380745, + "num_input_tokens_seen": 324580865, + "step": 15053, + "time_per_iteration": 2.4929757118225098 + }, + { + "auxiliary_loss_clip": 0.01085041, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.03350592, + "balance_loss_mlp": 1.02110672, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 2.1069661727174345, + "language_loss": 0.72655141, + "learning_rate": 9.368357505553049e-08, + "loss": 0.7477209, + "num_input_tokens_seen": 324600665, + "step": 15054, + "time_per_iteration": 4.007822751998901 + }, + { + "auxiliary_loss_clip": 0.01038335, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.02824426, + "balance_loss_mlp": 1.01843989, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.6343103172812365, + "language_loss": 0.82974815, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85042721, + "num_input_tokens_seen": 324618145, + "step": 15055, + "time_per_iteration": 2.6066300868988037 + }, + { + "auxiliary_loss_clip": 0.01085044, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.03370118, + "balance_loss_mlp": 1.0229156, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.7605145558166888, + "language_loss": 0.85115838, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87235278, + "num_input_tokens_seen": 324638165, + "step": 15056, + "time_per_iteration": 2.5395514965057373 + }, + { + "auxiliary_loss_clip": 0.01066975, + "auxiliary_loss_mlp": 0.01027598, + "balance_loss_clip": 1.0319438, + "balance_loss_mlp": 1.01726818, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.9619714970906235, + "language_loss": 0.72314703, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74409276, + "num_input_tokens_seen": 324658560, + "step": 15057, + "time_per_iteration": 2.6127076148986816 + }, + { + "auxiliary_loss_clip": 0.01080992, + "auxiliary_loss_mlp": 0.01025653, + "balance_loss_clip": 1.03116965, + "balance_loss_mlp": 1.0150249, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.690209179103118, + "language_loss": 0.80857635, + "learning_rate": 9.321294810356418e-08, + "loss": 0.82964283, + "num_input_tokens_seen": 324679185, + "step": 15058, + "time_per_iteration": 2.5275187492370605 + }, + { + "auxiliary_loss_clip": 0.01014334, + "auxiliary_loss_mlp": 0.01002769, + "balance_loss_clip": 1.00464749, + "balance_loss_mlp": 1.00177979, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6755386205976668, + "language_loss": 0.51369333, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53386444, + "num_input_tokens_seen": 324744830, + "step": 15059, + "time_per_iteration": 3.236051082611084 + }, + { + "auxiliary_loss_clip": 0.01058541, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.0346384, + "balance_loss_mlp": 1.01670289, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 1.7023332812351355, + "language_loss": 0.67140239, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69226509, + "num_input_tokens_seen": 324762905, + "step": 15060, + "time_per_iteration": 2.63950777053833 + }, + { + "auxiliary_loss_clip": 0.01077418, + "auxiliary_loss_mlp": 0.01028512, + "balance_loss_clip": 1.03312874, + "balance_loss_mlp": 1.01793242, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 2.127839837186322, + "language_loss": 0.64129078, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66235, + "num_input_tokens_seen": 324781905, + "step": 15061, + "time_per_iteration": 2.5683250427246094 + }, + { + "auxiliary_loss_clip": 0.01070882, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.03411627, + "balance_loss_mlp": 1.02241611, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 2.168038862110131, + "language_loss": 0.71557546, + "learning_rate": 9.274347804044058e-08, + "loss": 0.73661816, + "num_input_tokens_seen": 324799260, + "step": 15062, + "time_per_iteration": 2.633409023284912 + }, + { + "auxiliary_loss_clip": 0.01095281, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.03281641, + "balance_loss_mlp": 1.01875138, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.563181336760141, + "language_loss": 0.70760894, + "learning_rate": 9.2626291321936e-08, + "loss": 0.72885257, + "num_input_tokens_seen": 324817800, + "step": 15063, + "time_per_iteration": 2.5048656463623047 + }, + { + "auxiliary_loss_clip": 0.01052206, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.02967727, + "balance_loss_mlp": 1.01694655, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 1.6542528373311596, + "language_loss": 0.7220211, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74281526, + "num_input_tokens_seen": 324838445, + "step": 15064, + "time_per_iteration": 2.644819736480713 + }, + { + "auxiliary_loss_clip": 0.0108743, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.03184152, + "balance_loss_mlp": 1.02099264, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 3.0223911611531604, + "language_loss": 0.6952188, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71640927, + "num_input_tokens_seen": 324859895, + "step": 15065, + "time_per_iteration": 4.094729661941528 + }, + { + "auxiliary_loss_clip": 0.0106323, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.03243732, + "balance_loss_mlp": 1.02178156, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.4630388552494882, + "language_loss": 0.62913823, + "learning_rate": 9.227516515099743e-08, + "loss": 0.65009356, + "num_input_tokens_seen": 324879580, + "step": 15066, + "time_per_iteration": 2.5621793270111084 + }, + { + "auxiliary_loss_clip": 0.01018258, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.02532887, + "balance_loss_mlp": 1.01940501, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 2.520205547429851, + "language_loss": 0.80169666, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82220107, + "num_input_tokens_seen": 324898950, + "step": 15067, + "time_per_iteration": 2.7320961952209473 + }, + { + "auxiliary_loss_clip": 0.01077038, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.03298771, + "balance_loss_mlp": 1.01689768, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.514267858205333, + "language_loss": 0.70003068, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72108912, + "num_input_tokens_seen": 324917455, + "step": 15068, + "time_per_iteration": 4.068300485610962 + }, + { + "auxiliary_loss_clip": 0.01093389, + "auxiliary_loss_mlp": 0.0102581, + "balance_loss_clip": 1.03181362, + "balance_loss_mlp": 1.01486063, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 1.8039031239076364, + "language_loss": 0.85248715, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87367916, + "num_input_tokens_seen": 324934495, + "step": 15069, + "time_per_iteration": 2.542564868927002 + }, + { + "auxiliary_loss_clip": 0.01087452, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.03194356, + "balance_loss_mlp": 1.01729941, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 2.1438194599891194, + "language_loss": 0.59346592, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61463284, + "num_input_tokens_seen": 324953230, + "step": 15070, + "time_per_iteration": 2.5312063694000244 + }, + { + "auxiliary_loss_clip": 0.01062319, + "auxiliary_loss_mlp": 0.01022606, + "balance_loss_clip": 1.03200984, + "balance_loss_mlp": 1.01088166, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 2.0340825521315895, + "language_loss": 0.81546348, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83631271, + "num_input_tokens_seen": 324969880, + "step": 15071, + "time_per_iteration": 2.5928194522857666 + }, + { + "auxiliary_loss_clip": 0.01099508, + "auxiliary_loss_mlp": 0.010364, + "balance_loss_clip": 1.0330472, + "balance_loss_mlp": 1.02448475, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.8002062103620369, + "language_loss": 0.62110126, + "learning_rate": 9.157486613883758e-08, + "loss": 0.64246029, + "num_input_tokens_seen": 324987005, + "step": 15072, + "time_per_iteration": 2.424513339996338 + }, + { + "auxiliary_loss_clip": 0.01074828, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.0312531, + "balance_loss_mlp": 1.02484775, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.717703691896784, + "language_loss": 0.7340014, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75510788, + "num_input_tokens_seen": 325010700, + "step": 15073, + "time_per_iteration": 2.826763153076172 + }, + { + "auxiliary_loss_clip": 0.01083567, + "auxiliary_loss_mlp": 0.01023577, + "balance_loss_clip": 1.03283191, + "balance_loss_mlp": 1.01345587, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 1.8744607731455714, + "language_loss": 0.81033307, + "learning_rate": 9.134201202899161e-08, + "loss": 0.83140451, + "num_input_tokens_seen": 325028760, + "step": 15074, + "time_per_iteration": 2.5285604000091553 + }, + { + "auxiliary_loss_clip": 0.00975188, + "auxiliary_loss_mlp": 0.00746585, + "balance_loss_clip": 1.00533175, + "balance_loss_mlp": 0.99975187, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7477460873018781, + "language_loss": 0.5230155, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54023319, + "num_input_tokens_seen": 325093545, + "step": 15075, + "time_per_iteration": 3.2860043048858643 + }, + { + "auxiliary_loss_clip": 0.00995092, + "auxiliary_loss_mlp": 0.01002832, + "balance_loss_clip": 1.00749564, + "balance_loss_mlp": 1.0018543, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7328480483053535, + "language_loss": 0.62139189, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64137113, + "num_input_tokens_seen": 325152295, + "step": 15076, + "time_per_iteration": 3.083474636077881 + }, + { + "auxiliary_loss_clip": 0.01084363, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.03184855, + "balance_loss_mlp": 1.02588749, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 1.9206699118460522, + "language_loss": 0.82414436, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84535468, + "num_input_tokens_seen": 325169705, + "step": 15077, + "time_per_iteration": 2.5954902172088623 + }, + { + "auxiliary_loss_clip": 0.01068648, + "auxiliary_loss_mlp": 0.00749171, + "balance_loss_clip": 1.02888894, + "balance_loss_mlp": 1.00025439, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 2.5473847461233876, + "language_loss": 0.84203762, + "learning_rate": 9.08771723625934e-08, + "loss": 0.86021578, + "num_input_tokens_seen": 325189175, + "step": 15078, + "time_per_iteration": 2.576801061630249 + }, + { + "auxiliary_loss_clip": 0.01082422, + "auxiliary_loss_mlp": 0.0074928, + "balance_loss_clip": 1.03340888, + "balance_loss_mlp": 1.00018752, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.3985941435818445, + "language_loss": 0.65227431, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67059135, + "num_input_tokens_seen": 325211020, + "step": 15079, + "time_per_iteration": 4.076008081436157 + }, + { + "auxiliary_loss_clip": 0.01015151, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.02685905, + "balance_loss_mlp": 1.0150919, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.5420517329932595, + "language_loss": 0.70684129, + "learning_rate": 9.064518687654765e-08, + "loss": 0.72725356, + "num_input_tokens_seen": 325236970, + "step": 15080, + "time_per_iteration": 2.8899219036102295 + }, + { + "auxiliary_loss_clip": 0.01084147, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.03469539, + "balance_loss_mlp": 1.01777816, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.240156218209493, + "language_loss": 0.70874023, + "learning_rate": 9.052930273571547e-08, + "loss": 0.72987533, + "num_input_tokens_seen": 325252670, + "step": 15081, + "time_per_iteration": 2.492901086807251 + }, + { + "auxiliary_loss_clip": 0.01071012, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.03404343, + "balance_loss_mlp": 1.01827252, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 1.9647481064551782, + "language_loss": 0.74458075, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76558292, + "num_input_tokens_seen": 325273860, + "step": 15082, + "time_per_iteration": 2.539189577102661 + }, + { + "auxiliary_loss_clip": 0.01062437, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.03244996, + "balance_loss_mlp": 1.02070856, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 1.6520489783826162, + "language_loss": 0.7809248, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80185765, + "num_input_tokens_seen": 325294140, + "step": 15083, + "time_per_iteration": 2.6699318885803223 + }, + { + "auxiliary_loss_clip": 0.01073268, + "auxiliary_loss_mlp": 0.00749087, + "balance_loss_clip": 1.03477728, + "balance_loss_mlp": 1.00016236, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.6884106492660949, + "language_loss": 0.68920386, + "learning_rate": 9.01820847747028e-08, + "loss": 0.70742738, + "num_input_tokens_seen": 325313130, + "step": 15084, + "time_per_iteration": 2.625676393508911 + }, + { + "auxiliary_loss_clip": 0.01098448, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03542805, + "balance_loss_mlp": 1.02022576, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 1.5230355230197932, + "language_loss": 0.66724777, + "learning_rate": 9.006649028948965e-08, + "loss": 0.68854368, + "num_input_tokens_seen": 325334880, + "step": 15085, + "time_per_iteration": 2.530062437057495 + }, + { + "auxiliary_loss_clip": 0.00997553, + "auxiliary_loss_mlp": 0.01002324, + "balance_loss_clip": 1.01275051, + "balance_loss_mlp": 1.00110781, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7714523554313665, + "language_loss": 0.61280388, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63280267, + "num_input_tokens_seen": 325394175, + "step": 15086, + "time_per_iteration": 3.163740873336792 + }, + { + "auxiliary_loss_clip": 0.01080146, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.03157306, + "balance_loss_mlp": 1.02581501, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.5201979977297744, + "language_loss": 0.72232485, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74350107, + "num_input_tokens_seen": 325415020, + "step": 15087, + "time_per_iteration": 2.543936014175415 + }, + { + "auxiliary_loss_clip": 0.01072438, + "auxiliary_loss_mlp": 0.01026637, + "balance_loss_clip": 1.03037095, + "balance_loss_mlp": 1.01605153, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 2.196645957077643, + "language_loss": 0.76468325, + "learning_rate": 8.972014140059058e-08, + "loss": 0.78567398, + "num_input_tokens_seen": 325433595, + "step": 15088, + "time_per_iteration": 2.536555051803589 + }, + { + "auxiliary_loss_clip": 0.01060432, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.0299468, + "balance_loss_mlp": 1.01618731, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.7813853249816978, + "language_loss": 0.73602003, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75689203, + "num_input_tokens_seen": 325451605, + "step": 15089, + "time_per_iteration": 2.633744478225708 + }, + { + "auxiliary_loss_clip": 0.01092626, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.03299594, + "balance_loss_mlp": 1.022614, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 2.1505445350705115, + "language_loss": 0.75420469, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77545571, + "num_input_tokens_seen": 325470645, + "step": 15090, + "time_per_iteration": 2.5294718742370605 + }, + { + "auxiliary_loss_clip": 0.01067501, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.03141654, + "balance_loss_mlp": 1.0156014, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.469630230229768, + "language_loss": 0.77829123, + "learning_rate": 8.93744444537079e-08, + "loss": 0.7992422, + "num_input_tokens_seen": 325488070, + "step": 15091, + "time_per_iteration": 2.5746376514434814 + }, + { + "auxiliary_loss_clip": 0.01066224, + "auxiliary_loss_mlp": 0.01023508, + "balance_loss_clip": 1.02863348, + "balance_loss_mlp": 1.01376879, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.6708226999133553, + "language_loss": 0.85792291, + "learning_rate": 8.925935703448217e-08, + "loss": 0.87882024, + "num_input_tokens_seen": 325509285, + "step": 15092, + "time_per_iteration": 2.616211414337158 + }, + { + "auxiliary_loss_clip": 0.01074877, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.03535903, + "balance_loss_mlp": 1.01786518, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.8360045740271518, + "language_loss": 0.78676975, + "learning_rate": 8.914434207073296e-08, + "loss": 0.8078078, + "num_input_tokens_seen": 325529360, + "step": 15093, + "time_per_iteration": 2.6368446350097656 + }, + { + "auxiliary_loss_clip": 0.01013401, + "auxiliary_loss_mlp": 0.01001205, + "balance_loss_clip": 1.00301135, + "balance_loss_mlp": 1.00020361, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7355655081555241, + "language_loss": 0.56967163, + "learning_rate": 8.902939956682188e-08, + "loss": 0.58981764, + "num_input_tokens_seen": 325583565, + "step": 15094, + "time_per_iteration": 4.402063369750977 + }, + { + "auxiliary_loss_clip": 0.01087099, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.03309298, + "balance_loss_mlp": 1.02045059, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 1.7894429147905466, + "language_loss": 0.71270609, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73390079, + "num_input_tokens_seen": 325603690, + "step": 15095, + "time_per_iteration": 2.4849720001220703 + }, + { + "auxiliary_loss_clip": 0.01052293, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.03149605, + "balance_loss_mlp": 1.01962018, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 1.914363201683611, + "language_loss": 0.74336702, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76419914, + "num_input_tokens_seen": 325622255, + "step": 15096, + "time_per_iteration": 2.5677742958068848 + }, + { + "auxiliary_loss_clip": 0.01098585, + "auxiliary_loss_mlp": 0.01034923, + "balance_loss_clip": 1.03377867, + "balance_loss_mlp": 1.02247727, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 1.79736209506159, + "language_loss": 0.5670023, + "learning_rate": 8.868500685768898e-08, + "loss": 0.58833736, + "num_input_tokens_seen": 325640165, + "step": 15097, + "time_per_iteration": 2.5058653354644775 + }, + { + "auxiliary_loss_clip": 0.01073559, + "auxiliary_loss_mlp": 0.01023129, + "balance_loss_clip": 1.02997088, + "balance_loss_mlp": 1.01282918, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.5323840628796117, + "language_loss": 0.79646099, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81742781, + "num_input_tokens_seen": 325659455, + "step": 15098, + "time_per_iteration": 2.489230155944824 + }, + { + "auxiliary_loss_clip": 0.01051965, + "auxiliary_loss_mlp": 0.0074947, + "balance_loss_clip": 1.03084564, + "balance_loss_mlp": 1.0002656, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 4.207539509252843, + "language_loss": 0.65884292, + "learning_rate": 8.845577409729266e-08, + "loss": 0.67685723, + "num_input_tokens_seen": 325678095, + "step": 15099, + "time_per_iteration": 2.663367986679077 + }, + { + "auxiliary_loss_clip": 0.01075346, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.03252304, + "balance_loss_mlp": 1.02090764, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 2.1623287645833775, + "language_loss": 0.7019546, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72303021, + "num_input_tokens_seen": 325695825, + "step": 15100, + "time_per_iteration": 2.5672032833099365 + }, + { + "auxiliary_loss_clip": 0.01012956, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 1.00318933, + "balance_loss_mlp": 1.00013566, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6261104952435416, + "language_loss": 0.53441238, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55455154, + "num_input_tokens_seen": 325764515, + "step": 15101, + "time_per_iteration": 3.1436684131622314 + }, + { + "auxiliary_loss_clip": 0.01059134, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.0304476, + "balance_loss_mlp": 1.01812959, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.8921524860826338, + "language_loss": 0.6838491, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70473409, + "num_input_tokens_seen": 325783235, + "step": 15102, + "time_per_iteration": 2.585878610610962 + }, + { + "auxiliary_loss_clip": 0.01080625, + "auxiliary_loss_mlp": 0.01026328, + "balance_loss_clip": 1.03287947, + "balance_loss_mlp": 1.01538515, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 14.074568189814132, + "language_loss": 0.79360908, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81467867, + "num_input_tokens_seen": 325800195, + "step": 15103, + "time_per_iteration": 2.5251495838165283 + }, + { + "auxiliary_loss_clip": 0.01067348, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.03086853, + "balance_loss_mlp": 1.02009368, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.949421858288635, + "language_loss": 0.71724141, + "learning_rate": 8.78839607763413e-08, + "loss": 0.73822939, + "num_input_tokens_seen": 325820215, + "step": 15104, + "time_per_iteration": 4.025677919387817 + }, + { + "auxiliary_loss_clip": 0.01070084, + "auxiliary_loss_mlp": 0.01026076, + "balance_loss_clip": 1.03059781, + "balance_loss_mlp": 1.01640844, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 2.010351409265899, + "language_loss": 0.77344549, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79440713, + "num_input_tokens_seen": 325838415, + "step": 15105, + "time_per_iteration": 2.61299729347229 + }, + { + "auxiliary_loss_clip": 0.01094742, + "auxiliary_loss_mlp": 0.0074942, + "balance_loss_clip": 1.03157473, + "balance_loss_mlp": 1.00026822, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.840668925768322, + "language_loss": 0.736938, + "learning_rate": 8.765574297104628e-08, + "loss": 0.75537968, + "num_input_tokens_seen": 325855580, + "step": 15106, + "time_per_iteration": 2.4734067916870117 + }, + { + "auxiliary_loss_clip": 0.01041375, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.02754903, + "balance_loss_mlp": 1.02163649, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.7701665750563293, + "language_loss": 0.80261266, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82336056, + "num_input_tokens_seen": 325874890, + "step": 15107, + "time_per_iteration": 2.693424701690674 + }, + { + "auxiliary_loss_clip": 0.01005913, + "auxiliary_loss_mlp": 0.01004679, + "balance_loss_clip": 1.00881052, + "balance_loss_mlp": 1.00367725, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8172521063503628, + "language_loss": 0.59741735, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61752319, + "num_input_tokens_seen": 325935835, + "step": 15108, + "time_per_iteration": 4.682319402694702 + }, + { + "auxiliary_loss_clip": 0.01074204, + "auxiliary_loss_mlp": 0.01023832, + "balance_loss_clip": 1.0320847, + "balance_loss_mlp": 1.01304972, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.8518617767460643, + "language_loss": 0.73410058, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75508088, + "num_input_tokens_seen": 325958035, + "step": 15109, + "time_per_iteration": 2.650712490081787 + }, + { + "auxiliary_loss_clip": 0.01061034, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.02992725, + "balance_loss_mlp": 1.01791489, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 1.7869286522621706, + "language_loss": 0.71519923, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73608756, + "num_input_tokens_seen": 325979870, + "step": 15110, + "time_per_iteration": 2.6205952167510986 + }, + { + "auxiliary_loss_clip": 0.01062279, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.02791643, + "balance_loss_mlp": 1.02230287, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 1.77575188795706, + "language_loss": 0.68931878, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71028709, + "num_input_tokens_seen": 325998245, + "step": 15111, + "time_per_iteration": 2.647226333618164 + }, + { + "auxiliary_loss_clip": 0.00992935, + "auxiliary_loss_mlp": 0.01002078, + "balance_loss_clip": 1.00518334, + "balance_loss_mlp": 1.00106454, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.6896323988914419, + "language_loss": 0.52013421, + "learning_rate": 8.697283008425026e-08, + "loss": 0.54008436, + "num_input_tokens_seen": 326061770, + "step": 15112, + "time_per_iteration": 3.1695034503936768 + }, + { + "auxiliary_loss_clip": 0.0108513, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.03139138, + "balance_loss_mlp": 1.02232027, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 1.9716964694870704, + "language_loss": 0.6971401, + "learning_rate": 8.685926514226837e-08, + "loss": 0.71832293, + "num_input_tokens_seen": 326080945, + "step": 15113, + "time_per_iteration": 2.502466917037964 + }, + { + "auxiliary_loss_clip": 0.01086193, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.03360748, + "balance_loss_mlp": 1.01803374, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.019525505087304, + "language_loss": 0.78892469, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81007338, + "num_input_tokens_seen": 326100630, + "step": 15114, + "time_per_iteration": 2.655457019805908 + }, + { + "auxiliary_loss_clip": 0.01059357, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.03440332, + "balance_loss_mlp": 1.01719475, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 2.336085689663847, + "language_loss": 0.69564945, + "learning_rate": 8.663235290207405e-08, + "loss": 0.71653712, + "num_input_tokens_seen": 326120145, + "step": 15115, + "time_per_iteration": 2.643366575241089 + }, + { + "auxiliary_loss_clip": 0.01071191, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.03526998, + "balance_loss_mlp": 1.01761031, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.506949708824476, + "language_loss": 0.65931094, + "learning_rate": 8.651900561246561e-08, + "loss": 0.68031746, + "num_input_tokens_seen": 326140715, + "step": 15116, + "time_per_iteration": 2.6257643699645996 + }, + { + "auxiliary_loss_clip": 0.01095883, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.03455389, + "balance_loss_mlp": 1.01988316, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 2.679223453985599, + "language_loss": 0.69259393, + "learning_rate": 8.640573088224812e-08, + "loss": 0.7138617, + "num_input_tokens_seen": 326159130, + "step": 15117, + "time_per_iteration": 2.5588936805725098 + }, + { + "auxiliary_loss_clip": 0.01056643, + "auxiliary_loss_mlp": 0.0102656, + "balance_loss_clip": 1.03169322, + "balance_loss_mlp": 1.01586092, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.4125697107197386, + "language_loss": 0.74592096, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76675302, + "num_input_tokens_seen": 326181375, + "step": 15118, + "time_per_iteration": 4.146219730377197 + }, + { + "auxiliary_loss_clip": 0.01069062, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.03026104, + "balance_loss_mlp": 1.02111864, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 2.304278710542281, + "language_loss": 0.73265421, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75367868, + "num_input_tokens_seen": 326199740, + "step": 15119, + "time_per_iteration": 2.558579444885254 + }, + { + "auxiliary_loss_clip": 0.01060266, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.03309298, + "balance_loss_mlp": 1.01800239, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.3911958717644937, + "language_loss": 0.71337533, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73428774, + "num_input_tokens_seen": 326214350, + "step": 15120, + "time_per_iteration": 2.588643789291382 + }, + { + "auxiliary_loss_clip": 0.01097032, + "auxiliary_loss_mlp": 0.00749361, + "balance_loss_clip": 1.03313529, + "balance_loss_mlp": 1.00027239, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.1274915464436877, + "language_loss": 0.65716147, + "learning_rate": 8.595335764115596e-08, + "loss": 0.67562538, + "num_input_tokens_seen": 326234580, + "step": 15121, + "time_per_iteration": 2.499472141265869 + }, + { + "auxiliary_loss_clip": 0.01085974, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.03266597, + "balance_loss_mlp": 1.02387857, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.9106483379165913, + "language_loss": 0.69981343, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72102678, + "num_input_tokens_seen": 326259080, + "step": 15122, + "time_per_iteration": 2.7714974880218506 + }, + { + "auxiliary_loss_clip": 0.01039832, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.02802145, + "balance_loss_mlp": 1.01823676, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.4675809784573393, + "language_loss": 0.74514395, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76583159, + "num_input_tokens_seen": 326280175, + "step": 15123, + "time_per_iteration": 2.7642385959625244 + }, + { + "auxiliary_loss_clip": 0.01083841, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.03336561, + "balance_loss_mlp": 1.0164454, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 1.8718455982723081, + "language_loss": 0.7606318, + "learning_rate": 8.561483979414253e-08, + "loss": 0.7817384, + "num_input_tokens_seen": 326297990, + "step": 15124, + "time_per_iteration": 2.556518793106079 + }, + { + "auxiliary_loss_clip": 0.01077162, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.0316571, + "balance_loss_mlp": 1.01825666, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 2.0498858385850345, + "language_loss": 0.72218001, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74324548, + "num_input_tokens_seen": 326316735, + "step": 15125, + "time_per_iteration": 2.519235134124756 + }, + { + "auxiliary_loss_clip": 0.0105245, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.03224945, + "balance_loss_mlp": 1.0226388, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.5532292335235403, + "language_loss": 0.79233074, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81318748, + "num_input_tokens_seen": 326334370, + "step": 15126, + "time_per_iteration": 2.5980119705200195 + }, + { + "auxiliary_loss_clip": 0.01052417, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.03363109, + "balance_loss_mlp": 1.0234586, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.590276975385491, + "language_loss": 0.75288361, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77375239, + "num_input_tokens_seen": 326353435, + "step": 15127, + "time_per_iteration": 2.617126941680908 + }, + { + "auxiliary_loss_clip": 0.01014884, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.02834821, + "balance_loss_mlp": 1.02053142, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.7801697371965153, + "language_loss": 0.62265974, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64312595, + "num_input_tokens_seen": 326371810, + "step": 15128, + "time_per_iteration": 2.7068982124328613 + }, + { + "auxiliary_loss_clip": 0.01062356, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.03072929, + "balance_loss_mlp": 1.01611805, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.650777641819407, + "language_loss": 0.77014744, + "learning_rate": 8.505209531291013e-08, + "loss": 0.79103994, + "num_input_tokens_seen": 326391380, + "step": 15129, + "time_per_iteration": 2.61324143409729 + }, + { + "auxiliary_loss_clip": 0.01082173, + "auxiliary_loss_mlp": 0.01023898, + "balance_loss_clip": 1.0316025, + "balance_loss_mlp": 1.01338947, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 1.9444942253945114, + "language_loss": 0.83501667, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85607737, + "num_input_tokens_seen": 326408800, + "step": 15130, + "time_per_iteration": 2.5871171951293945 + }, + { + "auxiliary_loss_clip": 0.01075374, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.03281617, + "balance_loss_mlp": 1.02006269, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.758678905540186, + "language_loss": 0.75126278, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77233273, + "num_input_tokens_seen": 326431565, + "step": 15131, + "time_per_iteration": 2.7518064975738525 + }, + { + "auxiliary_loss_clip": 0.01067589, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.03126812, + "balance_loss_mlp": 1.01838756, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 2.069148493961659, + "language_loss": 0.59775496, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61873078, + "num_input_tokens_seen": 326451715, + "step": 15132, + "time_per_iteration": 2.673163652420044 + }, + { + "auxiliary_loss_clip": 0.01055847, + "auxiliary_loss_mlp": 0.01025916, + "balance_loss_clip": 1.03377104, + "balance_loss_mlp": 1.01534772, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.3946862785802476, + "language_loss": 0.82695603, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84777361, + "num_input_tokens_seen": 326470855, + "step": 15133, + "time_per_iteration": 2.571943521499634 + }, + { + "auxiliary_loss_clip": 0.01070905, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.02966321, + "balance_loss_mlp": 1.01824665, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.756764867422135, + "language_loss": 0.74378717, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76479101, + "num_input_tokens_seen": 326490480, + "step": 15134, + "time_per_iteration": 4.0905561447143555 + }, + { + "auxiliary_loss_clip": 0.01070127, + "auxiliary_loss_mlp": 0.01033176, + "balance_loss_clip": 1.03466403, + "balance_loss_mlp": 1.02145243, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.5682858777301938, + "language_loss": 0.7281549, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74918795, + "num_input_tokens_seen": 326509445, + "step": 15135, + "time_per_iteration": 2.6520228385925293 + }, + { + "auxiliary_loss_clip": 0.01086189, + "auxiliary_loss_mlp": 0.010291, + "balance_loss_clip": 1.03447604, + "balance_loss_mlp": 1.01864481, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.8989434686232476, + "language_loss": 0.69624919, + "learning_rate": 8.426730298881702e-08, + "loss": 0.71740204, + "num_input_tokens_seen": 326528380, + "step": 15136, + "time_per_iteration": 2.506992816925049 + }, + { + "auxiliary_loss_clip": 0.00982961, + "auxiliary_loss_mlp": 0.01001327, + "balance_loss_clip": 1.0047667, + "balance_loss_mlp": 1.00039089, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8207007820182659, + "language_loss": 0.59242773, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61227059, + "num_input_tokens_seen": 326576940, + "step": 15137, + "time_per_iteration": 2.859910488128662 + }, + { + "auxiliary_loss_clip": 0.01086643, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.03274357, + "balance_loss_mlp": 1.0211637, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 1.6444941331151979, + "language_loss": 0.82462382, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84580564, + "num_input_tokens_seen": 326596100, + "step": 15138, + "time_per_iteration": 2.5043160915374756 + }, + { + "auxiliary_loss_clip": 0.01083176, + "auxiliary_loss_mlp": 0.01022846, + "balance_loss_clip": 1.03355408, + "balance_loss_mlp": 1.01299334, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.461180905519566, + "language_loss": 0.81380856, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83486873, + "num_input_tokens_seen": 326615700, + "step": 15139, + "time_per_iteration": 2.559455156326294 + }, + { + "auxiliary_loss_clip": 0.01065752, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.03367877, + "balance_loss_mlp": 1.02257597, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.8529438473134996, + "language_loss": 0.77339518, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79438436, + "num_input_tokens_seen": 326635905, + "step": 15140, + "time_per_iteration": 2.5733540058135986 + }, + { + "auxiliary_loss_clip": 0.010952, + "auxiliary_loss_mlp": 0.01027686, + "balance_loss_clip": 1.03219819, + "balance_loss_mlp": 1.01686192, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.5890311363504555, + "language_loss": 0.66560042, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68682933, + "num_input_tokens_seen": 326661855, + "step": 15141, + "time_per_iteration": 2.7483487129211426 + }, + { + "auxiliary_loss_clip": 0.01085429, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03200626, + "balance_loss_mlp": 1.02147055, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.66419102652291, + "language_loss": 0.74783868, + "learning_rate": 8.359745694462005e-08, + "loss": 0.76901716, + "num_input_tokens_seen": 326679320, + "step": 15142, + "time_per_iteration": 2.526674270629883 + }, + { + "auxiliary_loss_clip": 0.0105446, + "auxiliary_loss_mlp": 0.01033157, + "balance_loss_clip": 1.02824163, + "balance_loss_mlp": 1.02189755, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.8899896877514182, + "language_loss": 0.64066243, + "learning_rate": 8.348607025820076e-08, + "loss": 0.6615386, + "num_input_tokens_seen": 326698110, + "step": 15143, + "time_per_iteration": 4.035478115081787 + }, + { + "auxiliary_loss_clip": 0.01097993, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.03249311, + "balance_loss_mlp": 1.02134848, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 2.4377299108473607, + "language_loss": 0.60836756, + "learning_rate": 8.337475624618152e-08, + "loss": 0.62967396, + "num_input_tokens_seen": 326718370, + "step": 15144, + "time_per_iteration": 2.5708439350128174 + }, + { + "auxiliary_loss_clip": 0.01049771, + "auxiliary_loss_mlp": 0.01025858, + "balance_loss_clip": 1.02740586, + "balance_loss_mlp": 1.01498592, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.7237014267885415, + "language_loss": 0.711025, + "learning_rate": 8.326351491278382e-08, + "loss": 0.7317813, + "num_input_tokens_seen": 326738445, + "step": 15145, + "time_per_iteration": 2.637648582458496 + }, + { + "auxiliary_loss_clip": 0.01033755, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.02861118, + "balance_loss_mlp": 1.02151716, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.581017880526226, + "language_loss": 0.70244718, + "learning_rate": 8.315234626222545e-08, + "loss": 0.7231046, + "num_input_tokens_seen": 326758855, + "step": 15146, + "time_per_iteration": 2.7277073860168457 + }, + { + "auxiliary_loss_clip": 0.01075758, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.03230977, + "balance_loss_mlp": 1.01817036, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 1.96327289611493, + "language_loss": 0.73073053, + "learning_rate": 8.304125029872233e-08, + "loss": 0.75177407, + "num_input_tokens_seen": 326777140, + "step": 15147, + "time_per_iteration": 2.620788097381592 + }, + { + "auxiliary_loss_clip": 0.01068797, + "auxiliary_loss_mlp": 0.01028287, + "balance_loss_clip": 1.03279996, + "balance_loss_mlp": 1.01755786, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 1.9143281147649762, + "language_loss": 0.80263031, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82360113, + "num_input_tokens_seen": 326794070, + "step": 15148, + "time_per_iteration": 4.102066993713379 + }, + { + "auxiliary_loss_clip": 0.01063713, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.03098416, + "balance_loss_mlp": 1.02357078, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 1.98259537341862, + "language_loss": 0.67702955, + "learning_rate": 8.281927644972996e-08, + "loss": 0.69801176, + "num_input_tokens_seen": 326814695, + "step": 15149, + "time_per_iteration": 2.6869277954101562 + }, + { + "auxiliary_loss_clip": 0.01096948, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.03426349, + "balance_loss_mlp": 1.01866126, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 2.275536176032145, + "language_loss": 0.62982249, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65109152, + "num_input_tokens_seen": 326835295, + "step": 15150, + "time_per_iteration": 2.553598165512085 + }, + { + "auxiliary_loss_clip": 0.01053941, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.0317452, + "balance_loss_mlp": 1.02048039, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 1.804062244908756, + "language_loss": 0.72586197, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74671519, + "num_input_tokens_seen": 326853350, + "step": 15151, + "time_per_iteration": 2.6146557331085205 + }, + { + "auxiliary_loss_clip": 0.01083648, + "auxiliary_loss_mlp": 0.01025633, + "balance_loss_clip": 1.03222537, + "balance_loss_mlp": 1.01532125, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.5857218152081511, + "language_loss": 0.64181489, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66290766, + "num_input_tokens_seen": 326873425, + "step": 15152, + "time_per_iteration": 2.618295192718506 + }, + { + "auxiliary_loss_clip": 0.01075314, + "auxiliary_loss_mlp": 0.00749315, + "balance_loss_clip": 1.03280854, + "balance_loss_mlp": 1.00021553, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 1.7605257373878704, + "language_loss": 0.73315626, + "learning_rate": 8.23762011815834e-08, + "loss": 0.7514025, + "num_input_tokens_seen": 326893455, + "step": 15153, + "time_per_iteration": 2.648064374923706 + }, + { + "auxiliary_loss_clip": 0.01048161, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.02810621, + "balance_loss_mlp": 1.02493048, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 1.9810844407472854, + "language_loss": 0.72203875, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74289739, + "num_input_tokens_seen": 326910210, + "step": 15154, + "time_per_iteration": 2.5524728298187256 + }, + { + "auxiliary_loss_clip": 0.01074342, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.03434408, + "balance_loss_mlp": 1.02077103, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 1.7594574487841959, + "language_loss": 0.82208371, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84314293, + "num_input_tokens_seen": 326929350, + "step": 15155, + "time_per_iteration": 2.6201722621917725 + }, + { + "auxiliary_loss_clip": 0.01085193, + "auxiliary_loss_mlp": 0.0102568, + "balance_loss_clip": 1.03487706, + "balance_loss_mlp": 1.01470101, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.4296261268866686, + "language_loss": 0.59284616, + "learning_rate": 8.204465823887252e-08, + "loss": 0.6139549, + "num_input_tokens_seen": 326949060, + "step": 15156, + "time_per_iteration": 2.5859248638153076 + }, + { + "auxiliary_loss_clip": 0.01087541, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.03193247, + "balance_loss_mlp": 1.01815152, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 1.6905715081100985, + "language_loss": 0.73806918, + "learning_rate": 8.193428937716796e-08, + "loss": 0.7592454, + "num_input_tokens_seen": 326968950, + "step": 15157, + "time_per_iteration": 2.646982192993164 + }, + { + "auxiliary_loss_clip": 0.01047576, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.02801192, + "balance_loss_mlp": 1.02150357, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.882593981186925, + "language_loss": 0.59543651, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61622953, + "num_input_tokens_seen": 326989455, + "step": 15158, + "time_per_iteration": 4.231480836868286 + }, + { + "auxiliary_loss_clip": 0.01040801, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.0325191, + "balance_loss_mlp": 1.02639818, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.5602373248079577, + "language_loss": 0.67561817, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69639629, + "num_input_tokens_seen": 327009640, + "step": 15159, + "time_per_iteration": 2.6838674545288086 + }, + { + "auxiliary_loss_clip": 0.01076083, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.03226089, + "balance_loss_mlp": 1.0145582, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 2.0073238363366626, + "language_loss": 0.78365576, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80466944, + "num_input_tokens_seen": 327027690, + "step": 15160, + "time_per_iteration": 2.658073663711548 + }, + { + "auxiliary_loss_clip": 0.01099441, + "auxiliary_loss_mlp": 0.01027226, + "balance_loss_clip": 1.03546119, + "balance_loss_mlp": 1.01522183, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 2.3194495259886954, + "language_loss": 0.68860799, + "learning_rate": 8.149354130460073e-08, + "loss": 0.70987463, + "num_input_tokens_seen": 327045915, + "step": 15161, + "time_per_iteration": 2.48228120803833 + }, + { + "auxiliary_loss_clip": 0.01048019, + "auxiliary_loss_mlp": 0.01036354, + "balance_loss_clip": 1.03065085, + "balance_loss_mlp": 1.02349722, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.7055441586865152, + "language_loss": 0.76469874, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78554243, + "num_input_tokens_seen": 327066355, + "step": 15162, + "time_per_iteration": 2.685932159423828 + }, + { + "auxiliary_loss_clip": 0.0107119, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.03493643, + "balance_loss_mlp": 1.02286267, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 3.6788728048240427, + "language_loss": 0.666888, + "learning_rate": 8.127360375135395e-08, + "loss": 0.68793738, + "num_input_tokens_seen": 327086735, + "step": 15163, + "time_per_iteration": 2.5945730209350586 + }, + { + "auxiliary_loss_clip": 0.0105252, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.03102624, + "balance_loss_mlp": 1.02041602, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.1315640586510156, + "language_loss": 0.70699424, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72784066, + "num_input_tokens_seen": 327104035, + "step": 15164, + "time_per_iteration": 2.6133830547332764 + }, + { + "auxiliary_loss_clip": 0.01097346, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.03632438, + "balance_loss_mlp": 1.0191617, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.4572557445474033, + "language_loss": 0.76135713, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78262764, + "num_input_tokens_seen": 327124370, + "step": 15165, + "time_per_iteration": 2.4817347526550293 + }, + { + "auxiliary_loss_clip": 0.01079683, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.03262639, + "balance_loss_mlp": 1.02289081, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.0762056399588946, + "language_loss": 0.72042239, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74156415, + "num_input_tokens_seen": 327140915, + "step": 15166, + "time_per_iteration": 2.549596071243286 + }, + { + "auxiliary_loss_clip": 0.0104607, + "auxiliary_loss_mlp": 0.01039343, + "balance_loss_clip": 1.02997017, + "balance_loss_mlp": 1.02777362, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 2.0766062293464045, + "language_loss": 0.73367202, + "learning_rate": 8.083460177773482e-08, + "loss": 0.7545262, + "num_input_tokens_seen": 327158940, + "step": 15167, + "time_per_iteration": 2.654114246368408 + }, + { + "auxiliary_loss_clip": 0.01006842, + "auxiliary_loss_mlp": 0.01001713, + "balance_loss_clip": 1.00609803, + "balance_loss_mlp": 1.00083661, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7699292623922684, + "language_loss": 0.65559578, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67568135, + "num_input_tokens_seen": 327217450, + "step": 15168, + "time_per_iteration": 3.119664192199707 + }, + { + "auxiliary_loss_clip": 0.01067256, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03182077, + "balance_loss_mlp": 1.01817513, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 2.0403933282745954, + "language_loss": 0.7810533, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80201221, + "num_input_tokens_seen": 327233905, + "step": 15169, + "time_per_iteration": 2.5264265537261963 + }, + { + "auxiliary_loss_clip": 0.01085312, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.03277349, + "balance_loss_mlp": 1.01938939, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 2.1149710112508924, + "language_loss": 0.82236445, + "learning_rate": 8.05061144198591e-08, + "loss": 0.8435207, + "num_input_tokens_seen": 327252430, + "step": 15170, + "time_per_iteration": 2.5383896827697754 + }, + { + "auxiliary_loss_clip": 0.01091064, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.03653836, + "balance_loss_mlp": 1.01794255, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 1.9921090882694241, + "language_loss": 0.77190232, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79310787, + "num_input_tokens_seen": 327269215, + "step": 15171, + "time_per_iteration": 2.631649971008301 + }, + { + "auxiliary_loss_clip": 0.01012162, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.02749681, + "balance_loss_mlp": 1.02093482, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.3506326905394843, + "language_loss": 0.66923928, + "learning_rate": 8.02874867780241e-08, + "loss": 0.68968964, + "num_input_tokens_seen": 327290320, + "step": 15172, + "time_per_iteration": 2.8620452880859375 + }, + { + "auxiliary_loss_clip": 0.01068145, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.03387368, + "balance_loss_mlp": 1.01913047, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.7653868624318634, + "language_loss": 0.74699634, + "learning_rate": 8.017828214857103e-08, + "loss": 0.76798773, + "num_input_tokens_seen": 327310150, + "step": 15173, + "time_per_iteration": 2.994220018386841 + }, + { + "auxiliary_loss_clip": 0.01081708, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.03494108, + "balance_loss_mlp": 1.01613939, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.6355709196674253, + "language_loss": 0.66406184, + "learning_rate": 8.00691503189499e-08, + "loss": 0.68516743, + "num_input_tokens_seen": 327326660, + "step": 15174, + "time_per_iteration": 4.060798645019531 + }, + { + "auxiliary_loss_clip": 0.01081717, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.03202367, + "balance_loss_mlp": 1.01874316, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.992605950552312, + "language_loss": 0.74546498, + "learning_rate": 7.996009129329894e-08, + "loss": 0.76658869, + "num_input_tokens_seen": 327346700, + "step": 15175, + "time_per_iteration": 2.5771281719207764 + }, + { + "auxiliary_loss_clip": 0.01012311, + "auxiliary_loss_mlp": 0.01004125, + "balance_loss_clip": 1.00258398, + "balance_loss_mlp": 1.00323093, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9701762763308474, + "language_loss": 0.58420688, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60437125, + "num_input_tokens_seen": 327403050, + "step": 15176, + "time_per_iteration": 3.183666229248047 + }, + { + "auxiliary_loss_clip": 0.01067722, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.03054965, + "balance_loss_mlp": 1.02128983, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.8016917358129074, + "language_loss": 0.65682149, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67782605, + "num_input_tokens_seen": 327422225, + "step": 15177, + "time_per_iteration": 2.704977035522461 + }, + { + "auxiliary_loss_clip": 0.01063928, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.0314095, + "balance_loss_mlp": 1.01778173, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 2.1535253979602715, + "language_loss": 0.8126055, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83352995, + "num_input_tokens_seen": 327437025, + "step": 15178, + "time_per_iteration": 2.5741171836853027 + }, + { + "auxiliary_loss_clip": 0.0103241, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.02747846, + "balance_loss_mlp": 1.02236724, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 1.8949517287156692, + "language_loss": 0.78709781, + "learning_rate": 7.952458331306711e-08, + "loss": 0.8077693, + "num_input_tokens_seen": 327453915, + "step": 15179, + "time_per_iteration": 2.7179815769195557 + }, + { + "auxiliary_loss_clip": 0.01072832, + "auxiliary_loss_mlp": 0.01032615, + "balance_loss_clip": 1.0318619, + "balance_loss_mlp": 1.02241015, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 1.5804899057857722, + "language_loss": 0.68240398, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70345843, + "num_input_tokens_seen": 327474415, + "step": 15180, + "time_per_iteration": 2.7215633392333984 + }, + { + "auxiliary_loss_clip": 0.01083302, + "auxiliary_loss_mlp": 0.01025857, + "balance_loss_clip": 1.03082907, + "balance_loss_mlp": 1.01593304, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.7809418966517179, + "language_loss": 0.75094396, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77203554, + "num_input_tokens_seen": 327492750, + "step": 15181, + "time_per_iteration": 2.552888870239258 + }, + { + "auxiliary_loss_clip": 0.0110215, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.03607535, + "balance_loss_mlp": 1.01710939, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 1.6795724750518748, + "language_loss": 0.74771923, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76902139, + "num_input_tokens_seen": 327509470, + "step": 15182, + "time_per_iteration": 2.5533502101898193 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.03370857, + "balance_loss_mlp": 1.01783681, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 1.6029252135591932, + "language_loss": 0.76361632, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78489995, + "num_input_tokens_seen": 327530520, + "step": 15183, + "time_per_iteration": 2.601451873779297 + }, + { + "auxiliary_loss_clip": 0.01087319, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.03529489, + "balance_loss_mlp": 1.01841521, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 3.3292641890136205, + "language_loss": 0.7673645, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78852975, + "num_input_tokens_seen": 327546960, + "step": 15184, + "time_per_iteration": 4.217064142227173 + }, + { + "auxiliary_loss_clip": 0.01081222, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.03455305, + "balance_loss_mlp": 1.02209425, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 1.7176477516158875, + "language_loss": 0.74497283, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76611137, + "num_input_tokens_seen": 327564830, + "step": 15185, + "time_per_iteration": 2.521649122238159 + }, + { + "auxiliary_loss_clip": 0.01072071, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.03225207, + "balance_loss_mlp": 1.02091861, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.3205984435766984, + "language_loss": 0.68768364, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70871991, + "num_input_tokens_seen": 327583675, + "step": 15186, + "time_per_iteration": 2.604092597961426 + }, + { + "auxiliary_loss_clip": 0.01076536, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.0334959, + "balance_loss_mlp": 1.02330494, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 1.958377180184199, + "language_loss": 0.78073221, + "learning_rate": 7.865706319773502e-08, + "loss": 0.80187088, + "num_input_tokens_seen": 327602280, + "step": 15187, + "time_per_iteration": 2.6034843921661377 + }, + { + "auxiliary_loss_clip": 0.01097625, + "auxiliary_loss_mlp": 0.00749411, + "balance_loss_clip": 1.03374422, + "balance_loss_mlp": 1.00026274, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 2.0942319012839907, + "language_loss": 0.6625216, + "learning_rate": 7.854895099902515e-08, + "loss": 0.68099189, + "num_input_tokens_seen": 327623515, + "step": 15188, + "time_per_iteration": 4.109795808792114 + }, + { + "auxiliary_loss_clip": 0.01018739, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.02627873, + "balance_loss_mlp": 1.01972389, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 1.650439063158774, + "language_loss": 0.76477003, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78527164, + "num_input_tokens_seen": 327642875, + "step": 15189, + "time_per_iteration": 2.822310209274292 + }, + { + "auxiliary_loss_clip": 0.01084689, + "auxiliary_loss_mlp": 0.0102601, + "balance_loss_clip": 1.03196096, + "balance_loss_mlp": 1.01625896, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 1.6092105822849336, + "language_loss": 0.75176132, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77286828, + "num_input_tokens_seen": 327662450, + "step": 15190, + "time_per_iteration": 3.018832206726074 + }, + { + "auxiliary_loss_clip": 0.01013276, + "auxiliary_loss_mlp": 0.01002201, + "balance_loss_clip": 1.00335479, + "balance_loss_mlp": 1.00125313, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.7838463789363079, + "language_loss": 0.57318765, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59334236, + "num_input_tokens_seen": 327723845, + "step": 15191, + "time_per_iteration": 3.1793112754821777 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01032102, + "balance_loss_clip": 1.03456712, + "balance_loss_mlp": 1.02049708, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 1.7410342918915875, + "language_loss": 0.74041921, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76173198, + "num_input_tokens_seen": 327742590, + "step": 15192, + "time_per_iteration": 2.541569232940674 + }, + { + "auxiliary_loss_clip": 0.01083644, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.03348482, + "balance_loss_mlp": 1.0147866, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.5734372530174903, + "language_loss": 0.69116902, + "learning_rate": 7.800948301161647e-08, + "loss": 0.7122618, + "num_input_tokens_seen": 327764350, + "step": 15193, + "time_per_iteration": 2.703061580657959 + }, + { + "auxiliary_loss_clip": 0.01083754, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.0330255, + "balance_loss_mlp": 1.02521932, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.5811869978869897, + "language_loss": 0.73171896, + "learning_rate": 7.790180804400215e-08, + "loss": 0.75291562, + "num_input_tokens_seen": 327783120, + "step": 15194, + "time_per_iteration": 2.5400760173797607 + }, + { + "auxiliary_loss_clip": 0.01049725, + "auxiliary_loss_mlp": 0.01037308, + "balance_loss_clip": 1.02972138, + "balance_loss_mlp": 1.02292562, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 2.0518173389848284, + "language_loss": 0.6185897, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63946003, + "num_input_tokens_seen": 327801960, + "step": 15195, + "time_per_iteration": 2.6639599800109863 + }, + { + "auxiliary_loss_clip": 0.01087609, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.03354454, + "balance_loss_mlp": 1.0207237, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.4876175382917942, + "language_loss": 0.71545815, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73665142, + "num_input_tokens_seen": 327823795, + "step": 15196, + "time_per_iteration": 2.6287918090820312 + }, + { + "auxiliary_loss_clip": 0.01073896, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.03220689, + "balance_loss_mlp": 1.0210073, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.688816394202406, + "language_loss": 0.71308297, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73413897, + "num_input_tokens_seen": 327845175, + "step": 15197, + "time_per_iteration": 2.6495912075042725 + }, + { + "auxiliary_loss_clip": 0.01074661, + "auxiliary_loss_mlp": 0.0102756, + "balance_loss_clip": 1.03012776, + "balance_loss_mlp": 1.01618695, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.7635217647748018, + "language_loss": 0.77719134, + "learning_rate": 7.747183707589489e-08, + "loss": 0.79821348, + "num_input_tokens_seen": 327863150, + "step": 15198, + "time_per_iteration": 4.151344537734985 + }, + { + "auxiliary_loss_clip": 0.01079734, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.03223598, + "balance_loss_mlp": 1.01806521, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.3620906059618814, + "language_loss": 0.68139267, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70247924, + "num_input_tokens_seen": 327883445, + "step": 15199, + "time_per_iteration": 2.6050078868865967 + }, + { + "auxiliary_loss_clip": 0.01085795, + "auxiliary_loss_mlp": 0.00749299, + "balance_loss_clip": 1.03250849, + "balance_loss_mlp": 1.00029373, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.5165611460844861, + "language_loss": 0.67760241, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69595337, + "num_input_tokens_seen": 327905745, + "step": 15200, + "time_per_iteration": 2.603478193283081 + }, + { + "auxiliary_loss_clip": 0.01085354, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.03497839, + "balance_loss_mlp": 1.02046943, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.6421696184713104, + "language_loss": 0.71241122, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73356771, + "num_input_tokens_seen": 327925435, + "step": 15201, + "time_per_iteration": 2.54819655418396 + }, + { + "auxiliary_loss_clip": 0.01083877, + "auxiliary_loss_mlp": 0.01026558, + "balance_loss_clip": 1.03163552, + "balance_loss_mlp": 1.01686621, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.8537994983912702, + "language_loss": 0.70734036, + "learning_rate": 7.704303254710165e-08, + "loss": 0.7284447, + "num_input_tokens_seen": 327944145, + "step": 15202, + "time_per_iteration": 2.511566638946533 + }, + { + "auxiliary_loss_clip": 0.0109582, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.03300071, + "balance_loss_mlp": 1.02064502, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.8987730679867987, + "language_loss": 0.66563004, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68690819, + "num_input_tokens_seen": 327960565, + "step": 15203, + "time_per_iteration": 2.548105478286743 + }, + { + "auxiliary_loss_clip": 0.01084916, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.03268957, + "balance_loss_mlp": 1.01888943, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.7534441189443812, + "language_loss": 0.68618155, + "learning_rate": 7.682906777877751e-08, + "loss": 0.70733845, + "num_input_tokens_seen": 327981180, + "step": 15204, + "time_per_iteration": 2.5584723949432373 + }, + { + "auxiliary_loss_clip": 0.01084212, + "auxiliary_loss_mlp": 0.01024474, + "balance_loss_clip": 1.02987051, + "balance_loss_mlp": 1.01301169, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 2.002745070784311, + "language_loss": 0.60034668, + "learning_rate": 7.672219478283915e-08, + "loss": 0.62143356, + "num_input_tokens_seen": 328001500, + "step": 15205, + "time_per_iteration": 2.573465585708618 + }, + { + "auxiliary_loss_clip": 0.01053034, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.03056204, + "balance_loss_mlp": 1.02102995, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.671435496542852, + "language_loss": 0.81328458, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83413327, + "num_input_tokens_seen": 328023025, + "step": 15206, + "time_per_iteration": 2.6695592403411865 + }, + { + "auxiliary_loss_clip": 0.01048721, + "auxiliary_loss_mlp": 0.01024579, + "balance_loss_clip": 1.02843893, + "balance_loss_mlp": 1.01342082, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.23735073458153, + "language_loss": 0.73772454, + "learning_rate": 7.650866758767382e-08, + "loss": 0.75845754, + "num_input_tokens_seen": 328041410, + "step": 15207, + "time_per_iteration": 2.6848998069763184 + }, + { + "auxiliary_loss_clip": 0.01054969, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.03461349, + "balance_loss_mlp": 1.02321053, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 3.583749683799521, + "language_loss": 0.73056084, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75145662, + "num_input_tokens_seen": 328060495, + "step": 15208, + "time_per_iteration": 2.6752331256866455 + }, + { + "auxiliary_loss_clip": 0.01077098, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.03318429, + "balance_loss_mlp": 1.01982188, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.157614672713441, + "language_loss": 0.8607114, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88178289, + "num_input_tokens_seen": 328076905, + "step": 15209, + "time_per_iteration": 2.5793445110321045 + }, + { + "auxiliary_loss_clip": 0.01080014, + "auxiliary_loss_mlp": 0.01030975, + "balance_loss_clip": 1.03715277, + "balance_loss_mlp": 1.02086592, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 8.440628287424257, + "language_loss": 0.7497893, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77089918, + "num_input_tokens_seen": 328096960, + "step": 15210, + "time_per_iteration": 2.6081881523132324 + }, + { + "auxiliary_loss_clip": 0.0107066, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.02835429, + "balance_loss_mlp": 1.01785231, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 1.8660725493239823, + "language_loss": 0.78080708, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80180085, + "num_input_tokens_seen": 328115445, + "step": 15211, + "time_per_iteration": 2.5925798416137695 + }, + { + "auxiliary_loss_clip": 0.01088042, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.03383183, + "balance_loss_mlp": 1.01962304, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.7257433055592277, + "language_loss": 0.82580453, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84698462, + "num_input_tokens_seen": 328133965, + "step": 15212, + "time_per_iteration": 2.5935821533203125 + }, + { + "auxiliary_loss_clip": 0.01083582, + "auxiliary_loss_mlp": 0.01024997, + "balance_loss_clip": 1.03272557, + "balance_loss_mlp": 1.01494157, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.8547207314405765, + "language_loss": 0.84127128, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86235702, + "num_input_tokens_seen": 328151520, + "step": 15213, + "time_per_iteration": 2.5406417846679688 + }, + { + "auxiliary_loss_clip": 0.01079537, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.03341842, + "balance_loss_mlp": 1.02106929, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 2.094626370362004, + "language_loss": 0.70829928, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72941047, + "num_input_tokens_seen": 328171275, + "step": 15214, + "time_per_iteration": 4.05748724937439 + }, + { + "auxiliary_loss_clip": 0.01089726, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.03440595, + "balance_loss_mlp": 1.02224278, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.6667453791946703, + "language_loss": 0.62790889, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64914405, + "num_input_tokens_seen": 328192115, + "step": 15215, + "time_per_iteration": 2.557947874069214 + }, + { + "auxiliary_loss_clip": 0.01072531, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.03743839, + "balance_loss_mlp": 1.0189662, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.5119381230949815, + "language_loss": 0.76028395, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78131199, + "num_input_tokens_seen": 328208990, + "step": 15216, + "time_per_iteration": 2.6262569427490234 + }, + { + "auxiliary_loss_clip": 0.01069324, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.03246093, + "balance_loss_mlp": 1.02447867, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 2.251861119755561, + "language_loss": 0.6805436, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70161659, + "num_input_tokens_seen": 328227840, + "step": 15217, + "time_per_iteration": 2.6520674228668213 + }, + { + "auxiliary_loss_clip": 0.01085374, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.03397965, + "balance_loss_mlp": 1.02067077, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.839470314058795, + "language_loss": 0.80020118, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82136631, + "num_input_tokens_seen": 328246250, + "step": 15218, + "time_per_iteration": 2.5572307109832764 + }, + { + "auxiliary_loss_clip": 0.00995924, + "auxiliary_loss_mlp": 0.01005091, + "balance_loss_clip": 1.00787544, + "balance_loss_mlp": 1.00419736, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.846853539912762, + "language_loss": 0.5922122, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61222231, + "num_input_tokens_seen": 328303625, + "step": 15219, + "time_per_iteration": 3.183310031890869 + }, + { + "auxiliary_loss_clip": 0.01083971, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.03255224, + "balance_loss_mlp": 1.02102661, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 2.282281037427553, + "language_loss": 0.78540647, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80656505, + "num_input_tokens_seen": 328322135, + "step": 15220, + "time_per_iteration": 2.5362935066223145 + }, + { + "auxiliary_loss_clip": 0.01044092, + "auxiliary_loss_mlp": 0.01037549, + "balance_loss_clip": 1.02987826, + "balance_loss_mlp": 1.02379787, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 2.0457224224096504, + "language_loss": 0.65993631, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68075275, + "num_input_tokens_seen": 328340750, + "step": 15221, + "time_per_iteration": 2.678621768951416 + }, + { + "auxiliary_loss_clip": 0.01075771, + "auxiliary_loss_mlp": 0.01027278, + "balance_loss_clip": 1.03256178, + "balance_loss_mlp": 1.01706219, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 1.736317702818654, + "language_loss": 0.84434545, + "learning_rate": 7.491651557384692e-08, + "loss": 0.865376, + "num_input_tokens_seen": 328359995, + "step": 15222, + "time_per_iteration": 2.5442967414855957 + }, + { + "auxiliary_loss_clip": 0.01007591, + "auxiliary_loss_mlp": 0.01003285, + "balance_loss_clip": 1.00719535, + "balance_loss_mlp": 1.00229585, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.735712015553495, + "language_loss": 0.4962225, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51633126, + "num_input_tokens_seen": 328426865, + "step": 15223, + "time_per_iteration": 4.631850242614746 + }, + { + "auxiliary_loss_clip": 0.01066214, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.0329473, + "balance_loss_mlp": 1.02420235, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 2.7655759075403594, + "language_loss": 0.72096026, + "learning_rate": 7.470546933201349e-08, + "loss": 0.74197829, + "num_input_tokens_seen": 328445970, + "step": 15224, + "time_per_iteration": 2.6029951572418213 + }, + { + "auxiliary_loss_clip": 0.01082153, + "auxiliary_loss_mlp": 0.01024703, + "balance_loss_clip": 1.03262973, + "balance_loss_mlp": 1.01385534, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 3.794611557496547, + "language_loss": 0.81262887, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83369732, + "num_input_tokens_seen": 328464585, + "step": 15225, + "time_per_iteration": 2.600559949874878 + }, + { + "auxiliary_loss_clip": 0.01094722, + "auxiliary_loss_mlp": 0.01023291, + "balance_loss_clip": 1.03174126, + "balance_loss_mlp": 1.01318169, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.351764657159002, + "language_loss": 0.71183211, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73301232, + "num_input_tokens_seen": 328490155, + "step": 15226, + "time_per_iteration": 2.618016004562378 + }, + { + "auxiliary_loss_clip": 0.01025238, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.03075826, + "balance_loss_mlp": 1.0168252, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 1.9295084585518687, + "language_loss": 0.74487156, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76540697, + "num_input_tokens_seen": 328508275, + "step": 15227, + "time_per_iteration": 2.7803962230682373 + }, + { + "auxiliary_loss_clip": 0.01072419, + "auxiliary_loss_mlp": 0.01027501, + "balance_loss_clip": 1.03210616, + "balance_loss_mlp": 1.01701689, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.7950312328990548, + "language_loss": 0.73951417, + "learning_rate": 7.428425296864404e-08, + "loss": 0.76051337, + "num_input_tokens_seen": 328529425, + "step": 15228, + "time_per_iteration": 4.455070972442627 + }, + { + "auxiliary_loss_clip": 0.01057866, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.02964306, + "balance_loss_mlp": 1.0193125, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.7441251131750866, + "language_loss": 0.71760499, + "learning_rate": 7.417913142616106e-08, + "loss": 0.7384814, + "num_input_tokens_seen": 328550200, + "step": 15229, + "time_per_iteration": 2.788205623626709 + }, + { + "auxiliary_loss_clip": 0.01099292, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.03515875, + "balance_loss_mlp": 1.01907158, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 2.663814006181187, + "language_loss": 0.83022857, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85152936, + "num_input_tokens_seen": 328568540, + "step": 15230, + "time_per_iteration": 2.497978448867798 + }, + { + "auxiliary_loss_clip": 0.01051505, + "auxiliary_loss_mlp": 0.0102663, + "balance_loss_clip": 1.03118277, + "balance_loss_mlp": 1.01619935, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.6854003565563163, + "language_loss": 0.83533323, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85611463, + "num_input_tokens_seen": 328587300, + "step": 15231, + "time_per_iteration": 2.770545244216919 + }, + { + "auxiliary_loss_clip": 0.01077293, + "auxiliary_loss_mlp": 0.01023547, + "balance_loss_clip": 1.02806568, + "balance_loss_mlp": 1.01306295, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.4346214580587615, + "language_loss": 0.72405732, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74506569, + "num_input_tokens_seen": 328610055, + "step": 15232, + "time_per_iteration": 2.6305675506591797 + }, + { + "auxiliary_loss_clip": 0.01097739, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.03290141, + "balance_loss_mlp": 1.02012205, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 2.2169789184525697, + "language_loss": 0.67630589, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69759446, + "num_input_tokens_seen": 328626815, + "step": 15233, + "time_per_iteration": 2.5450172424316406 + }, + { + "auxiliary_loss_clip": 0.01072984, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.03420854, + "balance_loss_mlp": 1.01990581, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 1.8696172296621676, + "language_loss": 0.69808221, + "learning_rate": 7.365461920317861e-08, + "loss": 0.71912742, + "num_input_tokens_seen": 328643995, + "step": 15234, + "time_per_iteration": 2.5950143337249756 + }, + { + "auxiliary_loss_clip": 0.01075369, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.03370774, + "balance_loss_mlp": 1.02134955, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 2.1756555807095523, + "language_loss": 0.8803553, + "learning_rate": 7.354993588431391e-08, + "loss": 0.9014312, + "num_input_tokens_seen": 328659565, + "step": 15235, + "time_per_iteration": 2.6719319820404053 + }, + { + "auxiliary_loss_clip": 0.01024945, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.02927434, + "balance_loss_mlp": 1.02127302, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.6453573027337935, + "language_loss": 0.76824319, + "learning_rate": 7.344532561662853e-08, + "loss": 0.78883219, + "num_input_tokens_seen": 328679045, + "step": 15236, + "time_per_iteration": 2.718721628189087 + }, + { + "auxiliary_loss_clip": 0.009805, + "auxiliary_loss_mlp": 0.01000435, + "balance_loss_clip": 1.00948691, + "balance_loss_mlp": 0.9994095, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6750322321548214, + "language_loss": 0.622715, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64252436, + "num_input_tokens_seen": 328744565, + "step": 15237, + "time_per_iteration": 4.710938215255737 + }, + { + "auxiliary_loss_clip": 0.01099258, + "auxiliary_loss_mlp": 0.00749227, + "balance_loss_clip": 1.03381312, + "balance_loss_mlp": 1.00021267, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 1.9426408050462494, + "language_loss": 0.74846625, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76695108, + "num_input_tokens_seen": 328762455, + "step": 15238, + "time_per_iteration": 2.676071882247925 + }, + { + "auxiliary_loss_clip": 0.01097829, + "auxiliary_loss_mlp": 0.01025494, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.0149858, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.693733909715443, + "language_loss": 0.74984598, + "learning_rate": 7.313193316030464e-08, + "loss": 0.77107918, + "num_input_tokens_seen": 328780320, + "step": 15239, + "time_per_iteration": 2.491539239883423 + }, + { + "auxiliary_loss_clip": 0.01065182, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.03151023, + "balance_loss_mlp": 1.02021456, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 1.9041751278546943, + "language_loss": 0.63035381, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65131563, + "num_input_tokens_seen": 328797570, + "step": 15240, + "time_per_iteration": 2.635305881500244 + }, + { + "auxiliary_loss_clip": 0.01072715, + "auxiliary_loss_mlp": 0.00749221, + "balance_loss_clip": 1.03257585, + "balance_loss_mlp": 1.00025022, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.7895639531504492, + "language_loss": 0.76410341, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78232288, + "num_input_tokens_seen": 328814075, + "step": 15241, + "time_per_iteration": 2.623871088027954 + }, + { + "auxiliary_loss_clip": 0.01094601, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.03525591, + "balance_loss_mlp": 1.01621485, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.2597838347050367, + "language_loss": 0.68173218, + "learning_rate": 7.281919830723549e-08, + "loss": 0.70296478, + "num_input_tokens_seen": 328831990, + "step": 15242, + "time_per_iteration": 2.4737374782562256 + }, + { + "auxiliary_loss_clip": 0.0108399, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.03148556, + "balance_loss_mlp": 1.01957738, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.8278945106599371, + "language_loss": 0.80563867, + "learning_rate": 7.271509950872334e-08, + "loss": 0.82678843, + "num_input_tokens_seen": 328849105, + "step": 15243, + "time_per_iteration": 2.4489898681640625 + }, + { + "auxiliary_loss_clip": 0.01068447, + "auxiliary_loss_mlp": 0.01026286, + "balance_loss_clip": 1.02803278, + "balance_loss_mlp": 1.01514542, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 2.2188236503912004, + "language_loss": 0.82102346, + "learning_rate": 7.261107379304721e-08, + "loss": 0.8419708, + "num_input_tokens_seen": 328866810, + "step": 15244, + "time_per_iteration": 2.5840647220611572 + }, + { + "auxiliary_loss_clip": 0.0110074, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.03424382, + "balance_loss_mlp": 1.02259254, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 2.2083549589215505, + "language_loss": 0.72703671, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74838626, + "num_input_tokens_seen": 328885325, + "step": 15245, + "time_per_iteration": 2.4700708389282227 + }, + { + "auxiliary_loss_clip": 0.01074157, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.03232682, + "balance_loss_mlp": 1.01935685, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.6707879687137743, + "language_loss": 0.7494449, + "learning_rate": 7.240324162598033e-08, + "loss": 0.77048111, + "num_input_tokens_seen": 328902655, + "step": 15246, + "time_per_iteration": 2.5676565170288086 + }, + { + "auxiliary_loss_clip": 0.01066096, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.03038871, + "balance_loss_mlp": 1.01834667, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 1.9931145697251158, + "language_loss": 0.75765473, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77861488, + "num_input_tokens_seen": 328918440, + "step": 15247, + "time_per_iteration": 2.571537971496582 + }, + { + "auxiliary_loss_clip": 0.01090822, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_clip": 1.03636432, + "balance_loss_mlp": 1.01318145, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.8335576537445601, + "language_loss": 0.7581836, + "learning_rate": 7.219570183756052e-08, + "loss": 0.77933514, + "num_input_tokens_seen": 328938055, + "step": 15248, + "time_per_iteration": 2.5645558834075928 + }, + { + "auxiliary_loss_clip": 0.01084318, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.03137922, + "balance_loss_mlp": 1.0240556, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.380549877575975, + "language_loss": 0.7277087, + "learning_rate": 7.209204159518178e-08, + "loss": 0.74891365, + "num_input_tokens_seen": 328957895, + "step": 15249, + "time_per_iteration": 2.5697057247161865 + }, + { + "auxiliary_loss_clip": 0.01047339, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.03055835, + "balance_loss_mlp": 1.01516271, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 1.9258493834125112, + "language_loss": 0.7595917, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78033692, + "num_input_tokens_seen": 328971365, + "step": 15250, + "time_per_iteration": 2.614870309829712 + }, + { + "auxiliary_loss_clip": 0.01053842, + "auxiliary_loss_mlp": 0.01025759, + "balance_loss_clip": 1.03078461, + "balance_loss_mlp": 1.01466084, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.6190049830489412, + "language_loss": 0.7550658, + "learning_rate": 7.188494043374138e-08, + "loss": 0.77586174, + "num_input_tokens_seen": 328990830, + "step": 15251, + "time_per_iteration": 2.6679186820983887 + }, + { + "auxiliary_loss_clip": 0.01072298, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.03531158, + "balance_loss_mlp": 1.01732635, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.280640530403421, + "language_loss": 0.80148411, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82250869, + "num_input_tokens_seen": 329008345, + "step": 15252, + "time_per_iteration": 2.5820226669311523 + }, + { + "auxiliary_loss_clip": 0.01097532, + "auxiliary_loss_mlp": 0.0103137, + "balance_loss_clip": 1.03314078, + "balance_loss_mlp": 1.02059317, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.9882088442172343, + "language_loss": 0.77304643, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79433548, + "num_input_tokens_seen": 329027440, + "step": 15253, + "time_per_iteration": 2.5407183170318604 + }, + { + "auxiliary_loss_clip": 0.01088236, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.03433108, + "balance_loss_mlp": 1.0158422, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 1.7223417637455944, + "language_loss": 0.73009402, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75124103, + "num_input_tokens_seen": 329046445, + "step": 15254, + "time_per_iteration": 2.569239377975464 + }, + { + "auxiliary_loss_clip": 0.01058851, + "auxiliary_loss_mlp": 0.01023201, + "balance_loss_clip": 1.03192878, + "balance_loss_mlp": 1.01304483, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.957950080399804, + "language_loss": 0.79418743, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81500793, + "num_input_tokens_seen": 329065555, + "step": 15255, + "time_per_iteration": 4.174260377883911 + }, + { + "auxiliary_loss_clip": 0.01088351, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.03308499, + "balance_loss_mlp": 1.02145791, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 1.9793378168013565, + "language_loss": 0.68455172, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70576113, + "num_input_tokens_seen": 329087515, + "step": 15256, + "time_per_iteration": 2.7797393798828125 + }, + { + "auxiliary_loss_clip": 0.01078751, + "auxiliary_loss_mlp": 0.0103655, + "balance_loss_clip": 1.03201795, + "balance_loss_mlp": 1.02458715, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 2.13240465352751, + "language_loss": 0.83781052, + "learning_rate": 7.126539181842561e-08, + "loss": 0.85896349, + "num_input_tokens_seen": 329106820, + "step": 15257, + "time_per_iteration": 2.520989179611206 + }, + { + "auxiliary_loss_clip": 0.01068122, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.02979398, + "balance_loss_mlp": 1.02095616, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.512947393003167, + "language_loss": 0.77301544, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79400921, + "num_input_tokens_seen": 329126515, + "step": 15258, + "time_per_iteration": 2.5465776920318604 + }, + { + "auxiliary_loss_clip": 0.01081983, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.03651953, + "balance_loss_mlp": 1.02026081, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 2.131766484373745, + "language_loss": 0.7810576, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80218983, + "num_input_tokens_seen": 329142660, + "step": 15259, + "time_per_iteration": 2.524719476699829 + }, + { + "auxiliary_loss_clip": 0.01046822, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.02927208, + "balance_loss_mlp": 1.02098584, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.5858471338785332, + "language_loss": 0.76502258, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78580433, + "num_input_tokens_seen": 329162575, + "step": 15260, + "time_per_iteration": 2.6601223945617676 + }, + { + "auxiliary_loss_clip": 0.01053151, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.03018296, + "balance_loss_mlp": 1.01843119, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.7873564391842112, + "language_loss": 0.6093629, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63018364, + "num_input_tokens_seen": 329182090, + "step": 15261, + "time_per_iteration": 2.724884510040283 + }, + { + "auxiliary_loss_clip": 0.0106872, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.02955461, + "balance_loss_mlp": 1.01915932, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.7100298998912116, + "language_loss": 0.73519945, + "learning_rate": 7.075111255942002e-08, + "loss": 0.75618136, + "num_input_tokens_seen": 329196535, + "step": 15262, + "time_per_iteration": 2.5669777393341064 + }, + { + "auxiliary_loss_clip": 0.0109825, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.03204632, + "balance_loss_mlp": 1.0207231, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.7412767913898466, + "language_loss": 0.77444565, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79575193, + "num_input_tokens_seen": 329215135, + "step": 15263, + "time_per_iteration": 4.057616233825684 + }, + { + "auxiliary_loss_clip": 0.01099936, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.0340333, + "balance_loss_mlp": 1.02162051, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.814689983080771, + "language_loss": 0.76011735, + "learning_rate": 7.054591292971324e-08, + "loss": 0.78143942, + "num_input_tokens_seen": 329235150, + "step": 15264, + "time_per_iteration": 2.7986531257629395 + }, + { + "auxiliary_loss_clip": 0.01074519, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.0330137, + "balance_loss_mlp": 1.0221839, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.6533696367399182, + "language_loss": 0.83608234, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85715199, + "num_input_tokens_seen": 329254365, + "step": 15265, + "time_per_iteration": 2.6175625324249268 + }, + { + "auxiliary_loss_clip": 0.01101563, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.03529298, + "balance_loss_mlp": 1.02528667, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.8089038709064604, + "language_loss": 0.73065197, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75204051, + "num_input_tokens_seen": 329274385, + "step": 15266, + "time_per_iteration": 2.565295934677124 + }, + { + "auxiliary_loss_clip": 0.01096387, + "auxiliary_loss_mlp": 0.01027083, + "balance_loss_clip": 1.03252256, + "balance_loss_mlp": 1.01670623, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.5798015990736003, + "language_loss": 0.77686679, + "learning_rate": 7.023866223305486e-08, + "loss": 0.79810148, + "num_input_tokens_seen": 329292160, + "step": 15267, + "time_per_iteration": 2.528198003768921 + }, + { + "auxiliary_loss_clip": 0.01014294, + "auxiliary_loss_mlp": 0.00746628, + "balance_loss_clip": 1.00666833, + "balance_loss_mlp": 0.99976587, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7418943932286356, + "language_loss": 0.56211132, + "learning_rate": 7.013639168247975e-08, + "loss": 0.57972056, + "num_input_tokens_seen": 329351870, + "step": 15268, + "time_per_iteration": 5.031262636184692 + }, + { + "auxiliary_loss_clip": 0.01098828, + "auxiliary_loss_mlp": 0.00749206, + "balance_loss_clip": 1.03356218, + "balance_loss_mlp": 1.00018144, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 1.9697332047361178, + "language_loss": 0.7609998, + "learning_rate": 7.0034194312526e-08, + "loss": 0.7794801, + "num_input_tokens_seen": 329370930, + "step": 15269, + "time_per_iteration": 2.5357065200805664 + }, + { + "auxiliary_loss_clip": 0.01062214, + "auxiliary_loss_mlp": 0.01029555, + "balance_loss_clip": 1.03006387, + "balance_loss_mlp": 1.01803303, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 2.2063234256194595, + "language_loss": 0.72611856, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74703628, + "num_input_tokens_seen": 329391275, + "step": 15270, + "time_per_iteration": 2.9007208347320557 + }, + { + "auxiliary_loss_clip": 0.01094468, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.03218222, + "balance_loss_mlp": 1.01782882, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.7667106461631081, + "language_loss": 0.7980001, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81923389, + "num_input_tokens_seen": 329412775, + "step": 15271, + "time_per_iteration": 2.5520105361938477 + }, + { + "auxiliary_loss_clip": 0.01049438, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.02818334, + "balance_loss_mlp": 1.02181792, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 2.8381096658474028, + "language_loss": 0.72714609, + "learning_rate": 6.972804132513355e-08, + "loss": 0.74797332, + "num_input_tokens_seen": 329432440, + "step": 15272, + "time_per_iteration": 2.696079730987549 + }, + { + "auxiliary_loss_clip": 0.01067839, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.03252006, + "balance_loss_mlp": 1.01967192, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 1.8605010817606118, + "language_loss": 0.72654414, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74752033, + "num_input_tokens_seen": 329450605, + "step": 15273, + "time_per_iteration": 2.583871603012085 + }, + { + "auxiliary_loss_clip": 0.01058177, + "auxiliary_loss_mlp": 0.01026376, + "balance_loss_clip": 1.03102469, + "balance_loss_mlp": 1.01679194, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.6788641001746132, + "language_loss": 0.74316388, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76400942, + "num_input_tokens_seen": 329470550, + "step": 15274, + "time_per_iteration": 2.677314281463623 + }, + { + "auxiliary_loss_clip": 0.01081619, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.03013098, + "balance_loss_mlp": 1.02416563, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.7216843919535365, + "language_loss": 0.68798637, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70915043, + "num_input_tokens_seen": 329489765, + "step": 15275, + "time_per_iteration": 2.6089136600494385 + }, + { + "auxiliary_loss_clip": 0.01084009, + "auxiliary_loss_mlp": 0.01027407, + "balance_loss_clip": 1.03311944, + "balance_loss_mlp": 1.01678586, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.9216533834335405, + "language_loss": 0.72566879, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74678296, + "num_input_tokens_seen": 329507040, + "step": 15276, + "time_per_iteration": 2.579808473587036 + }, + { + "auxiliary_loss_clip": 0.01075135, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.03294826, + "balance_loss_mlp": 1.01791739, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.7824726397349528, + "language_loss": 0.73628974, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75732183, + "num_input_tokens_seen": 329525540, + "step": 15277, + "time_per_iteration": 4.082295179367065 + }, + { + "auxiliary_loss_clip": 0.00997698, + "auxiliary_loss_mlp": 0.01001574, + "balance_loss_clip": 1.00704861, + "balance_loss_mlp": 1.00064468, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7170068113959566, + "language_loss": 0.59224403, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61223674, + "num_input_tokens_seen": 329592905, + "step": 15278, + "time_per_iteration": 3.35894513130188 + }, + { + "auxiliary_loss_clip": 0.01058515, + "auxiliary_loss_mlp": 0.01025904, + "balance_loss_clip": 1.02926731, + "balance_loss_mlp": 1.01632535, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.9022572144727439, + "language_loss": 0.64347172, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66431594, + "num_input_tokens_seen": 329610150, + "step": 15279, + "time_per_iteration": 2.602717399597168 + }, + { + "auxiliary_loss_clip": 0.01022283, + "auxiliary_loss_mlp": 0.00746563, + "balance_loss_clip": 1.00244308, + "balance_loss_mlp": 0.99972671, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.8489846951095881, + "language_loss": 0.60170543, + "learning_rate": 6.891485427041211e-08, + "loss": 0.61939394, + "num_input_tokens_seen": 329673650, + "step": 15280, + "time_per_iteration": 3.057718515396118 + }, + { + "auxiliary_loss_clip": 0.01076674, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.03164399, + "balance_loss_mlp": 1.02158654, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 1.6787807582421734, + "language_loss": 0.69453096, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71562386, + "num_input_tokens_seen": 329692520, + "step": 15281, + "time_per_iteration": 2.5828919410705566 + }, + { + "auxiliary_loss_clip": 0.01077066, + "auxiliary_loss_mlp": 0.01027302, + "balance_loss_clip": 1.03316414, + "balance_loss_mlp": 1.01549995, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 1.6437313196400836, + "language_loss": 0.84788585, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86892956, + "num_input_tokens_seen": 329713750, + "step": 15282, + "time_per_iteration": 2.6073966026306152 + }, + { + "auxiliary_loss_clip": 0.01071427, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.0306952, + "balance_loss_mlp": 1.01978588, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.9554635109589924, + "language_loss": 0.60385323, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62487859, + "num_input_tokens_seen": 329730960, + "step": 15283, + "time_per_iteration": 2.5615084171295166 + }, + { + "auxiliary_loss_clip": 0.01090502, + "auxiliary_loss_mlp": 0.00749423, + "balance_loss_clip": 1.03453612, + "balance_loss_mlp": 1.00017381, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.5191150842133814, + "language_loss": 0.65289187, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67129111, + "num_input_tokens_seen": 329750975, + "step": 15284, + "time_per_iteration": 2.6272895336151123 + }, + { + "auxiliary_loss_clip": 0.01093882, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.03117228, + "balance_loss_mlp": 1.0181663, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.7593926152346129, + "language_loss": 0.73957795, + "learning_rate": 6.840899211156292e-08, + "loss": 0.76081145, + "num_input_tokens_seen": 329769645, + "step": 15285, + "time_per_iteration": 2.492753744125366 + }, + { + "auxiliary_loss_clip": 0.01095399, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.03247058, + "balance_loss_mlp": 1.02104831, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 2.068964402116771, + "language_loss": 0.71686262, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73813993, + "num_input_tokens_seen": 329788185, + "step": 15286, + "time_per_iteration": 2.5066182613372803 + }, + { + "auxiliary_loss_clip": 0.01097664, + "auxiliary_loss_mlp": 0.01027654, + "balance_loss_clip": 1.03364122, + "balance_loss_mlp": 1.01647854, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 1.7555441599277584, + "language_loss": 0.73410636, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75535953, + "num_input_tokens_seen": 329806780, + "step": 15287, + "time_per_iteration": 2.595830202102661 + }, + { + "auxiliary_loss_clip": 0.01100224, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.03599548, + "balance_loss_mlp": 1.01718926, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 1.9396888472055087, + "language_loss": 0.65671712, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67801237, + "num_input_tokens_seen": 329826350, + "step": 15288, + "time_per_iteration": 2.5081441402435303 + }, + { + "auxiliary_loss_clip": 0.01101378, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.03769076, + "balance_loss_mlp": 1.02199316, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 2.2340263204569135, + "language_loss": 0.71411639, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73545897, + "num_input_tokens_seen": 329846160, + "step": 15289, + "time_per_iteration": 2.6501991748809814 + }, + { + "auxiliary_loss_clip": 0.01051667, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.03021622, + "balance_loss_mlp": 1.01934528, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 2.9409462002859175, + "language_loss": 0.74227285, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76309991, + "num_input_tokens_seen": 329862020, + "step": 15290, + "time_per_iteration": 2.6229400634765625 + }, + { + "auxiliary_loss_clip": 0.01054676, + "auxiliary_loss_mlp": 0.01026568, + "balance_loss_clip": 1.03291917, + "balance_loss_mlp": 1.01627445, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 2.0671019572458813, + "language_loss": 0.71814543, + "learning_rate": 6.78043746849506e-08, + "loss": 0.73895788, + "num_input_tokens_seen": 329880185, + "step": 15291, + "time_per_iteration": 2.7026665210723877 + }, + { + "auxiliary_loss_clip": 0.01069969, + "auxiliary_loss_mlp": 0.01024448, + "balance_loss_clip": 1.03069091, + "balance_loss_mlp": 1.01367188, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.868001089787203, + "language_loss": 0.7059896, + "learning_rate": 6.770386153326346e-08, + "loss": 0.72693372, + "num_input_tokens_seen": 329900255, + "step": 15292, + "time_per_iteration": 2.6159720420837402 + }, + { + "auxiliary_loss_clip": 0.01077378, + "auxiliary_loss_mlp": 0.01027121, + "balance_loss_clip": 1.03377748, + "balance_loss_mlp": 1.01595068, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 2.610681792964131, + "language_loss": 0.73046613, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75151122, + "num_input_tokens_seen": 329919095, + "step": 15293, + "time_per_iteration": 2.6133193969726562 + }, + { + "auxiliary_loss_clip": 0.01096035, + "auxiliary_loss_mlp": 0.01025956, + "balance_loss_clip": 1.03401148, + "balance_loss_mlp": 1.01525712, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 2.25528004614088, + "language_loss": 0.78225648, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80347633, + "num_input_tokens_seen": 329936505, + "step": 15294, + "time_per_iteration": 2.5187251567840576 + }, + { + "auxiliary_loss_clip": 0.01073216, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.03132463, + "balance_loss_mlp": 1.01990879, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.5388210405836726, + "language_loss": 0.7693764, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79042602, + "num_input_tokens_seen": 329956795, + "step": 15295, + "time_per_iteration": 4.100630044937134 + }, + { + "auxiliary_loss_clip": 0.01095522, + "auxiliary_loss_mlp": 0.01027488, + "balance_loss_clip": 1.03461277, + "balance_loss_mlp": 1.01818395, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.3669455954739838, + "language_loss": 0.71476084, + "learning_rate": 6.730254169322114e-08, + "loss": 0.735991, + "num_input_tokens_seen": 329977195, + "step": 15296, + "time_per_iteration": 2.574836492538452 + }, + { + "auxiliary_loss_clip": 0.01097421, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.03473234, + "balance_loss_mlp": 1.02395129, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.339550576487661, + "language_loss": 0.75078213, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77210134, + "num_input_tokens_seen": 329992095, + "step": 15297, + "time_per_iteration": 2.4652342796325684 + }, + { + "auxiliary_loss_clip": 0.01082905, + "auxiliary_loss_mlp": 0.007495, + "balance_loss_clip": 1.03321862, + "balance_loss_mlp": 1.00026512, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.5617867624569368, + "language_loss": 0.73626673, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75459087, + "num_input_tokens_seen": 330011490, + "step": 15298, + "time_per_iteration": 2.6474928855895996 + }, + { + "auxiliary_loss_clip": 0.01070057, + "auxiliary_loss_mlp": 0.0103333, + "balance_loss_clip": 1.03317094, + "balance_loss_mlp": 1.02245808, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.122519532422908, + "language_loss": 0.7927174, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81375128, + "num_input_tokens_seen": 330027885, + "step": 15299, + "time_per_iteration": 2.574120044708252 + }, + { + "auxiliary_loss_clip": 0.01066427, + "auxiliary_loss_mlp": 0.01022056, + "balance_loss_clip": 1.03318787, + "balance_loss_mlp": 1.01176178, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 1.948838819521124, + "language_loss": 0.63629901, + "learning_rate": 6.690239446242385e-08, + "loss": 0.65718383, + "num_input_tokens_seen": 330046230, + "step": 15300, + "time_per_iteration": 2.6405444145202637 + }, + { + "auxiliary_loss_clip": 0.01070662, + "auxiliary_loss_mlp": 0.00749021, + "balance_loss_clip": 1.03253961, + "balance_loss_mlp": 1.00018072, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.834354946067662, + "language_loss": 0.69627231, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71446913, + "num_input_tokens_seen": 330065535, + "step": 15301, + "time_per_iteration": 2.6144516468048096 + }, + { + "auxiliary_loss_clip": 0.01081583, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.03176498, + "balance_loss_mlp": 1.02230144, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 2.1540839535959146, + "language_loss": 0.70773238, + "learning_rate": 6.670276065138814e-08, + "loss": 0.72890782, + "num_input_tokens_seen": 330082920, + "step": 15302, + "time_per_iteration": 2.5173401832580566 + }, + { + "auxiliary_loss_clip": 0.01098231, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.03401363, + "balance_loss_mlp": 1.01780891, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.6983322386908273, + "language_loss": 0.76247728, + "learning_rate": 6.660305371021579e-08, + "loss": 0.78374445, + "num_input_tokens_seen": 330101165, + "step": 15303, + "time_per_iteration": 2.5570430755615234 + }, + { + "auxiliary_loss_clip": 0.01073617, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.03203475, + "balance_loss_mlp": 1.01769459, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 3.823111407653633, + "language_loss": 0.87974238, + "learning_rate": 6.650342008365006e-08, + "loss": 0.90076113, + "num_input_tokens_seen": 330118775, + "step": 15304, + "time_per_iteration": 4.033865451812744 + }, + { + "auxiliary_loss_clip": 0.01030524, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.02963257, + "balance_loss_mlp": 1.02249932, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 2.060473688839959, + "language_loss": 0.77061176, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79128242, + "num_input_tokens_seen": 330135570, + "step": 15305, + "time_per_iteration": 2.6598331928253174 + }, + { + "auxiliary_loss_clip": 0.01077395, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.03195, + "balance_loss_mlp": 1.024243, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 1.9040496387199188, + "language_loss": 0.80968308, + "learning_rate": 6.630437278944501e-08, + "loss": 0.83080977, + "num_input_tokens_seen": 330152840, + "step": 15306, + "time_per_iteration": 2.564379930496216 + }, + { + "auxiliary_loss_clip": 0.01055835, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.02944756, + "balance_loss_mlp": 1.02254343, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 2.2901486627596683, + "language_loss": 0.72087252, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74175549, + "num_input_tokens_seen": 330168605, + "step": 15307, + "time_per_iteration": 2.630563735961914 + }, + { + "auxiliary_loss_clip": 0.01088571, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.03369904, + "balance_loss_mlp": 1.01843488, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.8756879967065874, + "language_loss": 0.78805661, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80924183, + "num_input_tokens_seen": 330186160, + "step": 15308, + "time_per_iteration": 4.0602028369903564 + }, + { + "auxiliary_loss_clip": 0.010718, + "auxiliary_loss_mlp": 0.01029531, + "balance_loss_clip": 1.02987027, + "balance_loss_mlp": 1.01838553, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 1.8250109884178374, + "language_loss": 0.77538514, + "learning_rate": 6.600635180204484e-08, + "loss": 0.79639852, + "num_input_tokens_seen": 330201780, + "step": 15309, + "time_per_iteration": 2.5407540798187256 + }, + { + "auxiliary_loss_clip": 0.01037518, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.02738512, + "balance_loss_mlp": 1.01800764, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 1.9983962690759471, + "language_loss": 0.66580176, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68646997, + "num_input_tokens_seen": 330219165, + "step": 15310, + "time_per_iteration": 2.6700172424316406 + }, + { + "auxiliary_loss_clip": 0.01030321, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.02841115, + "balance_loss_mlp": 1.01823521, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6612377464779584, + "language_loss": 0.65845776, + "learning_rate": 6.580803782366495e-08, + "loss": 0.67905682, + "num_input_tokens_seen": 330238975, + "step": 15311, + "time_per_iteration": 2.7222390174865723 + }, + { + "auxiliary_loss_clip": 0.01085954, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.032022, + "balance_loss_mlp": 1.01956415, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.6215390904808675, + "language_loss": 0.75688654, + "learning_rate": 6.570899084972503e-08, + "loss": 0.7780512, + "num_input_tokens_seen": 330259755, + "step": 15312, + "time_per_iteration": 2.6049118041992188 + }, + { + "auxiliary_loss_clip": 0.01083317, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.033108, + "balance_loss_mlp": 1.02321649, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.5990274882323439, + "language_loss": 0.79223049, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81339991, + "num_input_tokens_seen": 330277660, + "step": 15313, + "time_per_iteration": 2.534539222717285 + }, + { + "auxiliary_loss_clip": 0.01086511, + "auxiliary_loss_mlp": 0.01026284, + "balance_loss_clip": 1.03215349, + "balance_loss_mlp": 1.01538277, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.88823551569096, + "language_loss": 0.7820937, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80322164, + "num_input_tokens_seen": 330295455, + "step": 15314, + "time_per_iteration": 2.5684239864349365 + }, + { + "auxiliary_loss_clip": 0.01081338, + "auxiliary_loss_mlp": 0.01031812, + "balance_loss_clip": 1.03423429, + "balance_loss_mlp": 1.01951528, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 1.7732731641255461, + "language_loss": 0.79227453, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81340599, + "num_input_tokens_seen": 330315310, + "step": 15315, + "time_per_iteration": 2.563246965408325 + }, + { + "auxiliary_loss_clip": 0.01078984, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.0340035, + "balance_loss_mlp": 1.02205944, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.8701243666469762, + "language_loss": 0.76453531, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78566045, + "num_input_tokens_seen": 330333260, + "step": 15316, + "time_per_iteration": 2.529104471206665 + }, + { + "auxiliary_loss_clip": 0.01096659, + "auxiliary_loss_mlp": 0.01030667, + "balance_loss_clip": 1.03194463, + "balance_loss_mlp": 1.01935422, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.8718362995000357, + "language_loss": 0.69378108, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71505427, + "num_input_tokens_seen": 330352465, + "step": 15317, + "time_per_iteration": 4.07588529586792 + }, + { + "auxiliary_loss_clip": 0.01088401, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.035326, + "balance_loss_mlp": 1.01972771, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 1.704946967457598, + "language_loss": 0.83500504, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85619688, + "num_input_tokens_seen": 330372685, + "step": 15318, + "time_per_iteration": 2.5508131980895996 + }, + { + "auxiliary_loss_clip": 0.01075822, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.03323805, + "balance_loss_mlp": 1.02210999, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 1.898180739583376, + "language_loss": 0.85462463, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87571359, + "num_input_tokens_seen": 330388860, + "step": 15319, + "time_per_iteration": 2.5409209728240967 + }, + { + "auxiliary_loss_clip": 0.01022297, + "auxiliary_loss_mlp": 0.01000257, + "balance_loss_clip": 1.0025022, + "balance_loss_mlp": 0.99925536, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7725276355931646, + "language_loss": 0.56196654, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58219206, + "num_input_tokens_seen": 330448735, + "step": 15320, + "time_per_iteration": 3.102971315383911 + }, + { + "auxiliary_loss_clip": 0.01045309, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.03152418, + "balance_loss_mlp": 1.02806997, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.7822387600111873, + "language_loss": 0.63892746, + "learning_rate": 6.482086921695384e-08, + "loss": 0.65979475, + "num_input_tokens_seen": 330465600, + "step": 15321, + "time_per_iteration": 2.666290521621704 + }, + { + "auxiliary_loss_clip": 0.01053696, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.03071737, + "balance_loss_mlp": 1.01875377, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.5543889160648117, + "language_loss": 0.71922636, + "learning_rate": 6.47225558966582e-08, + "loss": 0.74005395, + "num_input_tokens_seen": 330485770, + "step": 15322, + "time_per_iteration": 2.670494794845581 + }, + { + "auxiliary_loss_clip": 0.01047274, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03140593, + "balance_loss_mlp": 1.02040553, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.6646348626740817, + "language_loss": 0.70279562, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72357285, + "num_input_tokens_seen": 330504255, + "step": 15323, + "time_per_iteration": 2.6123781204223633 + }, + { + "auxiliary_loss_clip": 0.01068471, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.03007555, + "balance_loss_mlp": 1.01760471, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 2.2302669498941188, + "language_loss": 0.74447238, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76546192, + "num_input_tokens_seen": 330520705, + "step": 15324, + "time_per_iteration": 2.526085376739502 + }, + { + "auxiliary_loss_clip": 0.01086681, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.03320456, + "balance_loss_mlp": 1.02889001, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 2.065384750508658, + "language_loss": 0.71140075, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73266715, + "num_input_tokens_seen": 330539245, + "step": 15325, + "time_per_iteration": 2.6347947120666504 + }, + { + "auxiliary_loss_clip": 0.0107043, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.03305304, + "balance_loss_mlp": 1.0201447, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.791689656407903, + "language_loss": 0.78687185, + "learning_rate": 6.433003651186109e-08, + "loss": 0.8078866, + "num_input_tokens_seen": 330561815, + "step": 15326, + "time_per_iteration": 2.6672203540802 + }, + { + "auxiliary_loss_clip": 0.01089287, + "auxiliary_loss_mlp": 0.01030669, + "balance_loss_clip": 1.03476715, + "balance_loss_mlp": 1.01956451, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 2.544653035592959, + "language_loss": 0.71425509, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73545468, + "num_input_tokens_seen": 330579760, + "step": 15327, + "time_per_iteration": 2.4948298931121826 + }, + { + "auxiliary_loss_clip": 0.01090954, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.03555119, + "balance_loss_mlp": 1.02434468, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 1.7631173128738153, + "language_loss": 0.78083062, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80210131, + "num_input_tokens_seen": 330598545, + "step": 15328, + "time_per_iteration": 2.577263593673706 + }, + { + "auxiliary_loss_clip": 0.01077129, + "auxiliary_loss_mlp": 0.01025655, + "balance_loss_clip": 1.03428316, + "balance_loss_mlp": 1.01522422, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 3.5417521332192545, + "language_loss": 0.70938003, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73040789, + "num_input_tokens_seen": 330616700, + "step": 15329, + "time_per_iteration": 2.602424144744873 + }, + { + "auxiliary_loss_clip": 0.01084984, + "auxiliary_loss_mlp": 0.01023322, + "balance_loss_clip": 1.03193307, + "balance_loss_mlp": 1.01336777, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 1.891253989406102, + "language_loss": 0.86329567, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88437879, + "num_input_tokens_seen": 330633355, + "step": 15330, + "time_per_iteration": 2.478663206100464 + }, + { + "auxiliary_loss_clip": 0.01058272, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.03020298, + "balance_loss_mlp": 1.02040625, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.2686002647577532, + "language_loss": 0.75643128, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77733099, + "num_input_tokens_seen": 330651470, + "step": 15331, + "time_per_iteration": 2.586301803588867 + }, + { + "auxiliary_loss_clip": 0.0108583, + "auxiliary_loss_mlp": 0.01024717, + "balance_loss_clip": 1.0325551, + "balance_loss_mlp": 1.01406598, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.7930493119729516, + "language_loss": 0.75348175, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77458727, + "num_input_tokens_seen": 330669170, + "step": 15332, + "time_per_iteration": 2.573359727859497 + }, + { + "auxiliary_loss_clip": 0.01041475, + "auxiliary_loss_mlp": 0.01030789, + "balance_loss_clip": 1.0286243, + "balance_loss_mlp": 1.02049553, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 1.8113040594901348, + "language_loss": 0.74504864, + "learning_rate": 6.364595366195358e-08, + "loss": 0.76577127, + "num_input_tokens_seen": 330686635, + "step": 15333, + "time_per_iteration": 2.669428825378418 + }, + { + "auxiliary_loss_clip": 0.01013842, + "auxiliary_loss_mlp": 0.01001267, + "balance_loss_clip": 1.00616932, + "balance_loss_mlp": 1.00030696, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.82452883359951, + "language_loss": 0.52972913, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54988015, + "num_input_tokens_seen": 330749160, + "step": 15334, + "time_per_iteration": 3.105712652206421 + }, + { + "auxiliary_loss_clip": 0.01066384, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.03099585, + "balance_loss_mlp": 1.02067566, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 1.9170343023305838, + "language_loss": 0.62506723, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64603913, + "num_input_tokens_seen": 330766840, + "step": 15335, + "time_per_iteration": 4.025209903717041 + }, + { + "auxiliary_loss_clip": 0.01038883, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.0288043, + "balance_loss_mlp": 1.01884842, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.8152903044608593, + "language_loss": 0.71658093, + "learning_rate": 6.335387662475366e-08, + "loss": 0.7372756, + "num_input_tokens_seen": 330785585, + "step": 15336, + "time_per_iteration": 2.6313610076904297 + }, + { + "auxiliary_loss_clip": 0.01071775, + "auxiliary_loss_mlp": 0.01029778, + "balance_loss_clip": 1.03159118, + "balance_loss_mlp": 1.02016354, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.860030840903329, + "language_loss": 0.72217792, + "learning_rate": 6.325666448306433e-08, + "loss": 0.74319345, + "num_input_tokens_seen": 330800750, + "step": 15337, + "time_per_iteration": 2.5319411754608154 + }, + { + "auxiliary_loss_clip": 0.01012841, + "auxiliary_loss_mlp": 0.01004824, + "balance_loss_clip": 1.00265622, + "balance_loss_mlp": 1.00397146, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8881308881333784, + "language_loss": 0.65287781, + "learning_rate": 6.31595257828763e-08, + "loss": 0.6730544, + "num_input_tokens_seen": 330863640, + "step": 15338, + "time_per_iteration": 3.076841115951538 + }, + { + "auxiliary_loss_clip": 0.01089665, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.03507805, + "balance_loss_mlp": 1.02075422, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 2.116355310074575, + "language_loss": 0.67293513, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69415331, + "num_input_tokens_seen": 330884675, + "step": 15339, + "time_per_iteration": 2.5943634510040283 + }, + { + "auxiliary_loss_clip": 0.01098343, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.03481746, + "balance_loss_mlp": 1.01928902, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 1.8169481878343705, + "language_loss": 0.71710098, + "learning_rate": 6.296546872173513e-08, + "loss": 0.73838699, + "num_input_tokens_seen": 330904125, + "step": 15340, + "time_per_iteration": 2.5385916233062744 + }, + { + "auxiliary_loss_clip": 0.01062041, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.03130722, + "balance_loss_mlp": 1.02328396, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 2.46476730359113, + "language_loss": 0.70679331, + "learning_rate": 6.286855036814098e-08, + "loss": 0.7277528, + "num_input_tokens_seen": 330925140, + "step": 15341, + "time_per_iteration": 2.6751785278320312 + }, + { + "auxiliary_loss_clip": 0.01043797, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.03170717, + "balance_loss_mlp": 1.01746535, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.9350880501529053, + "language_loss": 0.67375433, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69446379, + "num_input_tokens_seen": 330946625, + "step": 15342, + "time_per_iteration": 2.7083470821380615 + }, + { + "auxiliary_loss_clip": 0.01049759, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.01965857, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 2.0091093589316267, + "language_loss": 0.69843698, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71923059, + "num_input_tokens_seen": 330967795, + "step": 15343, + "time_per_iteration": 4.375161647796631 + }, + { + "auxiliary_loss_clip": 0.0100363, + "auxiliary_loss_mlp": 0.01001371, + "balance_loss_clip": 1.00324965, + "balance_loss_mlp": 1.000471, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7139125525633514, + "language_loss": 0.51985967, + "learning_rate": 6.257823605935786e-08, + "loss": 0.53990966, + "num_input_tokens_seen": 331040850, + "step": 15344, + "time_per_iteration": 3.2651712894439697 + }, + { + "auxiliary_loss_clip": 0.01091996, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.03321683, + "balance_loss_mlp": 1.01964378, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 2.5053911788310184, + "language_loss": 0.70478916, + "learning_rate": 6.248161155266162e-08, + "loss": 0.72600269, + "num_input_tokens_seen": 331060595, + "step": 15345, + "time_per_iteration": 2.482452154159546 + }, + { + "auxiliary_loss_clip": 0.01074754, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.03126466, + "balance_loss_mlp": 1.02229691, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.7384547064347537, + "language_loss": 0.77527046, + "learning_rate": 6.238506051685677e-08, + "loss": 0.7963503, + "num_input_tokens_seen": 331080195, + "step": 15346, + "time_per_iteration": 2.646580219268799 + }, + { + "auxiliary_loss_clip": 0.01078071, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.0350492, + "balance_loss_mlp": 1.02258539, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 1.9288306310203647, + "language_loss": 0.76113731, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78225768, + "num_input_tokens_seen": 331097645, + "step": 15347, + "time_per_iteration": 2.5582375526428223 + }, + { + "auxiliary_loss_clip": 0.01083853, + "auxiliary_loss_mlp": 0.01029807, + "balance_loss_clip": 1.03462505, + "balance_loss_mlp": 1.01978779, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.4744771702342268, + "language_loss": 0.76857829, + "learning_rate": 6.219217887256367e-08, + "loss": 0.78971487, + "num_input_tokens_seen": 331116830, + "step": 15348, + "time_per_iteration": 4.0721435546875 + }, + { + "auxiliary_loss_clip": 0.01074063, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.03160799, + "balance_loss_mlp": 1.02101541, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 7.334193432400714, + "language_loss": 0.67606986, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69713247, + "num_input_tokens_seen": 331137235, + "step": 15349, + "time_per_iteration": 2.616302490234375 + }, + { + "auxiliary_loss_clip": 0.01052215, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.02981544, + "balance_loss_mlp": 1.01668596, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 2.134295927319142, + "language_loss": 0.86925203, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89005083, + "num_input_tokens_seen": 331153155, + "step": 15350, + "time_per_iteration": 2.566025495529175 + }, + { + "auxiliary_loss_clip": 0.01003315, + "auxiliary_loss_mlp": 0.01002784, + "balance_loss_clip": 1.0032413, + "balance_loss_mlp": 1.00192547, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.764212602640533, + "language_loss": 0.60372019, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62378114, + "num_input_tokens_seen": 331214895, + "step": 15351, + "time_per_iteration": 3.0643038749694824 + }, + { + "auxiliary_loss_clip": 0.01071699, + "auxiliary_loss_mlp": 0.0102253, + "balance_loss_clip": 1.03040981, + "balance_loss_mlp": 1.01191425, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 3.0018294461441317, + "language_loss": 0.77446377, + "learning_rate": 6.180729739558233e-08, + "loss": 0.7954061, + "num_input_tokens_seen": 331232185, + "step": 15352, + "time_per_iteration": 2.5529744625091553 + }, + { + "auxiliary_loss_clip": 0.01058718, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_clip": 1.0296458, + "balance_loss_mlp": 1.02920175, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 2.0930203589613843, + "language_loss": 0.59799021, + "learning_rate": 6.171126075837585e-08, + "loss": 0.61900228, + "num_input_tokens_seen": 331251065, + "step": 15353, + "time_per_iteration": 2.647409200668335 + }, + { + "auxiliary_loss_clip": 0.0106921, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.03184104, + "balance_loss_mlp": 1.01856899, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.7977152628165203, + "language_loss": 0.74722874, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76821089, + "num_input_tokens_seen": 331269110, + "step": 15354, + "time_per_iteration": 2.632863998413086 + }, + { + "auxiliary_loss_clip": 0.01099657, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.03331113, + "balance_loss_mlp": 1.01917362, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 2.1231951689698567, + "language_loss": 0.6500805, + "learning_rate": 6.1519407987912e-08, + "loss": 0.67138791, + "num_input_tokens_seen": 331286555, + "step": 15355, + "time_per_iteration": 2.531230926513672 + }, + { + "auxiliary_loss_clip": 0.01068341, + "auxiliary_loss_mlp": 0.01037321, + "balance_loss_clip": 1.03174639, + "balance_loss_mlp": 1.02597189, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.5128879397802297, + "language_loss": 0.74031913, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76137578, + "num_input_tokens_seen": 331307660, + "step": 15356, + "time_per_iteration": 2.6857502460479736 + }, + { + "auxiliary_loss_clip": 0.01070346, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.03127027, + "balance_loss_mlp": 1.02213502, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 1.9814972681860965, + "language_loss": 0.6096611, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63069725, + "num_input_tokens_seen": 331324885, + "step": 15357, + "time_per_iteration": 2.5933468341827393 + }, + { + "auxiliary_loss_clip": 0.01068152, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.03181672, + "balance_loss_mlp": 1.0172714, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.4140422426694956, + "language_loss": 0.70188642, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72285378, + "num_input_tokens_seen": 331345885, + "step": 15358, + "time_per_iteration": 4.2136070728302 + }, + { + "auxiliary_loss_clip": 0.01095526, + "auxiliary_loss_mlp": 0.0102816, + "balance_loss_clip": 1.03247416, + "balance_loss_mlp": 1.01716316, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 2.0343454529894474, + "language_loss": 0.73234278, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75357962, + "num_input_tokens_seen": 331364320, + "step": 15359, + "time_per_iteration": 2.5711464881896973 + }, + { + "auxiliary_loss_clip": 0.0103514, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.03257704, + "balance_loss_mlp": 1.01911283, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 2.060563866261277, + "language_loss": 0.65007287, + "learning_rate": 6.104106250440732e-08, + "loss": 0.67072248, + "num_input_tokens_seen": 331384135, + "step": 15360, + "time_per_iteration": 2.8373680114746094 + }, + { + "auxiliary_loss_clip": 0.01012581, + "auxiliary_loss_mlp": 0.00746602, + "balance_loss_clip": 1.00275147, + "balance_loss_mlp": 0.99974155, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7598322789624812, + "language_loss": 0.55120343, + "learning_rate": 6.094561396976083e-08, + "loss": 0.56879526, + "num_input_tokens_seen": 331440645, + "step": 15361, + "time_per_iteration": 3.198185682296753 + }, + { + "auxiliary_loss_clip": 0.01060886, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.02901947, + "balance_loss_mlp": 1.02111971, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 2.8014563687799545, + "language_loss": 0.69799149, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71893108, + "num_input_tokens_seen": 331459580, + "step": 15362, + "time_per_iteration": 2.6245839595794678 + }, + { + "auxiliary_loss_clip": 0.01080259, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.03127861, + "balance_loss_mlp": 1.01919746, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.5128444116900963, + "language_loss": 0.75835836, + "learning_rate": 6.075493749149463e-08, + "loss": 0.77948332, + "num_input_tokens_seen": 331481560, + "step": 15363, + "time_per_iteration": 2.6146929264068604 + }, + { + "auxiliary_loss_clip": 0.01096958, + "auxiliary_loss_mlp": 0.0102836, + "balance_loss_clip": 1.03336549, + "balance_loss_mlp": 1.01734519, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 1.8921588924580186, + "language_loss": 0.82884818, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85010135, + "num_input_tokens_seen": 331499090, + "step": 15364, + "time_per_iteration": 2.584594488143921 + }, + { + "auxiliary_loss_clip": 0.01062384, + "auxiliary_loss_mlp": 0.01023045, + "balance_loss_clip": 1.03231144, + "balance_loss_mlp": 1.01298332, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.6890384082814176, + "language_loss": 0.67986798, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70072228, + "num_input_tokens_seen": 331519420, + "step": 15365, + "time_per_iteration": 2.6408193111419678 + }, + { + "auxiliary_loss_clip": 0.01098427, + "auxiliary_loss_mlp": 0.01025741, + "balance_loss_clip": 1.03421688, + "balance_loss_mlp": 1.01480961, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.044883545445625, + "language_loss": 0.63112622, + "learning_rate": 6.046947430586913e-08, + "loss": 0.65236789, + "num_input_tokens_seen": 331538720, + "step": 15366, + "time_per_iteration": 2.5994279384613037 + }, + { + "auxiliary_loss_clip": 0.01065163, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.03448391, + "balance_loss_mlp": 1.01655877, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.8280945382638765, + "language_loss": 0.74340975, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76433945, + "num_input_tokens_seen": 331558505, + "step": 15367, + "time_per_iteration": 2.5969645977020264 + }, + { + "auxiliary_loss_clip": 0.01072116, + "auxiliary_loss_mlp": 0.00749087, + "balance_loss_clip": 1.03226411, + "balance_loss_mlp": 1.00019813, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 2.0946293127518887, + "language_loss": 0.64925069, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66746271, + "num_input_tokens_seen": 331578440, + "step": 15368, + "time_per_iteration": 2.6574032306671143 + }, + { + "auxiliary_loss_clip": 0.01087283, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.03233635, + "balance_loss_mlp": 1.01994586, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 2.036364911732793, + "language_loss": 0.7445491, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76573622, + "num_input_tokens_seen": 331598945, + "step": 15369, + "time_per_iteration": 2.5938215255737305 + }, + { + "auxiliary_loss_clip": 0.01092869, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.03648162, + "balance_loss_mlp": 1.01996589, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 1.9925223613410787, + "language_loss": 0.76484591, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78610092, + "num_input_tokens_seen": 331616700, + "step": 15370, + "time_per_iteration": 2.565340280532837 + }, + { + "auxiliary_loss_clip": 0.01098524, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.03414583, + "balance_loss_mlp": 1.01908946, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.265520597384197, + "language_loss": 0.67328453, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69456959, + "num_input_tokens_seen": 331635625, + "step": 15371, + "time_per_iteration": 2.511235237121582 + }, + { + "auxiliary_loss_clip": 0.01004898, + "auxiliary_loss_mlp": 0.00997969, + "balance_loss_clip": 1.00494695, + "balance_loss_mlp": 0.99699748, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7250171444064222, + "language_loss": 0.57676423, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59679288, + "num_input_tokens_seen": 331698595, + "step": 15372, + "time_per_iteration": 3.0956552028656006 + }, + { + "auxiliary_loss_clip": 0.01097179, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.03560948, + "balance_loss_mlp": 1.01764286, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 1.841573652069827, + "language_loss": 0.69764221, + "learning_rate": 5.98059678590237e-08, + "loss": 0.71889007, + "num_input_tokens_seen": 331717975, + "step": 15373, + "time_per_iteration": 2.505078077316284 + }, + { + "auxiliary_loss_clip": 0.01082171, + "auxiliary_loss_mlp": 0.01036735, + "balance_loss_clip": 1.03312147, + "balance_loss_mlp": 1.02596986, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 3.4105552705911464, + "language_loss": 0.75308079, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77426982, + "num_input_tokens_seen": 331737220, + "step": 15374, + "time_per_iteration": 2.507316827774048 + }, + { + "auxiliary_loss_clip": 0.01056138, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.03059936, + "balance_loss_mlp": 1.02043748, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.623815438630629, + "language_loss": 0.64871216, + "learning_rate": 5.961705668581784e-08, + "loss": 0.66958654, + "num_input_tokens_seen": 331757300, + "step": 15375, + "time_per_iteration": 4.126837491989136 + }, + { + "auxiliary_loss_clip": 0.01070536, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.03502882, + "balance_loss_mlp": 1.01793742, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.858663551060532, + "language_loss": 0.66351855, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68451184, + "num_input_tokens_seen": 331776995, + "step": 15376, + "time_per_iteration": 2.7002079486846924 + }, + { + "auxiliary_loss_clip": 0.01023027, + "auxiliary_loss_mlp": 0.00997205, + "balance_loss_clip": 1.00317633, + "balance_loss_mlp": 0.99618572, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.7085665792065701, + "language_loss": 0.61134815, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63155043, + "num_input_tokens_seen": 331845015, + "step": 15377, + "time_per_iteration": 3.103891611099243 + }, + { + "auxiliary_loss_clip": 0.01054208, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.02969384, + "balance_loss_mlp": 1.02342546, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 1.8021635218739966, + "language_loss": 0.74112999, + "learning_rate": 5.933424178131341e-08, + "loss": 0.76201296, + "num_input_tokens_seen": 331862795, + "step": 15378, + "time_per_iteration": 2.652231454849243 + }, + { + "auxiliary_loss_clip": 0.01099672, + "auxiliary_loss_mlp": 0.01029787, + "balance_loss_clip": 1.03542376, + "balance_loss_mlp": 1.01796126, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 2.6683113314722147, + "language_loss": 0.62410372, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64539838, + "num_input_tokens_seen": 331882535, + "step": 15379, + "time_per_iteration": 2.6510961055755615 + }, + { + "auxiliary_loss_clip": 0.01011174, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.0275656, + "balance_loss_mlp": 1.01992321, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 2.1276694549872084, + "language_loss": 0.83738953, + "learning_rate": 5.914606645688591e-08, + "loss": 0.85782194, + "num_input_tokens_seen": 331899335, + "step": 15380, + "time_per_iteration": 2.8180456161499023 + }, + { + "auxiliary_loss_clip": 0.01098412, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.03254831, + "balance_loss_mlp": 1.01797783, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.4583247577775251, + "language_loss": 0.73243445, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75371605, + "num_input_tokens_seen": 331919030, + "step": 15381, + "time_per_iteration": 2.971024751663208 + }, + { + "auxiliary_loss_clip": 0.01082248, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.03480542, + "balance_loss_mlp": 1.01875043, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.9321456470973168, + "language_loss": 0.78368258, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80479687, + "num_input_tokens_seen": 331936465, + "step": 15382, + "time_per_iteration": 2.561469793319702 + }, + { + "auxiliary_loss_clip": 0.01073502, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.0308677, + "balance_loss_mlp": 1.02408791, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 3.852442873965433, + "language_loss": 0.74749339, + "learning_rate": 5.886435545946455e-08, + "loss": 0.76858288, + "num_input_tokens_seen": 331954625, + "step": 15383, + "time_per_iteration": 4.20122218132019 + }, + { + "auxiliary_loss_clip": 0.01073817, + "auxiliary_loss_mlp": 0.01026393, + "balance_loss_clip": 1.0311842, + "balance_loss_mlp": 1.01611757, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.5446571394597386, + "language_loss": 0.75606894, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77707106, + "num_input_tokens_seen": 331975865, + "step": 15384, + "time_per_iteration": 2.6262946128845215 + }, + { + "auxiliary_loss_clip": 0.01065139, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.03250885, + "balance_loss_mlp": 1.02052021, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 1.8678327044787413, + "language_loss": 0.6655466, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68650931, + "num_input_tokens_seen": 331992760, + "step": 15385, + "time_per_iteration": 2.578977584838867 + }, + { + "auxiliary_loss_clip": 0.01096421, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.03326631, + "balance_loss_mlp": 1.02277696, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 2.084185013509946, + "language_loss": 0.8049373, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82623506, + "num_input_tokens_seen": 332011890, + "step": 15386, + "time_per_iteration": 2.517859935760498 + }, + { + "auxiliary_loss_clip": 0.01083393, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.03250813, + "balance_loss_mlp": 1.0275315, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 2.1213842433468546, + "language_loss": 0.75540096, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77662307, + "num_input_tokens_seen": 332029485, + "step": 15387, + "time_per_iteration": 2.5672810077667236 + }, + { + "auxiliary_loss_clip": 0.01081387, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.03274035, + "balance_loss_mlp": 1.01992869, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.4463729888057861, + "language_loss": 0.69896573, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72008091, + "num_input_tokens_seen": 332052970, + "step": 15388, + "time_per_iteration": 2.6875686645507812 + }, + { + "auxiliary_loss_clip": 0.01087545, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.03333116, + "balance_loss_mlp": 1.01675081, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.6842276080483474, + "language_loss": 0.81947958, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84063101, + "num_input_tokens_seen": 332070395, + "step": 15389, + "time_per_iteration": 4.141648769378662 + }, + { + "auxiliary_loss_clip": 0.010903, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.03349745, + "balance_loss_mlp": 1.02195108, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 2.225812733342042, + "language_loss": 0.79034734, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81159085, + "num_input_tokens_seen": 332090185, + "step": 15390, + "time_per_iteration": 2.540179491043091 + }, + { + "auxiliary_loss_clip": 0.01062093, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.03175735, + "balance_loss_mlp": 1.02284992, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 2.0454901593509627, + "language_loss": 0.75456178, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77552891, + "num_input_tokens_seen": 332109050, + "step": 15391, + "time_per_iteration": 2.626910448074341 + }, + { + "auxiliary_loss_clip": 0.01073765, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.03132474, + "balance_loss_mlp": 1.01951289, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 3.1372876825565457, + "language_loss": 0.51969469, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54075128, + "num_input_tokens_seen": 332131180, + "step": 15392, + "time_per_iteration": 2.680866241455078 + }, + { + "auxiliary_loss_clip": 0.01096447, + "auxiliary_loss_mlp": 0.0102457, + "balance_loss_clip": 1.03230071, + "balance_loss_mlp": 1.01441979, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.7669338756639603, + "language_loss": 0.77248269, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79369289, + "num_input_tokens_seen": 332149555, + "step": 15393, + "time_per_iteration": 2.5405848026275635 + }, + { + "auxiliary_loss_clip": 0.01064328, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.03075814, + "balance_loss_mlp": 1.01956534, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 1.866533160286052, + "language_loss": 0.69664007, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71758652, + "num_input_tokens_seen": 332165830, + "step": 15394, + "time_per_iteration": 2.5483858585357666 + }, + { + "auxiliary_loss_clip": 0.01098499, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.03453493, + "balance_loss_mlp": 1.01463318, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.6068866096478518, + "language_loss": 0.72982395, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75106454, + "num_input_tokens_seen": 332185130, + "step": 15395, + "time_per_iteration": 2.479480266571045 + }, + { + "auxiliary_loss_clip": 0.01045184, + "auxiliary_loss_mlp": 0.01027946, + "balance_loss_clip": 1.0290252, + "balance_loss_mlp": 1.01800358, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 1.9282675717391864, + "language_loss": 0.71851897, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.7392503, + "num_input_tokens_seen": 332203695, + "step": 15396, + "time_per_iteration": 2.6559369564056396 + }, + { + "auxiliary_loss_clip": 0.01095966, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.03392768, + "balance_loss_mlp": 1.01669073, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.7029061635238285, + "language_loss": 0.87698603, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89822346, + "num_input_tokens_seen": 332224850, + "step": 15397, + "time_per_iteration": 2.5593783855438232 + }, + { + "auxiliary_loss_clip": 0.01022749, + "auxiliary_loss_mlp": 0.01000896, + "balance_loss_clip": 1.00288594, + "balance_loss_mlp": 0.99996072, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.8104986103551882, + "language_loss": 0.55192542, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57216185, + "num_input_tokens_seen": 332278085, + "step": 15398, + "time_per_iteration": 4.367262601852417 + }, + { + "auxiliary_loss_clip": 0.01075387, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.03133547, + "balance_loss_mlp": 1.01914728, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 1.978503902696574, + "language_loss": 0.76332331, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78440249, + "num_input_tokens_seen": 332297875, + "step": 15399, + "time_per_iteration": 2.6261305809020996 + }, + { + "auxiliary_loss_clip": 0.01062509, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.02786779, + "balance_loss_mlp": 1.01826346, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.663297398202524, + "language_loss": 0.78255928, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80346501, + "num_input_tokens_seen": 332318500, + "step": 15400, + "time_per_iteration": 2.5861949920654297 + }, + { + "auxiliary_loss_clip": 0.01014208, + "auxiliary_loss_mlp": 0.01001174, + "balance_loss_clip": 1.00444698, + "balance_loss_mlp": 1.00016069, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7321376404973272, + "language_loss": 0.51323509, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53338891, + "num_input_tokens_seen": 332381980, + "step": 15401, + "time_per_iteration": 3.0596208572387695 + }, + { + "auxiliary_loss_clip": 0.01082732, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.03319776, + "balance_loss_mlp": 1.0206008, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.767940336722792, + "language_loss": 0.82326853, + "learning_rate": 5.709557384259378e-08, + "loss": 0.84440434, + "num_input_tokens_seen": 332399510, + "step": 15402, + "time_per_iteration": 2.5623068809509277 + }, + { + "auxiliary_loss_clip": 0.01022926, + "auxiliary_loss_mlp": 0.01001804, + "balance_loss_clip": 1.0031631, + "balance_loss_mlp": 1.00083816, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7333831334664555, + "language_loss": 0.5107879, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53103518, + "num_input_tokens_seen": 332459130, + "step": 15403, + "time_per_iteration": 3.1465606689453125 + }, + { + "auxiliary_loss_clip": 0.0100349, + "auxiliary_loss_mlp": 0.01004015, + "balance_loss_clip": 1.00414133, + "balance_loss_mlp": 1.00315714, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6952947605238491, + "language_loss": 0.58804011, + "learning_rate": 5.69109330631965e-08, + "loss": 0.6081152, + "num_input_tokens_seen": 332526555, + "step": 15404, + "time_per_iteration": 3.211110830307007 + }, + { + "auxiliary_loss_clip": 0.01071807, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.03311467, + "balance_loss_mlp": 1.01895416, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 2.020806000888589, + "language_loss": 0.71856588, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73958516, + "num_input_tokens_seen": 332544005, + "step": 15405, + "time_per_iteration": 2.5535166263580322 + }, + { + "auxiliary_loss_clip": 0.01051871, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.03220677, + "balance_loss_mlp": 1.02662873, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.7085539949532065, + "language_loss": 0.68488705, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70579189, + "num_input_tokens_seen": 332563070, + "step": 15406, + "time_per_iteration": 2.6030406951904297 + }, + { + "auxiliary_loss_clip": 0.01042605, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_clip": 1.02892876, + "balance_loss_mlp": 1.02340937, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.8076050890615294, + "language_loss": 0.76837456, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78916973, + "num_input_tokens_seen": 332579620, + "step": 15407, + "time_per_iteration": 2.6155710220336914 + }, + { + "auxiliary_loss_clip": 0.01063258, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.02895379, + "balance_loss_mlp": 1.02461553, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 2.9475654812680445, + "language_loss": 0.72852653, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74952513, + "num_input_tokens_seen": 332597795, + "step": 15408, + "time_per_iteration": 2.609577178955078 + }, + { + "auxiliary_loss_clip": 0.01063706, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.03036547, + "balance_loss_mlp": 1.01897931, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 2.111956538751035, + "language_loss": 0.68479818, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70572227, + "num_input_tokens_seen": 332620375, + "step": 15409, + "time_per_iteration": 2.8199379444122314 + }, + { + "auxiliary_loss_clip": 0.01065395, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.03473306, + "balance_loss_mlp": 1.01850927, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 1.923018089061312, + "language_loss": 0.75385427, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77481335, + "num_input_tokens_seen": 332639510, + "step": 15410, + "time_per_iteration": 2.737248659133911 + }, + { + "auxiliary_loss_clip": 0.01046909, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.03268218, + "balance_loss_mlp": 1.01943016, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.661619902107034, + "language_loss": 0.81954443, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84032071, + "num_input_tokens_seen": 332658350, + "step": 15411, + "time_per_iteration": 2.6615419387817383 + }, + { + "auxiliary_loss_clip": 0.01071172, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.03459466, + "balance_loss_mlp": 1.02270317, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 1.9308110860267838, + "language_loss": 0.75288939, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77393818, + "num_input_tokens_seen": 332676715, + "step": 15412, + "time_per_iteration": 2.599482774734497 + }, + { + "auxiliary_loss_clip": 0.01094771, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.03162169, + "balance_loss_mlp": 1.01674485, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 1.8247051519526487, + "language_loss": 0.66594064, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.68716258, + "num_input_tokens_seen": 332701470, + "step": 15413, + "time_per_iteration": 2.608036994934082 + }, + { + "auxiliary_loss_clip": 0.01036451, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.03025496, + "balance_loss_mlp": 1.02057815, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.6687690921236367, + "language_loss": 0.75780809, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.77849817, + "num_input_tokens_seen": 332719060, + "step": 15414, + "time_per_iteration": 2.6577537059783936 + }, + { + "auxiliary_loss_clip": 0.01084423, + "auxiliary_loss_mlp": 0.01029307, + "balance_loss_clip": 1.03506184, + "balance_loss_mlp": 1.01874518, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 2.1260906457842164, + "language_loss": 0.81669033, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83782768, + "num_input_tokens_seen": 332736345, + "step": 15415, + "time_per_iteration": 4.007412910461426 + }, + { + "auxiliary_loss_clip": 0.0107377, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.03009939, + "balance_loss_mlp": 1.02148259, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.3331038214262616, + "language_loss": 0.54234755, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56341684, + "num_input_tokens_seen": 332756270, + "step": 15416, + "time_per_iteration": 2.603729724884033 + }, + { + "auxiliary_loss_clip": 0.01053908, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.03075063, + "balance_loss_mlp": 1.01988387, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.4717317460189245, + "language_loss": 0.71876037, + "learning_rate": 5.571795325221807e-08, + "loss": 0.73960447, + "num_input_tokens_seen": 332775185, + "step": 15417, + "time_per_iteration": 2.60044264793396 + }, + { + "auxiliary_loss_clip": 0.01082632, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.03364038, + "balance_loss_mlp": 1.01751888, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 5.362184975318964, + "language_loss": 0.75678003, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77789354, + "num_input_tokens_seen": 332794320, + "step": 15418, + "time_per_iteration": 2.562145471572876 + }, + { + "auxiliary_loss_clip": 0.0108483, + "auxiliary_loss_mlp": 0.01025855, + "balance_loss_clip": 1.03223467, + "balance_loss_mlp": 1.01497734, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 2.9176721508076007, + "language_loss": 0.76339841, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78450531, + "num_input_tokens_seen": 332818095, + "step": 15419, + "time_per_iteration": 2.623953342437744 + }, + { + "auxiliary_loss_clip": 0.01092328, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.03162646, + "balance_loss_mlp": 1.02021408, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 4.3100615042332, + "language_loss": 0.76048136, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78170824, + "num_input_tokens_seen": 332839860, + "step": 15420, + "time_per_iteration": 2.564342975616455 + }, + { + "auxiliary_loss_clip": 0.01087924, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.03231597, + "balance_loss_mlp": 1.01895356, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.4999065463298484, + "language_loss": 0.76735711, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78854036, + "num_input_tokens_seen": 332861155, + "step": 15421, + "time_per_iteration": 2.5687944889068604 + }, + { + "auxiliary_loss_clip": 0.01070859, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.03350878, + "balance_loss_mlp": 1.02044904, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 2.0076592861715237, + "language_loss": 0.72849399, + "learning_rate": 5.526243217829041e-08, + "loss": 0.74951452, + "num_input_tokens_seen": 332881110, + "step": 15422, + "time_per_iteration": 2.6344873905181885 + }, + { + "auxiliary_loss_clip": 0.01087348, + "auxiliary_loss_mlp": 0.01037435, + "balance_loss_clip": 1.03297555, + "balance_loss_mlp": 1.02559793, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 2.071354277454767, + "language_loss": 0.77324742, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79449528, + "num_input_tokens_seen": 332899350, + "step": 15423, + "time_per_iteration": 4.144218683242798 + }, + { + "auxiliary_loss_clip": 0.01087899, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.03305292, + "balance_loss_mlp": 1.01851106, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 1.7563592393486551, + "language_loss": 0.75718927, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77836835, + "num_input_tokens_seen": 332918105, + "step": 15424, + "time_per_iteration": 2.5559804439544678 + }, + { + "auxiliary_loss_clip": 0.010127, + "auxiliary_loss_mlp": 0.0100202, + "balance_loss_clip": 1.00245118, + "balance_loss_mlp": 1.00097132, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7776885174602421, + "language_loss": 0.60667169, + "learning_rate": 5.499000444202351e-08, + "loss": 0.6268189, + "num_input_tokens_seen": 332969490, + "step": 15425, + "time_per_iteration": 2.881218433380127 + }, + { + "auxiliary_loss_clip": 0.01076253, + "auxiliary_loss_mlp": 0.00749273, + "balance_loss_clip": 1.03423548, + "balance_loss_mlp": 1.00019574, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.4474175897226063, + "language_loss": 0.70960462, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72785991, + "num_input_tokens_seen": 332988805, + "step": 15426, + "time_per_iteration": 2.664888620376587 + }, + { + "auxiliary_loss_clip": 0.01074032, + "auxiliary_loss_mlp": 0.01026842, + "balance_loss_clip": 1.03349543, + "balance_loss_mlp": 1.01660168, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 3.282909961318447, + "language_loss": 0.83101451, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85202324, + "num_input_tokens_seen": 333007960, + "step": 15427, + "time_per_iteration": 2.5365586280822754 + }, + { + "auxiliary_loss_clip": 0.01061232, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.03262043, + "balance_loss_mlp": 1.02096629, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.6137543301580208, + "language_loss": 0.77024388, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79117489, + "num_input_tokens_seen": 333026035, + "step": 15428, + "time_per_iteration": 2.6545562744140625 + }, + { + "auxiliary_loss_clip": 0.01054608, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.02892172, + "balance_loss_mlp": 1.02090621, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 2.0032516220794583, + "language_loss": 0.74486279, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76573551, + "num_input_tokens_seen": 333045590, + "step": 15429, + "time_per_iteration": 4.099015712738037 + }, + { + "auxiliary_loss_clip": 0.01056952, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.02992892, + "balance_loss_mlp": 1.0195595, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 2.0174603938915636, + "language_loss": 0.74556082, + "learning_rate": 5.45374333601647e-08, + "loss": 0.76642895, + "num_input_tokens_seen": 333063355, + "step": 15430, + "time_per_iteration": 2.581882953643799 + }, + { + "auxiliary_loss_clip": 0.01085077, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.03154016, + "balance_loss_mlp": 1.01829004, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.3300210199467508, + "language_loss": 0.76615477, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78730512, + "num_input_tokens_seen": 333088045, + "step": 15431, + "time_per_iteration": 2.6465234756469727 + }, + { + "auxiliary_loss_clip": 0.01082811, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.03257823, + "balance_loss_mlp": 1.01898503, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.4945887797816881, + "language_loss": 0.70479918, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72591925, + "num_input_tokens_seen": 333108005, + "step": 15432, + "time_per_iteration": 2.558823585510254 + }, + { + "auxiliary_loss_clip": 0.01050306, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.03234291, + "balance_loss_mlp": 1.01993787, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.3900574909954235, + "language_loss": 0.82294089, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84374887, + "num_input_tokens_seen": 333124335, + "step": 15433, + "time_per_iteration": 2.6864638328552246 + }, + { + "auxiliary_loss_clip": 0.01094963, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.03455544, + "balance_loss_mlp": 1.01753187, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 1.8138135064401326, + "language_loss": 0.66345209, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68467587, + "num_input_tokens_seen": 333143995, + "step": 15434, + "time_per_iteration": 2.5858309268951416 + }, + { + "auxiliary_loss_clip": 0.01065008, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.030406, + "balance_loss_mlp": 1.01653111, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.6538006360858495, + "language_loss": 0.69011927, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71103907, + "num_input_tokens_seen": 333162805, + "step": 15435, + "time_per_iteration": 2.6219706535339355 + }, + { + "auxiliary_loss_clip": 0.01097601, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.03359902, + "balance_loss_mlp": 1.01787603, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 2.43409939206484, + "language_loss": 0.71914631, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74041241, + "num_input_tokens_seen": 333175770, + "step": 15436, + "time_per_iteration": 2.521183490753174 + }, + { + "auxiliary_loss_clip": 0.01071618, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.03183842, + "balance_loss_mlp": 1.019243, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 9.140610528868535, + "language_loss": 0.67129934, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69232076, + "num_input_tokens_seen": 333194775, + "step": 15437, + "time_per_iteration": 2.6005196571350098 + }, + { + "auxiliary_loss_clip": 0.01086216, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.03367257, + "balance_loss_mlp": 1.01884437, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.143139260565581, + "language_loss": 0.71189165, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73306203, + "num_input_tokens_seen": 333208920, + "step": 15438, + "time_per_iteration": 3.947190523147583 + }, + { + "auxiliary_loss_clip": 0.01097242, + "auxiliary_loss_mlp": 0.0102461, + "balance_loss_clip": 1.03399193, + "balance_loss_mlp": 1.01325536, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.679157134793998, + "language_loss": 0.6457051, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66692358, + "num_input_tokens_seen": 333229350, + "step": 15439, + "time_per_iteration": 2.518526077270508 + }, + { + "auxiliary_loss_clip": 0.01071998, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.03014851, + "balance_loss_mlp": 1.01988673, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 1.7564946216835908, + "language_loss": 0.70482969, + "learning_rate": 5.363782453347876e-08, + "loss": 0.72585797, + "num_input_tokens_seen": 333246125, + "step": 15440, + "time_per_iteration": 2.5978448390960693 + }, + { + "auxiliary_loss_clip": 0.01059079, + "auxiliary_loss_mlp": 0.00749467, + "balance_loss_clip": 1.03190899, + "balance_loss_mlp": 1.00016928, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.68579569145552, + "language_loss": 0.76844072, + "learning_rate": 5.354826952900682e-08, + "loss": 0.7865262, + "num_input_tokens_seen": 333263685, + "step": 15441, + "time_per_iteration": 2.6260201930999756 + }, + { + "auxiliary_loss_clip": 0.01077391, + "auxiliary_loss_mlp": 0.01025665, + "balance_loss_clip": 1.03163445, + "balance_loss_mlp": 1.01653361, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 2.03972327239352, + "language_loss": 0.64510238, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66613293, + "num_input_tokens_seen": 333282435, + "step": 15442, + "time_per_iteration": 2.534111738204956 + }, + { + "auxiliary_loss_clip": 0.0104944, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.03071046, + "balance_loss_mlp": 1.02368975, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 1.827689434061213, + "language_loss": 0.80938351, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.83024371, + "num_input_tokens_seen": 333300400, + "step": 15443, + "time_per_iteration": 2.5364999771118164 + }, + { + "auxiliary_loss_clip": 0.01086168, + "auxiliary_loss_mlp": 0.00749359, + "balance_loss_clip": 1.03253305, + "balance_loss_mlp": 1.00029624, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 2.5165226595272188, + "language_loss": 0.65541565, + "learning_rate": 5.328004738702896e-08, + "loss": 0.6737709, + "num_input_tokens_seen": 333318980, + "step": 15444, + "time_per_iteration": 2.60491943359375 + }, + { + "auxiliary_loss_clip": 0.01052435, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.03013349, + "balance_loss_mlp": 1.01734447, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 2.0212323224181827, + "language_loss": 0.73372984, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.75453711, + "num_input_tokens_seen": 333334135, + "step": 15445, + "time_per_iteration": 2.5669143199920654 + }, + { + "auxiliary_loss_clip": 0.01078993, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.03270292, + "balance_loss_mlp": 1.02144742, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.7571086932535693, + "language_loss": 0.71351504, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73463953, + "num_input_tokens_seen": 333353325, + "step": 15446, + "time_per_iteration": 2.5754783153533936 + }, + { + "auxiliary_loss_clip": 0.01037734, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.03214502, + "balance_loss_mlp": 1.01414895, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.7473104438294582, + "language_loss": 0.69457257, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71521336, + "num_input_tokens_seen": 333371110, + "step": 15447, + "time_per_iteration": 2.651549816131592 + }, + { + "auxiliary_loss_clip": 0.01090276, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.03181744, + "balance_loss_mlp": 1.0170567, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 2.3128355217417864, + "language_loss": 0.7237339, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74490625, + "num_input_tokens_seen": 333391420, + "step": 15448, + "time_per_iteration": 2.6148910522460938 + }, + { + "auxiliary_loss_clip": 0.01096104, + "auxiliary_loss_mlp": 0.01027766, + "balance_loss_clip": 1.03292406, + "balance_loss_mlp": 1.01582718, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.5630367529308173, + "language_loss": 0.74164653, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76288527, + "num_input_tokens_seen": 333410365, + "step": 15449, + "time_per_iteration": 2.506925106048584 + }, + { + "auxiliary_loss_clip": 0.01097542, + "auxiliary_loss_mlp": 0.00749314, + "balance_loss_clip": 1.03365016, + "balance_loss_mlp": 1.00024831, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 2.052607564755346, + "language_loss": 0.67580795, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69427651, + "num_input_tokens_seen": 333430000, + "step": 15450, + "time_per_iteration": 2.59234356880188 + }, + { + "auxiliary_loss_clip": 0.01062934, + "auxiliary_loss_mlp": 0.01024333, + "balance_loss_clip": 1.03053486, + "balance_loss_mlp": 1.01316261, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.7134954878149644, + "language_loss": 0.71879578, + "learning_rate": 5.265677957368875e-08, + "loss": 0.73966843, + "num_input_tokens_seen": 333445800, + "step": 15451, + "time_per_iteration": 2.6956288814544678 + }, + { + "auxiliary_loss_clip": 0.01068459, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.03053641, + "balance_loss_mlp": 1.0306344, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 2.152598601639699, + "language_loss": 0.73316336, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75427681, + "num_input_tokens_seen": 333461550, + "step": 15452, + "time_per_iteration": 2.5951902866363525 + }, + { + "auxiliary_loss_clip": 0.01077889, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.03537607, + "balance_loss_mlp": 1.01669097, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.8165127411012434, + "language_loss": 0.74043041, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.7614888, + "num_input_tokens_seen": 333478835, + "step": 15453, + "time_per_iteration": 2.547034740447998 + }, + { + "auxiliary_loss_clip": 0.00984918, + "auxiliary_loss_mlp": 0.01004543, + "balance_loss_clip": 1.0061872, + "balance_loss_mlp": 1.00364923, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.9942044544367273, + "language_loss": 0.60665393, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62654853, + "num_input_tokens_seen": 333535250, + "step": 15454, + "time_per_iteration": 3.037416934967041 + }, + { + "auxiliary_loss_clip": 0.01068578, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.02932072, + "balance_loss_mlp": 1.02923751, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 1.6076488492149752, + "language_loss": 0.68873322, + "learning_rate": 5.230225101914709e-08, + "loss": 0.70983624, + "num_input_tokens_seen": 333553805, + "step": 15455, + "time_per_iteration": 4.106611967086792 + }, + { + "auxiliary_loss_clip": 0.01051128, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.03330016, + "balance_loss_mlp": 1.02066767, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.7368056776761243, + "language_loss": 0.64702845, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66786289, + "num_input_tokens_seen": 333572800, + "step": 15456, + "time_per_iteration": 2.7197465896606445 + }, + { + "auxiliary_loss_clip": 0.01048083, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.02991414, + "balance_loss_mlp": 1.01603615, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 1.8237819489512288, + "language_loss": 0.68029976, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70105457, + "num_input_tokens_seen": 333588520, + "step": 15457, + "time_per_iteration": 2.6338744163513184 + }, + { + "auxiliary_loss_clip": 0.01074343, + "auxiliary_loss_mlp": 0.01027345, + "balance_loss_clip": 1.03123307, + "balance_loss_mlp": 1.01711679, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 2.269552433691733, + "language_loss": 0.81131816, + "learning_rate": 5.203713008885291e-08, + "loss": 0.83233505, + "num_input_tokens_seen": 333603435, + "step": 15458, + "time_per_iteration": 2.576533317565918 + }, + { + "auxiliary_loss_clip": 0.01088716, + "auxiliary_loss_mlp": 0.01034867, + "balance_loss_clip": 1.03374302, + "balance_loss_mlp": 1.02419162, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.6662167448076664, + "language_loss": 0.72242749, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74366325, + "num_input_tokens_seen": 333623305, + "step": 15459, + "time_per_iteration": 2.550450563430786 + }, + { + "auxiliary_loss_clip": 0.01055604, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.03026891, + "balance_loss_mlp": 1.01993537, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 2.4348255717862264, + "language_loss": 0.58597642, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.60684538, + "num_input_tokens_seen": 333641205, + "step": 15460, + "time_per_iteration": 2.641896963119507 + }, + { + "auxiliary_loss_clip": 0.01043102, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.02895117, + "balance_loss_mlp": 1.02140021, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 2.1893018212971738, + "language_loss": 0.80258048, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82334507, + "num_input_tokens_seen": 333659615, + "step": 15461, + "time_per_iteration": 2.8303611278533936 + }, + { + "auxiliary_loss_clip": 0.01072631, + "auxiliary_loss_mlp": 0.01026312, + "balance_loss_clip": 1.03169155, + "balance_loss_mlp": 1.0153389, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 1.7965751830663181, + "language_loss": 0.78413188, + "learning_rate": 5.168466966796869e-08, + "loss": 0.8051213, + "num_input_tokens_seen": 333678985, + "step": 15462, + "time_per_iteration": 2.675344944000244 + }, + { + "auxiliary_loss_clip": 0.01047514, + "auxiliary_loss_mlp": 0.0102482, + "balance_loss_clip": 1.02603579, + "balance_loss_mlp": 1.01307821, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 2.228725706673033, + "language_loss": 0.6270349, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64775825, + "num_input_tokens_seen": 333696410, + "step": 15463, + "time_per_iteration": 4.182676315307617 + }, + { + "auxiliary_loss_clip": 0.01068173, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.02837861, + "balance_loss_mlp": 1.01723576, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.362399989796759, + "language_loss": 0.70647395, + "learning_rate": 5.15088827260437e-08, + "loss": 0.7274338, + "num_input_tokens_seen": 333716615, + "step": 15464, + "time_per_iteration": 2.6916401386260986 + }, + { + "auxiliary_loss_clip": 0.01067593, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.02980065, + "balance_loss_mlp": 1.01721895, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 2.1830360456102524, + "language_loss": 0.77373278, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79469043, + "num_input_tokens_seen": 333732800, + "step": 15465, + "time_per_iteration": 2.618095874786377 + }, + { + "auxiliary_loss_clip": 0.00975812, + "auxiliary_loss_mlp": 0.01001715, + "balance_loss_clip": 1.0078249, + "balance_loss_mlp": 1.00081539, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6913507360690576, + "language_loss": 0.56381685, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58359212, + "num_input_tokens_seen": 333799300, + "step": 15466, + "time_per_iteration": 3.4737985134124756 + }, + { + "auxiliary_loss_clip": 0.01072888, + "auxiliary_loss_mlp": 0.01036671, + "balance_loss_clip": 1.03024793, + "balance_loss_mlp": 1.02460694, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.531185746693475, + "language_loss": 0.72651327, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.7476089, + "num_input_tokens_seen": 333820360, + "step": 15467, + "time_per_iteration": 3.049593687057495 + }, + { + "auxiliary_loss_clip": 0.01064177, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.03030348, + "balance_loss_mlp": 1.01862347, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.9710092367254042, + "language_loss": 0.71650451, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.73744726, + "num_input_tokens_seen": 333840415, + "step": 15468, + "time_per_iteration": 2.6673953533172607 + }, + { + "auxiliary_loss_clip": 0.01080133, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.03021252, + "balance_loss_mlp": 1.02055717, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.7506457426361237, + "language_loss": 0.75337493, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77450848, + "num_input_tokens_seen": 333859910, + "step": 15469, + "time_per_iteration": 4.10434889793396 + }, + { + "auxiliary_loss_clip": 0.0106996, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.03289449, + "balance_loss_mlp": 1.02236462, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 2.3797719485960025, + "language_loss": 0.75630999, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77734208, + "num_input_tokens_seen": 333880495, + "step": 15470, + "time_per_iteration": 2.6402950286865234 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01028917, + "balance_loss_clip": 1.03330564, + "balance_loss_mlp": 1.01891518, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.745363533059206, + "language_loss": 0.74888408, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76973403, + "num_input_tokens_seen": 333897640, + "step": 15471, + "time_per_iteration": 2.6526708602905273 + }, + { + "auxiliary_loss_clip": 0.01083512, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.03299403, + "balance_loss_mlp": 1.0166713, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 2.377818448934212, + "language_loss": 0.6908412, + "learning_rate": 5.080869070341487e-08, + "loss": 0.71195155, + "num_input_tokens_seen": 333913670, + "step": 15472, + "time_per_iteration": 2.530263900756836 + }, + { + "auxiliary_loss_clip": 0.01064302, + "auxiliary_loss_mlp": 0.01028398, + "balance_loss_clip": 1.02998984, + "balance_loss_mlp": 1.01834321, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 2.298438619131311, + "language_loss": 0.88408726, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90501428, + "num_input_tokens_seen": 333934105, + "step": 15473, + "time_per_iteration": 2.560905694961548 + }, + { + "auxiliary_loss_clip": 0.01076378, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.03367579, + "balance_loss_mlp": 1.02065468, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 2.2409044068724695, + "language_loss": 0.64378476, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66487968, + "num_input_tokens_seen": 333953635, + "step": 15474, + "time_per_iteration": 2.612403392791748 + }, + { + "auxiliary_loss_clip": 0.01097663, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.03393662, + "balance_loss_mlp": 1.01954675, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.8595529137846996, + "language_loss": 0.7496556, + "learning_rate": 5.054733817702339e-08, + "loss": 0.77093637, + "num_input_tokens_seen": 333971825, + "step": 15475, + "time_per_iteration": 2.486618995666504 + }, + { + "auxiliary_loss_clip": 0.01080345, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.03080678, + "balance_loss_mlp": 1.01594031, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 2.903458876201551, + "language_loss": 0.6662786, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68734837, + "num_input_tokens_seen": 333990120, + "step": 15476, + "time_per_iteration": 2.6314799785614014 + }, + { + "auxiliary_loss_clip": 0.01054223, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.03268957, + "balance_loss_mlp": 1.02386904, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 2.0591172595823974, + "language_loss": 0.68624306, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.70713705, + "num_input_tokens_seen": 334007970, + "step": 15477, + "time_per_iteration": 2.730715274810791 + }, + { + "auxiliary_loss_clip": 0.01075906, + "auxiliary_loss_mlp": 0.01027973, + "balance_loss_clip": 1.03423309, + "balance_loss_mlp": 1.01747656, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 1.8732195324825962, + "language_loss": 0.5837574, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60479611, + "num_input_tokens_seen": 334027120, + "step": 15478, + "time_per_iteration": 4.502750396728516 + }, + { + "auxiliary_loss_clip": 0.01075916, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.03455675, + "balance_loss_mlp": 1.01690793, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 2.0030073598783837, + "language_loss": 0.78877258, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80981994, + "num_input_tokens_seen": 334042785, + "step": 15479, + "time_per_iteration": 2.628451108932495 + }, + { + "auxiliary_loss_clip": 0.01096542, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.03305471, + "balance_loss_mlp": 1.02038348, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 1.586814172130091, + "language_loss": 0.68629634, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70756984, + "num_input_tokens_seen": 334063480, + "step": 15480, + "time_per_iteration": 2.589418411254883 + }, + { + "auxiliary_loss_clip": 0.0109704, + "auxiliary_loss_mlp": 0.01032884, + "balance_loss_clip": 1.03342962, + "balance_loss_mlp": 1.02170777, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.7530552919069626, + "language_loss": 0.67736983, + "learning_rate": 5.002662914604583e-08, + "loss": 0.69866908, + "num_input_tokens_seen": 334082005, + "step": 15481, + "time_per_iteration": 2.4947831630706787 + }, + { + "auxiliary_loss_clip": 0.01066056, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.02901912, + "balance_loss_mlp": 1.0180223, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.7956377640278687, + "language_loss": 0.74645442, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76740968, + "num_input_tokens_seen": 334101375, + "step": 15482, + "time_per_iteration": 2.6060686111450195 + }, + { + "auxiliary_loss_clip": 0.01083172, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.03093195, + "balance_loss_mlp": 1.01774812, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 1.9851155224075367, + "language_loss": 0.79898095, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82009423, + "num_input_tokens_seen": 334119460, + "step": 15483, + "time_per_iteration": 2.5197432041168213 + }, + { + "auxiliary_loss_clip": 0.01072821, + "auxiliary_loss_mlp": 0.01028695, + "balance_loss_clip": 1.03329396, + "balance_loss_mlp": 1.01744771, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 2.6608170754270786, + "language_loss": 0.74521428, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76622939, + "num_input_tokens_seen": 334136065, + "step": 15484, + "time_per_iteration": 2.530308485031128 + }, + { + "auxiliary_loss_clip": 0.01077533, + "auxiliary_loss_mlp": 0.01028697, + "balance_loss_clip": 1.03538203, + "balance_loss_mlp": 1.01764607, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.4777263921189157, + "language_loss": 0.76590341, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78696573, + "num_input_tokens_seen": 334153690, + "step": 15485, + "time_per_iteration": 2.567244291305542 + }, + { + "auxiliary_loss_clip": 0.01048128, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.02989721, + "balance_loss_mlp": 1.01732564, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.8291324372506628, + "language_loss": 0.78364927, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80442625, + "num_input_tokens_seen": 334171880, + "step": 15486, + "time_per_iteration": 2.6623036861419678 + }, + { + "auxiliary_loss_clip": 0.01068029, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.03554487, + "balance_loss_mlp": 1.01742077, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 2.325657198700773, + "language_loss": 0.77612406, + "learning_rate": 4.950858206945674e-08, + "loss": 0.79709405, + "num_input_tokens_seen": 334190005, + "step": 15487, + "time_per_iteration": 2.616349935531616 + }, + { + "auxiliary_loss_clip": 0.01068908, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.03544211, + "balance_loss_mlp": 1.01522589, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.1133260290621565, + "language_loss": 0.6720596, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69301605, + "num_input_tokens_seen": 334209545, + "step": 15488, + "time_per_iteration": 2.7527143955230713 + }, + { + "auxiliary_loss_clip": 0.01071572, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.03171229, + "balance_loss_mlp": 1.01730275, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.6912464654677655, + "language_loss": 0.74711257, + "learning_rate": 4.933649137834983e-08, + "loss": 0.76811123, + "num_input_tokens_seen": 334228900, + "step": 15489, + "time_per_iteration": 2.5600132942199707 + }, + { + "auxiliary_loss_clip": 0.01097802, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.0331105, + "balance_loss_mlp": 1.01955986, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 14.317149095152914, + "language_loss": 0.80928016, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83056653, + "num_input_tokens_seen": 334245500, + "step": 15490, + "time_per_iteration": 2.571302890777588 + }, + { + "auxiliary_loss_clip": 0.01051227, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.03165388, + "balance_loss_mlp": 1.01678181, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 1.7195517342083, + "language_loss": 0.71612191, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.73691833, + "num_input_tokens_seen": 334264370, + "step": 15491, + "time_per_iteration": 2.68733549118042 + }, + { + "auxiliary_loss_clip": 0.01070034, + "auxiliary_loss_mlp": 0.00749171, + "balance_loss_clip": 1.02991128, + "balance_loss_mlp": 1.00022507, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 2.1915989937190905, + "language_loss": 0.74374211, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76193416, + "num_input_tokens_seen": 334283905, + "step": 15492, + "time_per_iteration": 2.642406940460205 + }, + { + "auxiliary_loss_clip": 0.01013235, + "auxiliary_loss_mlp": 0.0099959, + "balance_loss_clip": 1.0033716, + "balance_loss_mlp": 0.99866027, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.7183381181503852, + "language_loss": 0.5344283, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55455655, + "num_input_tokens_seen": 334339925, + "step": 15493, + "time_per_iteration": 3.0357506275177 + }, + { + "auxiliary_loss_clip": 0.01084096, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.03167462, + "balance_loss_mlp": 1.02071965, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.6685063847263588, + "language_loss": 0.70575345, + "learning_rate": 4.890755917128531e-08, + "loss": 0.72690785, + "num_input_tokens_seen": 334357225, + "step": 15494, + "time_per_iteration": 2.5981593132019043 + }, + { + "auxiliary_loss_clip": 0.0108013, + "auxiliary_loss_mlp": 0.01027133, + "balance_loss_clip": 1.03223145, + "balance_loss_mlp": 1.01590347, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.5108222845446821, + "language_loss": 0.68396807, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70504069, + "num_input_tokens_seen": 334375945, + "step": 15495, + "time_per_iteration": 4.078913927078247 + }, + { + "auxiliary_loss_clip": 0.01093103, + "auxiliary_loss_mlp": 0.01033659, + "balance_loss_clip": 1.03159535, + "balance_loss_mlp": 1.02377093, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 1.6872208858747193, + "language_loss": 0.61348283, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63475049, + "num_input_tokens_seen": 334395310, + "step": 15496, + "time_per_iteration": 2.5747885704040527 + }, + { + "auxiliary_loss_clip": 0.01087286, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.03311563, + "balance_loss_mlp": 1.0210886, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.8674300626841316, + "language_loss": 0.76867771, + "learning_rate": 4.865108764847825e-08, + "loss": 0.78987503, + "num_input_tokens_seen": 334416965, + "step": 15497, + "time_per_iteration": 2.661940574645996 + }, + { + "auxiliary_loss_clip": 0.01090351, + "auxiliary_loss_mlp": 0.00749358, + "balance_loss_clip": 1.03484297, + "balance_loss_mlp": 1.00022364, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.7771555670075638, + "language_loss": 0.66294569, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68134284, + "num_input_tokens_seen": 334435620, + "step": 15498, + "time_per_iteration": 2.6422314643859863 + }, + { + "auxiliary_loss_clip": 0.01076215, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.03335512, + "balance_loss_mlp": 1.02145839, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.7721592319901898, + "language_loss": 0.79937279, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82046342, + "num_input_tokens_seen": 334456210, + "step": 15499, + "time_per_iteration": 2.616672992706299 + }, + { + "auxiliary_loss_clip": 0.01051604, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.03123009, + "balance_loss_mlp": 1.02064443, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.6582162524030766, + "language_loss": 0.76537591, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.7862134, + "num_input_tokens_seen": 334475485, + "step": 15500, + "time_per_iteration": 2.6624066829681396 + }, + { + "auxiliary_loss_clip": 0.01064728, + "auxiliary_loss_mlp": 0.01024721, + "balance_loss_clip": 1.03284907, + "balance_loss_mlp": 1.0145818, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 2.1416768916022706, + "language_loss": 0.72096008, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74185455, + "num_input_tokens_seen": 334494740, + "step": 15501, + "time_per_iteration": 2.725233554840088 + }, + { + "auxiliary_loss_clip": 0.0109933, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.03322744, + "balance_loss_mlp": 1.02358413, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 2.223112679325594, + "language_loss": 0.66128314, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68262482, + "num_input_tokens_seen": 334511910, + "step": 15502, + "time_per_iteration": 2.5322370529174805 + }, + { + "auxiliary_loss_clip": 0.01088707, + "auxiliary_loss_mlp": 0.00749478, + "balance_loss_clip": 1.03410006, + "balance_loss_mlp": 1.00029278, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.3882509482276337, + "language_loss": 0.6575039, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67588574, + "num_input_tokens_seen": 334533150, + "step": 15503, + "time_per_iteration": 4.109589576721191 + }, + { + "auxiliary_loss_clip": 0.01054488, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.02808475, + "balance_loss_mlp": 1.02089047, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 3.555523288786428, + "language_loss": 0.75041223, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77128482, + "num_input_tokens_seen": 334550940, + "step": 15504, + "time_per_iteration": 2.644430160522461 + }, + { + "auxiliary_loss_clip": 0.01087376, + "auxiliary_loss_mlp": 0.00749367, + "balance_loss_clip": 1.03346729, + "balance_loss_mlp": 1.00022817, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 2.3582403116783275, + "language_loss": 0.70907962, + "learning_rate": 4.797041961982762e-08, + "loss": 0.72744709, + "num_input_tokens_seen": 334570935, + "step": 15505, + "time_per_iteration": 2.6036031246185303 + }, + { + "auxiliary_loss_clip": 0.01078404, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.03400445, + "balance_loss_mlp": 1.01784527, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 1.7964098770784958, + "language_loss": 0.75451666, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77559012, + "num_input_tokens_seen": 334589315, + "step": 15506, + "time_per_iteration": 2.574536085128784 + }, + { + "auxiliary_loss_clip": 0.01055934, + "auxiliary_loss_mlp": 0.0102412, + "balance_loss_clip": 1.03151011, + "balance_loss_mlp": 1.01358736, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 1.9197538150445088, + "language_loss": 0.83472073, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85552126, + "num_input_tokens_seen": 334608990, + "step": 15507, + "time_per_iteration": 2.6509246826171875 + }, + { + "auxiliary_loss_clip": 0.01098112, + "auxiliary_loss_mlp": 0.01027077, + "balance_loss_clip": 1.03337836, + "balance_loss_mlp": 1.0163188, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.4079486612507561, + "language_loss": 0.67758048, + "learning_rate": 4.771639036957742e-08, + "loss": 0.69883239, + "num_input_tokens_seen": 334628655, + "step": 15508, + "time_per_iteration": 4.082173585891724 + }, + { + "auxiliary_loss_clip": 0.01061145, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.03311849, + "balance_loss_mlp": 1.01730609, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.7290964019627257, + "language_loss": 0.72193849, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74283564, + "num_input_tokens_seen": 334648295, + "step": 15509, + "time_per_iteration": 2.651916265487671 + }, + { + "auxiliary_loss_clip": 0.01086324, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.03315282, + "balance_loss_mlp": 1.0182097, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 3.1669324471065945, + "language_loss": 0.74549711, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76664662, + "num_input_tokens_seen": 334666280, + "step": 15510, + "time_per_iteration": 2.5093190670013428 + }, + { + "auxiliary_loss_clip": 0.01085534, + "auxiliary_loss_mlp": 0.01025971, + "balance_loss_clip": 1.03063083, + "balance_loss_mlp": 1.01514041, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 1.662304550564779, + "language_loss": 0.70257646, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72369152, + "num_input_tokens_seen": 334688830, + "step": 15511, + "time_per_iteration": 2.5591790676116943 + }, + { + "auxiliary_loss_clip": 0.01067219, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.03083217, + "balance_loss_mlp": 1.02160621, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.8123436402665647, + "language_loss": 0.77989048, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80089182, + "num_input_tokens_seen": 334705205, + "step": 15512, + "time_per_iteration": 2.6184253692626953 + }, + { + "auxiliary_loss_clip": 0.01095561, + "auxiliary_loss_mlp": 0.01028644, + "balance_loss_clip": 1.03250313, + "balance_loss_mlp": 1.01730156, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 2.3459755690969186, + "language_loss": 0.8060708, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82731289, + "num_input_tokens_seen": 334723830, + "step": 15513, + "time_per_iteration": 2.530829668045044 + }, + { + "auxiliary_loss_clip": 0.01073541, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.03427458, + "balance_loss_mlp": 1.01787603, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 2.126082900963778, + "language_loss": 0.79989588, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82093364, + "num_input_tokens_seen": 334740825, + "step": 15514, + "time_per_iteration": 2.518969774246216 + }, + { + "auxiliary_loss_clip": 0.01062921, + "auxiliary_loss_mlp": 0.0103791, + "balance_loss_clip": 1.03279734, + "balance_loss_mlp": 1.02734852, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 2.3874610438988983, + "language_loss": 0.71528506, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.73629344, + "num_input_tokens_seen": 334765825, + "step": 15515, + "time_per_iteration": 2.7334330081939697 + }, + { + "auxiliary_loss_clip": 0.01080891, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.03419399, + "balance_loss_mlp": 1.01855981, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.4719978247698733, + "language_loss": 0.80768526, + "learning_rate": 4.704223662500806e-08, + "loss": 0.82879472, + "num_input_tokens_seen": 334782680, + "step": 15516, + "time_per_iteration": 2.5516629219055176 + }, + { + "auxiliary_loss_clip": 0.01054464, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.02925444, + "balance_loss_mlp": 1.0252254, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.6210600849295418, + "language_loss": 0.8061645, + "learning_rate": 4.695830062703643e-08, + "loss": 0.82708859, + "num_input_tokens_seen": 334800160, + "step": 15517, + "time_per_iteration": 4.057014226913452 + }, + { + "auxiliary_loss_clip": 0.01076762, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.03255415, + "balance_loss_mlp": 1.01926506, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 1.995750911335667, + "language_loss": 0.74582481, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76690114, + "num_input_tokens_seen": 334815840, + "step": 15518, + "time_per_iteration": 2.545503854751587 + }, + { + "auxiliary_loss_clip": 0.01068903, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.03019667, + "balance_loss_mlp": 1.01960802, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 1.956097739686633, + "language_loss": 0.75779289, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77878696, + "num_input_tokens_seen": 334834735, + "step": 15519, + "time_per_iteration": 2.608497142791748 + }, + { + "auxiliary_loss_clip": 0.01040626, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02935362, + "balance_loss_mlp": 1.02272344, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.033955063361358, + "language_loss": 0.82718432, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.84793651, + "num_input_tokens_seen": 334853490, + "step": 15520, + "time_per_iteration": 2.691452980041504 + }, + { + "auxiliary_loss_clip": 0.01082956, + "auxiliary_loss_mlp": 0.01025813, + "balance_loss_clip": 1.0310055, + "balance_loss_mlp": 1.01474977, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.882627129491701, + "language_loss": 0.76111394, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78220165, + "num_input_tokens_seen": 334873675, + "step": 15521, + "time_per_iteration": 2.561931610107422 + }, + { + "auxiliary_loss_clip": 0.01088357, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.0345093, + "balance_loss_mlp": 1.02024043, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 2.0285443388796334, + "language_loss": 0.77766204, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79885244, + "num_input_tokens_seen": 334890970, + "step": 15522, + "time_per_iteration": 2.5381484031677246 + }, + { + "auxiliary_loss_clip": 0.01061331, + "auxiliary_loss_mlp": 0.00749427, + "balance_loss_clip": 1.03058195, + "balance_loss_mlp": 1.0002265, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 1.9134730028459066, + "language_loss": 0.63030124, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.64840883, + "num_input_tokens_seen": 334906635, + "step": 15523, + "time_per_iteration": 2.562858819961548 + }, + { + "auxiliary_loss_clip": 0.01071699, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.03104568, + "balance_loss_mlp": 1.02132773, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 1.9071507809321888, + "language_loss": 0.68286151, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70390034, + "num_input_tokens_seen": 334926230, + "step": 15524, + "time_per_iteration": 2.6723833084106445 + }, + { + "auxiliary_loss_clip": 0.01054039, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.03208983, + "balance_loss_mlp": 1.01973462, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.6559561129788547, + "language_loss": 0.74084055, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76169294, + "num_input_tokens_seen": 334946680, + "step": 15525, + "time_per_iteration": 2.6669955253601074 + }, + { + "auxiliary_loss_clip": 0.01042733, + "auxiliary_loss_mlp": 0.01034062, + "balance_loss_clip": 1.03135633, + "balance_loss_mlp": 1.02326775, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 3.452091132355708, + "language_loss": 0.83804488, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.85881281, + "num_input_tokens_seen": 334964785, + "step": 15526, + "time_per_iteration": 2.6899712085723877 + }, + { + "auxiliary_loss_clip": 0.01049464, + "auxiliary_loss_mlp": 0.01030737, + "balance_loss_clip": 1.03138089, + "balance_loss_mlp": 1.01935291, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.8877734999206046, + "language_loss": 0.68791699, + "learning_rate": 4.61230144456366e-08, + "loss": 0.70871902, + "num_input_tokens_seen": 334982400, + "step": 15527, + "time_per_iteration": 2.628521680831909 + }, + { + "auxiliary_loss_clip": 0.01100066, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.03416061, + "balance_loss_mlp": 1.0162828, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 1.831693935722895, + "language_loss": 0.65024424, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67153305, + "num_input_tokens_seen": 334999685, + "step": 15528, + "time_per_iteration": 2.476428985595703 + }, + { + "auxiliary_loss_clip": 0.01098934, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.03289151, + "balance_loss_mlp": 1.01779735, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 2.146011348728704, + "language_loss": 0.74662733, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.7679131, + "num_input_tokens_seen": 335019160, + "step": 15529, + "time_per_iteration": 2.4770445823669434 + }, + { + "auxiliary_loss_clip": 0.01046912, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.02824616, + "balance_loss_mlp": 1.02098596, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.914065644499644, + "language_loss": 0.63037634, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65115696, + "num_input_tokens_seen": 335037350, + "step": 15530, + "time_per_iteration": 2.6540515422821045 + }, + { + "auxiliary_loss_clip": 0.01075902, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.0339973, + "balance_loss_mlp": 1.01722622, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 2.0713463904965153, + "language_loss": 0.72175944, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74279481, + "num_input_tokens_seen": 335056060, + "step": 15531, + "time_per_iteration": 2.5675270557403564 + }, + { + "auxiliary_loss_clip": 0.01072643, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.03143072, + "balance_loss_mlp": 1.01903605, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 1.7505279862183163, + "language_loss": 0.71017444, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.73120379, + "num_input_tokens_seen": 335075410, + "step": 15532, + "time_per_iteration": 2.6355068683624268 + }, + { + "auxiliary_loss_clip": 0.01098295, + "auxiliary_loss_mlp": 0.00749373, + "balance_loss_clip": 1.03381729, + "balance_loss_mlp": 1.00024927, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.5702092863440587, + "language_loss": 0.73077679, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.74925339, + "num_input_tokens_seen": 335095190, + "step": 15533, + "time_per_iteration": 2.4925053119659424 + }, + { + "auxiliary_loss_clip": 0.01059948, + "auxiliary_loss_mlp": 0.01025337, + "balance_loss_clip": 1.02979493, + "balance_loss_mlp": 1.01495397, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.8569729008864724, + "language_loss": 0.80047274, + "learning_rate": 4.554272235700507e-08, + "loss": 0.8213256, + "num_input_tokens_seen": 335113825, + "step": 15534, + "time_per_iteration": 2.595759630203247 + }, + { + "auxiliary_loss_clip": 0.01091043, + "auxiliary_loss_mlp": 0.01022334, + "balance_loss_clip": 1.03333056, + "balance_loss_mlp": 1.01271403, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 2.1294738192888807, + "language_loss": 0.74746114, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76859486, + "num_input_tokens_seen": 335136425, + "step": 15535, + "time_per_iteration": 4.152533054351807 + }, + { + "auxiliary_loss_clip": 0.0108329, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.03516436, + "balance_loss_mlp": 1.01971555, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 1.814445815268855, + "language_loss": 0.77436209, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79550791, + "num_input_tokens_seen": 335157925, + "step": 15536, + "time_per_iteration": 2.6988470554351807 + }, + { + "auxiliary_loss_clip": 0.01067317, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.03303003, + "balance_loss_mlp": 1.01534331, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.4570389404854591, + "language_loss": 0.80828905, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82922375, + "num_input_tokens_seen": 335177840, + "step": 15537, + "time_per_iteration": 2.657693386077881 + }, + { + "auxiliary_loss_clip": 0.01077312, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.03337908, + "balance_loss_mlp": 1.02071381, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.6231749513611713, + "language_loss": 0.77768099, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.79876316, + "num_input_tokens_seen": 335199470, + "step": 15538, + "time_per_iteration": 2.6708788871765137 + }, + { + "auxiliary_loss_clip": 0.01070151, + "auxiliary_loss_mlp": 0.01028473, + "balance_loss_clip": 1.03189576, + "balance_loss_mlp": 1.01775002, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.7622746679640489, + "language_loss": 0.7316941, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75268036, + "num_input_tokens_seen": 335218885, + "step": 15539, + "time_per_iteration": 2.615049362182617 + }, + { + "auxiliary_loss_clip": 0.01049078, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.03352463, + "balance_loss_mlp": 1.01734209, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 2.770367934256377, + "language_loss": 0.65035844, + "learning_rate": 4.504821951247373e-08, + "loss": 0.67112041, + "num_input_tokens_seen": 335239485, + "step": 15540, + "time_per_iteration": 2.7200939655303955 + }, + { + "auxiliary_loss_clip": 0.01085155, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.03220081, + "balance_loss_mlp": 1.01931226, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.9270401738777971, + "language_loss": 0.76749313, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78864628, + "num_input_tokens_seen": 335258355, + "step": 15541, + "time_per_iteration": 2.52946400642395 + }, + { + "auxiliary_loss_clip": 0.01086335, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.03634787, + "balance_loss_mlp": 1.02165389, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 2.333703252979733, + "language_loss": 0.66189408, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.6830827, + "num_input_tokens_seen": 335276835, + "step": 15542, + "time_per_iteration": 2.5963830947875977 + }, + { + "auxiliary_loss_clip": 0.0105203, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.03151107, + "balance_loss_mlp": 1.01668811, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.8705323059746333, + "language_loss": 0.69071567, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71151668, + "num_input_tokens_seen": 335296220, + "step": 15543, + "time_per_iteration": 4.150037527084351 + }, + { + "auxiliary_loss_clip": 0.01085894, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.03078341, + "balance_loss_mlp": 1.01804721, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 2.0743174800108006, + "language_loss": 0.69321442, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71437597, + "num_input_tokens_seen": 335316335, + "step": 15544, + "time_per_iteration": 2.5287253856658936 + }, + { + "auxiliary_loss_clip": 0.01081343, + "auxiliary_loss_mlp": 0.01040968, + "balance_loss_clip": 1.03201365, + "balance_loss_mlp": 1.02861214, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 1.557429440294571, + "language_loss": 0.77284354, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79406667, + "num_input_tokens_seen": 335335545, + "step": 15545, + "time_per_iteration": 2.6054999828338623 + }, + { + "auxiliary_loss_clip": 0.01087426, + "auxiliary_loss_mlp": 0.01026504, + "balance_loss_clip": 1.03361166, + "balance_loss_mlp": 1.01630521, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.9863617344270323, + "language_loss": 0.68932068, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71046001, + "num_input_tokens_seen": 335355350, + "step": 15546, + "time_per_iteration": 2.5227737426757812 + }, + { + "auxiliary_loss_clip": 0.01051334, + "auxiliary_loss_mlp": 0.01027641, + "balance_loss_clip": 1.02872193, + "balance_loss_mlp": 1.01719236, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 2.0523398289774546, + "language_loss": 0.82355464, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84434438, + "num_input_tokens_seen": 335375160, + "step": 15547, + "time_per_iteration": 2.70263409614563 + }, + { + "auxiliary_loss_clip": 0.0107561, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.02950025, + "balance_loss_mlp": 1.02135372, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.8411796654182175, + "language_loss": 0.8376025, + "learning_rate": 4.439303389230087e-08, + "loss": 0.8586868, + "num_input_tokens_seen": 335394080, + "step": 15548, + "time_per_iteration": 3.956308126449585 + }, + { + "auxiliary_loss_clip": 0.01090659, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.03460026, + "balance_loss_mlp": 1.01972795, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 2.755306485526223, + "language_loss": 0.65133011, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.6725539, + "num_input_tokens_seen": 335414230, + "step": 15549, + "time_per_iteration": 2.727747917175293 + }, + { + "auxiliary_loss_clip": 0.0108862, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.03475046, + "balance_loss_mlp": 1.02083802, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 1.689773562674775, + "language_loss": 0.80136228, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82257235, + "num_input_tokens_seen": 335432890, + "step": 15550, + "time_per_iteration": 2.518223285675049 + }, + { + "auxiliary_loss_clip": 0.01085223, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.03532183, + "balance_loss_mlp": 1.01841867, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.5980549001524176, + "language_loss": 0.75219196, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77333593, + "num_input_tokens_seen": 335452085, + "step": 15551, + "time_per_iteration": 2.606379508972168 + }, + { + "auxiliary_loss_clip": 0.01035677, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.02890539, + "balance_loss_mlp": 1.01899171, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.6123196822673824, + "language_loss": 0.7375918, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75823045, + "num_input_tokens_seen": 335472130, + "step": 15552, + "time_per_iteration": 2.7218761444091797 + }, + { + "auxiliary_loss_clip": 0.01045849, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_clip": 1.02880943, + "balance_loss_mlp": 1.03270555, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.7593895854787411, + "language_loss": 0.77142608, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79233408, + "num_input_tokens_seen": 335489970, + "step": 15553, + "time_per_iteration": 2.629396438598633 + }, + { + "auxiliary_loss_clip": 0.01059987, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.03022099, + "balance_loss_mlp": 1.02115035, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.6628695742794755, + "language_loss": 0.78213972, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80306584, + "num_input_tokens_seen": 335509125, + "step": 15554, + "time_per_iteration": 2.626006603240967 + }, + { + "auxiliary_loss_clip": 0.01063646, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.02893615, + "balance_loss_mlp": 1.0204885, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 2.212369286336102, + "language_loss": 0.69284528, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71379113, + "num_input_tokens_seen": 335525620, + "step": 15555, + "time_per_iteration": 2.532696008682251 + }, + { + "auxiliary_loss_clip": 0.01001714, + "auxiliary_loss_mlp": 0.01043878, + "balance_loss_clip": 1.02744007, + "balance_loss_mlp": 1.0313437, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.8960801040261033, + "language_loss": 0.75493908, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77539498, + "num_input_tokens_seen": 335547565, + "step": 15556, + "time_per_iteration": 2.8377702236175537 + }, + { + "auxiliary_loss_clip": 0.01073191, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.03057921, + "balance_loss_mlp": 1.01784134, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.6430185682726164, + "language_loss": 0.72531068, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74632275, + "num_input_tokens_seen": 335570285, + "step": 15557, + "time_per_iteration": 4.373210191726685 + }, + { + "auxiliary_loss_clip": 0.01096594, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.03257537, + "balance_loss_mlp": 1.01784956, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 2.59752214121473, + "language_loss": 0.63056713, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65181994, + "num_input_tokens_seen": 335588600, + "step": 15558, + "time_per_iteration": 2.5343122482299805 + }, + { + "auxiliary_loss_clip": 0.01075407, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.03385067, + "balance_loss_mlp": 1.01706743, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 2.147814958984123, + "language_loss": 0.73217863, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75322288, + "num_input_tokens_seen": 335606235, + "step": 15559, + "time_per_iteration": 2.5942161083221436 + }, + { + "auxiliary_loss_clip": 0.01036855, + "auxiliary_loss_mlp": 0.00749207, + "balance_loss_clip": 1.02778864, + "balance_loss_mlp": 1.00017285, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.9811561024489754, + "language_loss": 0.6367622, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65462279, + "num_input_tokens_seen": 335628240, + "step": 15560, + "time_per_iteration": 2.75933575630188 + }, + { + "auxiliary_loss_clip": 0.01033927, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.03322792, + "balance_loss_mlp": 1.0184927, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 1.9733892553537653, + "language_loss": 0.64099205, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66164953, + "num_input_tokens_seen": 335643755, + "step": 15561, + "time_per_iteration": 2.678403377532959 + }, + { + "auxiliary_loss_clip": 0.010976, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.03470707, + "balance_loss_mlp": 1.01889312, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.7257232590380915, + "language_loss": 0.75448036, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77575767, + "num_input_tokens_seen": 335665160, + "step": 15562, + "time_per_iteration": 2.559253215789795 + }, + { + "auxiliary_loss_clip": 0.01013579, + "auxiliary_loss_mlp": 0.01002236, + "balance_loss_clip": 1.00419295, + "balance_loss_mlp": 1.00133038, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9541450337458803, + "language_loss": 0.62315249, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64331061, + "num_input_tokens_seen": 335715240, + "step": 15563, + "time_per_iteration": 2.9263594150543213 + }, + { + "auxiliary_loss_clip": 0.010456, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.03207862, + "balance_loss_mlp": 1.0181818, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.6997421537856847, + "language_loss": 0.78442717, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80517244, + "num_input_tokens_seen": 335734970, + "step": 15564, + "time_per_iteration": 2.670863389968872 + }, + { + "auxiliary_loss_clip": 0.01097526, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.03272676, + "balance_loss_mlp": 1.01628435, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 11.168715533069419, + "language_loss": 0.78095615, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80221123, + "num_input_tokens_seen": 335753435, + "step": 15565, + "time_per_iteration": 2.533498525619507 + }, + { + "auxiliary_loss_clip": 0.01083065, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.03175056, + "balance_loss_mlp": 1.01992059, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 1.9101651487631477, + "language_loss": 0.71980578, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74093908, + "num_input_tokens_seen": 335772105, + "step": 15566, + "time_per_iteration": 2.5212690830230713 + }, + { + "auxiliary_loss_clip": 0.01054133, + "auxiliary_loss_mlp": 0.00749479, + "balance_loss_clip": 1.02833974, + "balance_loss_mlp": 1.00028992, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 2.1514128197770375, + "language_loss": 0.67190313, + "learning_rate": 4.285599216057889e-08, + "loss": 0.68993932, + "num_input_tokens_seen": 335789125, + "step": 15567, + "time_per_iteration": 2.571483612060547 + }, + { + "auxiliary_loss_clip": 0.01065425, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.03268027, + "balance_loss_mlp": 1.0206939, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 2.2500029569377857, + "language_loss": 0.61906517, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64003909, + "num_input_tokens_seen": 335810995, + "step": 15568, + "time_per_iteration": 2.6243419647216797 + }, + { + "auxiliary_loss_clip": 0.01067558, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.02814913, + "balance_loss_mlp": 1.01986194, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.864276361560003, + "language_loss": 0.78954339, + "learning_rate": 4.269575644764556e-08, + "loss": 0.81052858, + "num_input_tokens_seen": 335830580, + "step": 15569, + "time_per_iteration": 2.604731559753418 + }, + { + "auxiliary_loss_clip": 0.01076633, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.0328238, + "balance_loss_mlp": 1.02353644, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.387376882990913, + "language_loss": 0.69264954, + "learning_rate": 4.261574992142014e-08, + "loss": 0.7137692, + "num_input_tokens_seen": 335846515, + "step": 15570, + "time_per_iteration": 2.599668502807617 + }, + { + "auxiliary_loss_clip": 0.01083092, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.03419137, + "balance_loss_mlp": 1.01964366, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.813535204011481, + "language_loss": 0.78753585, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.80866981, + "num_input_tokens_seen": 335863350, + "step": 15571, + "time_per_iteration": 2.4907455444335938 + }, + { + "auxiliary_loss_clip": 0.01063646, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.03155386, + "balance_loss_mlp": 1.01822948, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 2.8590149124601854, + "language_loss": 0.77607834, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79700851, + "num_input_tokens_seen": 335880510, + "step": 15572, + "time_per_iteration": 2.5998268127441406 + }, + { + "auxiliary_loss_clip": 0.0107181, + "auxiliary_loss_mlp": 0.01039603, + "balance_loss_clip": 1.0308578, + "balance_loss_mlp": 1.02846289, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 1.7981572354957596, + "language_loss": 0.78108633, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80220044, + "num_input_tokens_seen": 335899440, + "step": 15573, + "time_per_iteration": 2.562337875366211 + }, + { + "auxiliary_loss_clip": 0.01056345, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.02950454, + "balance_loss_mlp": 1.01675987, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.5355163906883766, + "language_loss": 0.74103582, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.7618711, + "num_input_tokens_seen": 335919540, + "step": 15574, + "time_per_iteration": 2.6461830139160156 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.0307126, + "balance_loss_mlp": 1.01809263, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.822156627328427, + "language_loss": 0.68078375, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70145845, + "num_input_tokens_seen": 335939665, + "step": 15575, + "time_per_iteration": 4.341504812240601 + }, + { + "auxiliary_loss_clip": 0.0106738, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.03078413, + "balance_loss_mlp": 1.02085984, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 2.500843312993397, + "language_loss": 0.64958167, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67057478, + "num_input_tokens_seen": 335958580, + "step": 15576, + "time_per_iteration": 2.539715051651001 + }, + { + "auxiliary_loss_clip": 0.01079971, + "auxiliary_loss_mlp": 0.01026858, + "balance_loss_clip": 1.02985191, + "balance_loss_mlp": 1.01482964, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.102002082948081, + "language_loss": 0.75735641, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.77842474, + "num_input_tokens_seen": 335974965, + "step": 15577, + "time_per_iteration": 2.5425100326538086 + }, + { + "auxiliary_loss_clip": 0.01047184, + "auxiliary_loss_mlp": 0.01025668, + "balance_loss_clip": 1.02818084, + "balance_loss_mlp": 1.01443887, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 1.845117946884322, + "language_loss": 0.52032351, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.54105204, + "num_input_tokens_seen": 335996575, + "step": 15578, + "time_per_iteration": 2.6969144344329834 + }, + { + "auxiliary_loss_clip": 0.01028026, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.02611852, + "balance_loss_mlp": 1.02105069, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.5501647439669464, + "language_loss": 0.7071197, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72772294, + "num_input_tokens_seen": 336017265, + "step": 15579, + "time_per_iteration": 2.715067148208618 + }, + { + "auxiliary_loss_clip": 0.01068079, + "auxiliary_loss_mlp": 0.01024052, + "balance_loss_clip": 1.02951086, + "balance_loss_mlp": 1.01365137, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 2.0759930223300453, + "language_loss": 0.76592815, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78684944, + "num_input_tokens_seen": 336035905, + "step": 15580, + "time_per_iteration": 2.608391046524048 + }, + { + "auxiliary_loss_clip": 0.01083941, + "auxiliary_loss_mlp": 0.010301, + "balance_loss_clip": 1.03293228, + "balance_loss_mlp": 1.01822042, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 1.6702247293887773, + "language_loss": 0.66335565, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68449605, + "num_input_tokens_seen": 336055585, + "step": 15581, + "time_per_iteration": 2.5704822540283203 + }, + { + "auxiliary_loss_clip": 0.01087477, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.0339272, + "balance_loss_mlp": 1.01473069, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.5495426522817903, + "language_loss": 0.76600301, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78713495, + "num_input_tokens_seen": 336076695, + "step": 15582, + "time_per_iteration": 2.5157485008239746 + }, + { + "auxiliary_loss_clip": 0.01020199, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.03022194, + "balance_loss_mlp": 1.02017891, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.658060577615139, + "language_loss": 0.73767042, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.75818801, + "num_input_tokens_seen": 336094740, + "step": 15583, + "time_per_iteration": 4.231364727020264 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.0362978, + "balance_loss_mlp": 1.01789522, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.2884459491098563, + "language_loss": 0.84099901, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86232471, + "num_input_tokens_seen": 336113985, + "step": 15584, + "time_per_iteration": 2.557543992996216 + }, + { + "auxiliary_loss_clip": 0.0108796, + "auxiliary_loss_mlp": 0.00749549, + "balance_loss_clip": 1.03450155, + "balance_loss_mlp": 1.00030577, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.5421902865442663, + "language_loss": 0.72212654, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.74050164, + "num_input_tokens_seen": 336136395, + "step": 15585, + "time_per_iteration": 2.6657204627990723 + }, + { + "auxiliary_loss_clip": 0.01062364, + "auxiliary_loss_mlp": 0.01025092, + "balance_loss_clip": 1.03091836, + "balance_loss_mlp": 1.01503706, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.7955600752771999, + "language_loss": 0.80110216, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82197672, + "num_input_tokens_seen": 336156345, + "step": 15586, + "time_per_iteration": 2.6204614639282227 + }, + { + "auxiliary_loss_clip": 0.01056225, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.03189087, + "balance_loss_mlp": 1.02316415, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.622938151902997, + "language_loss": 0.7671442, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78804815, + "num_input_tokens_seen": 336176760, + "step": 15587, + "time_per_iteration": 2.644073247909546 + }, + { + "auxiliary_loss_clip": 0.0107751, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.0316515, + "balance_loss_mlp": 1.02010036, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.9741619013621343, + "language_loss": 0.87731981, + "learning_rate": 4.118832771491387e-08, + "loss": 0.89841264, + "num_input_tokens_seen": 336193285, + "step": 15588, + "time_per_iteration": 4.237125635147095 + }, + { + "auxiliary_loss_clip": 0.01094226, + "auxiliary_loss_mlp": 0.00749347, + "balance_loss_clip": 1.03356075, + "balance_loss_mlp": 1.00026178, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 2.3261221389140605, + "language_loss": 0.78419018, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80262589, + "num_input_tokens_seen": 336211425, + "step": 15589, + "time_per_iteration": 2.49564266204834 + }, + { + "auxiliary_loss_clip": 0.01094078, + "auxiliary_loss_mlp": 0.01033457, + "balance_loss_clip": 1.03290677, + "balance_loss_mlp": 1.02216184, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 2.2398718266700497, + "language_loss": 0.77890742, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80018276, + "num_input_tokens_seen": 336230205, + "step": 15590, + "time_per_iteration": 2.4811909198760986 + }, + { + "auxiliary_loss_clip": 0.01063954, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.03080797, + "balance_loss_mlp": 1.02317762, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.7178549067590259, + "language_loss": 0.71450245, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73549646, + "num_input_tokens_seen": 336252440, + "step": 15591, + "time_per_iteration": 2.5861213207244873 + }, + { + "auxiliary_loss_clip": 0.01088363, + "auxiliary_loss_mlp": 0.00749598, + "balance_loss_clip": 1.03477585, + "balance_loss_mlp": 1.0002377, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 1.9840196899378963, + "language_loss": 0.53313547, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.5515151, + "num_input_tokens_seen": 336273845, + "step": 15592, + "time_per_iteration": 2.743144989013672 + }, + { + "auxiliary_loss_clip": 0.01080853, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.03424275, + "balance_loss_mlp": 1.01686645, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 1.6139488350991287, + "language_loss": 0.67232209, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69340521, + "num_input_tokens_seen": 336292790, + "step": 15593, + "time_per_iteration": 2.6273841857910156 + }, + { + "auxiliary_loss_clip": 0.01072814, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.03017759, + "balance_loss_mlp": 1.01864552, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.5228122121086975, + "language_loss": 0.74181634, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.7628355, + "num_input_tokens_seen": 336312600, + "step": 15594, + "time_per_iteration": 2.621042251586914 + }, + { + "auxiliary_loss_clip": 0.01083375, + "auxiliary_loss_mlp": 0.01023695, + "balance_loss_clip": 1.03203893, + "balance_loss_mlp": 1.01390183, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 2.0504422885408764, + "language_loss": 0.73762286, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75869358, + "num_input_tokens_seen": 336332770, + "step": 15595, + "time_per_iteration": 2.5991368293762207 + }, + { + "auxiliary_loss_clip": 0.01067987, + "auxiliary_loss_mlp": 0.01027279, + "balance_loss_clip": 1.03176355, + "balance_loss_mlp": 1.01702094, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 2.1921185573449, + "language_loss": 0.76202726, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78297985, + "num_input_tokens_seen": 336351445, + "step": 15596, + "time_per_iteration": 2.589712619781494 + }, + { + "auxiliary_loss_clip": 0.01069172, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.03292, + "balance_loss_mlp": 1.01996958, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.875806280316384, + "language_loss": 0.7877714, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80877185, + "num_input_tokens_seen": 336368690, + "step": 15597, + "time_per_iteration": 4.094984769821167 + }, + { + "auxiliary_loss_clip": 0.01100625, + "auxiliary_loss_mlp": 0.01032064, + "balance_loss_clip": 1.03405738, + "balance_loss_mlp": 1.0205183, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.426229086887279, + "language_loss": 0.80985045, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83117735, + "num_input_tokens_seen": 336388165, + "step": 15598, + "time_per_iteration": 2.5050251483917236 + }, + { + "auxiliary_loss_clip": 0.01066415, + "auxiliary_loss_mlp": 0.01026993, + "balance_loss_clip": 1.03217185, + "balance_loss_mlp": 1.01556087, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 2.1723347178636025, + "language_loss": 0.6320067, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65294081, + "num_input_tokens_seen": 336406475, + "step": 15599, + "time_per_iteration": 2.668276071548462 + }, + { + "auxiliary_loss_clip": 0.01053151, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.03048348, + "balance_loss_mlp": 1.01931691, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 2.076750753152355, + "language_loss": 0.72982526, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75066191, + "num_input_tokens_seen": 336424690, + "step": 15600, + "time_per_iteration": 2.603123188018799 + }, + { + "auxiliary_loss_clip": 0.01072272, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.03296649, + "balance_loss_mlp": 1.02043962, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 2.0240517077664837, + "language_loss": 0.69587982, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.7169044, + "num_input_tokens_seen": 336443055, + "step": 15601, + "time_per_iteration": 2.545422315597534 + }, + { + "auxiliary_loss_clip": 0.01013017, + "auxiliary_loss_mlp": 0.01004321, + "balance_loss_clip": 1.00278342, + "balance_loss_mlp": 1.0033555, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7562356453631792, + "language_loss": 0.58148873, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60166204, + "num_input_tokens_seen": 336510190, + "step": 15602, + "time_per_iteration": 3.2471539974212646 + }, + { + "auxiliary_loss_clip": 0.01019947, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.03063059, + "balance_loss_mlp": 1.01934433, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.085319106437464, + "language_loss": 0.72233313, + "learning_rate": 4.001719234324663e-08, + "loss": 0.74284202, + "num_input_tokens_seen": 336529250, + "step": 15603, + "time_per_iteration": 2.9817211627960205 + }, + { + "auxiliary_loss_clip": 0.0108923, + "auxiliary_loss_mlp": 0.01025618, + "balance_loss_clip": 1.03119445, + "balance_loss_mlp": 1.01569355, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.55945313228372, + "language_loss": 0.75997084, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78111935, + "num_input_tokens_seen": 336548530, + "step": 15604, + "time_per_iteration": 2.6929266452789307 + }, + { + "auxiliary_loss_clip": 0.01067405, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.029652, + "balance_loss_mlp": 1.0227381, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 2.319871748458325, + "language_loss": 0.65592062, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67695332, + "num_input_tokens_seen": 336568510, + "step": 15605, + "time_per_iteration": 2.6310901641845703 + }, + { + "auxiliary_loss_clip": 0.0106443, + "auxiliary_loss_mlp": 0.00749603, + "balance_loss_clip": 1.03199494, + "balance_loss_mlp": 1.00027418, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.9407708958571568, + "language_loss": 0.67369711, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69183743, + "num_input_tokens_seen": 336592020, + "step": 15606, + "time_per_iteration": 2.814725399017334 + }, + { + "auxiliary_loss_clip": 0.01079564, + "auxiliary_loss_mlp": 0.01024289, + "balance_loss_clip": 1.03033578, + "balance_loss_mlp": 1.01425731, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.7512074753746718, + "language_loss": 0.77177542, + "learning_rate": 3.970771343058166e-08, + "loss": 0.7928139, + "num_input_tokens_seen": 336610010, + "step": 15607, + "time_per_iteration": 2.563776731491089 + }, + { + "auxiliary_loss_clip": 0.01086537, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.03196943, + "balance_loss_mlp": 1.01743197, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 2.1520141224909586, + "language_loss": 0.82665271, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84779769, + "num_input_tokens_seen": 336628520, + "step": 15608, + "time_per_iteration": 2.5333762168884277 + }, + { + "auxiliary_loss_clip": 0.01088525, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.03586698, + "balance_loss_mlp": 1.02050924, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.6266215360790808, + "language_loss": 0.68507707, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.70627999, + "num_input_tokens_seen": 336647365, + "step": 15609, + "time_per_iteration": 2.5374083518981934 + }, + { + "auxiliary_loss_clip": 0.01067269, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.03172541, + "balance_loss_mlp": 1.0168612, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 1.7879549616320782, + "language_loss": 0.75207317, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77303147, + "num_input_tokens_seen": 336667165, + "step": 15610, + "time_per_iteration": 2.642054796218872 + }, + { + "auxiliary_loss_clip": 0.01033615, + "auxiliary_loss_mlp": 0.01024673, + "balance_loss_clip": 1.03139615, + "balance_loss_mlp": 1.01477289, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 2.001735792729224, + "language_loss": 0.75062865, + "learning_rate": 3.939942386953987e-08, + "loss": 0.7712115, + "num_input_tokens_seen": 336684130, + "step": 15611, + "time_per_iteration": 2.6939332485198975 + }, + { + "auxiliary_loss_clip": 0.0105741, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.03402662, + "balance_loss_mlp": 1.01714563, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 1.6707208269231992, + "language_loss": 0.66109121, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68194413, + "num_input_tokens_seen": 336701520, + "step": 15612, + "time_per_iteration": 2.5913236141204834 + }, + { + "auxiliary_loss_clip": 0.01082254, + "auxiliary_loss_mlp": 0.01027819, + "balance_loss_clip": 1.03210723, + "balance_loss_mlp": 1.01743567, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.7446201216929609, + "language_loss": 0.5697335, + "learning_rate": 3.924572515435742e-08, + "loss": 0.59083426, + "num_input_tokens_seen": 336720675, + "step": 15613, + "time_per_iteration": 2.53722882270813 + }, + { + "auxiliary_loss_clip": 0.01073443, + "auxiliary_loss_mlp": 0.01032205, + "balance_loss_clip": 1.03031695, + "balance_loss_mlp": 1.02122569, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 2.4033742303050922, + "language_loss": 0.70503116, + "learning_rate": 3.916898732330764e-08, + "loss": 0.72608763, + "num_input_tokens_seen": 336741005, + "step": 15614, + "time_per_iteration": 2.6045451164245605 + }, + { + "auxiliary_loss_clip": 0.01088943, + "auxiliary_loss_mlp": 0.01028591, + "balance_loss_clip": 1.03396761, + "balance_loss_mlp": 1.01718855, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.7504076794884411, + "language_loss": 0.81162298, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83279836, + "num_input_tokens_seen": 336757990, + "step": 15615, + "time_per_iteration": 4.100883722305298 + }, + { + "auxiliary_loss_clip": 0.01066698, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.02995908, + "balance_loss_mlp": 1.01625752, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.954830023517682, + "language_loss": 0.72251278, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74345237, + "num_input_tokens_seen": 336777705, + "step": 15616, + "time_per_iteration": 2.624797821044922 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01027092, + "balance_loss_clip": 1.03390491, + "balance_loss_mlp": 1.0160234, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 2.443387558376456, + "language_loss": 0.66538393, + "learning_rate": 3.89392199712355e-08, + "loss": 0.6866256, + "num_input_tokens_seen": 336798275, + "step": 15617, + "time_per_iteration": 2.5963175296783447 + }, + { + "auxiliary_loss_clip": 0.01089668, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.03404975, + "balance_loss_mlp": 1.02316236, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 3.1284302268082773, + "language_loss": 0.73020655, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75145596, + "num_input_tokens_seen": 336813835, + "step": 15618, + "time_per_iteration": 2.5565242767333984 + }, + { + "auxiliary_loss_clip": 0.01101444, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.0339973, + "balance_loss_mlp": 1.02012706, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.903237998764679, + "language_loss": 0.70110911, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72244859, + "num_input_tokens_seen": 336832210, + "step": 15619, + "time_per_iteration": 2.5000038146972656 + }, + { + "auxiliary_loss_clip": 0.01069131, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.02984571, + "balance_loss_mlp": 1.01822007, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.6871573901660557, + "language_loss": 0.77437842, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79536635, + "num_input_tokens_seen": 336851380, + "step": 15620, + "time_per_iteration": 2.608729600906372 + }, + { + "auxiliary_loss_clip": 0.01082383, + "auxiliary_loss_mlp": 0.01025535, + "balance_loss_clip": 1.03205991, + "balance_loss_mlp": 1.01499057, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 11.070466247242514, + "language_loss": 0.73793226, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.75901145, + "num_input_tokens_seen": 336868525, + "step": 15621, + "time_per_iteration": 2.5082743167877197 + }, + { + "auxiliary_loss_clip": 0.01063991, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.03151202, + "balance_loss_mlp": 1.02041531, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 1.9150403555187128, + "language_loss": 0.66051596, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68147963, + "num_input_tokens_seen": 336886200, + "step": 15622, + "time_per_iteration": 2.716007709503174 + }, + { + "auxiliary_loss_clip": 0.01065016, + "auxiliary_loss_mlp": 0.01038894, + "balance_loss_clip": 1.02831388, + "balance_loss_mlp": 1.02676463, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.5693864035989007, + "language_loss": 0.71761203, + "learning_rate": 3.848169316300209e-08, + "loss": 0.7386511, + "num_input_tokens_seen": 336905815, + "step": 15623, + "time_per_iteration": 4.131094455718994 + }, + { + "auxiliary_loss_clip": 0.0109047, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.03620863, + "balance_loss_mlp": 1.01940262, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 1.9391686347500132, + "language_loss": 0.7234503, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74465942, + "num_input_tokens_seen": 336928460, + "step": 15624, + "time_per_iteration": 2.6220412254333496 + }, + { + "auxiliary_loss_clip": 0.01060007, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.03041649, + "balance_loss_mlp": 1.01754487, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 2.838091354796752, + "language_loss": 0.89583355, + "learning_rate": 3.832977924388614e-08, + "loss": 0.9167186, + "num_input_tokens_seen": 336948320, + "step": 15625, + "time_per_iteration": 2.6279735565185547 + }, + { + "auxiliary_loss_clip": 0.01086421, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.03372097, + "balance_loss_mlp": 1.01832557, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 1.9824077147310846, + "language_loss": 0.83655119, + "learning_rate": 3.825393386298592e-08, + "loss": 0.85771596, + "num_input_tokens_seen": 336967670, + "step": 15626, + "time_per_iteration": 2.5534090995788574 + }, + { + "auxiliary_loss_clip": 0.01003, + "auxiliary_loss_mlp": 0.00999475, + "balance_loss_clip": 1.00259697, + "balance_loss_mlp": 0.99852103, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.7787434702422112, + "language_loss": 0.56113523, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58115995, + "num_input_tokens_seen": 337028395, + "step": 15627, + "time_per_iteration": 3.106224298477173 + }, + { + "auxiliary_loss_clip": 0.01042259, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02949798, + "balance_loss_mlp": 1.02333093, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.5656408337477759, + "language_loss": 0.70215702, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72293216, + "num_input_tokens_seen": 337048150, + "step": 15628, + "time_per_iteration": 4.1298065185546875 + }, + { + "auxiliary_loss_clip": 0.01085355, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.03370094, + "balance_loss_mlp": 1.01764226, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.4771447349122893, + "language_loss": 0.75211835, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77325654, + "num_input_tokens_seen": 337069315, + "step": 15629, + "time_per_iteration": 2.5760860443115234 + }, + { + "auxiliary_loss_clip": 0.01029597, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.027812, + "balance_loss_mlp": 1.01856208, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.6669246492049155, + "language_loss": 0.74232364, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76291925, + "num_input_tokens_seen": 337087765, + "step": 15630, + "time_per_iteration": 2.6718757152557373 + }, + { + "auxiliary_loss_clip": 0.01059294, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.03151178, + "balance_loss_mlp": 1.01884019, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 1.922214653722521, + "language_loss": 0.69137776, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71226841, + "num_input_tokens_seen": 337106265, + "step": 15631, + "time_per_iteration": 2.544119358062744 + }, + { + "auxiliary_loss_clip": 0.01042129, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.02897072, + "balance_loss_mlp": 1.02071214, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 2.265850288465588, + "language_loss": 0.75083315, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77156883, + "num_input_tokens_seen": 337126090, + "step": 15632, + "time_per_iteration": 2.6474838256835938 + }, + { + "auxiliary_loss_clip": 0.0109019, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.03390312, + "balance_loss_mlp": 1.02168274, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.7127769443989103, + "language_loss": 0.74497712, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76621985, + "num_input_tokens_seen": 337145655, + "step": 15633, + "time_per_iteration": 2.559511423110962 + }, + { + "auxiliary_loss_clip": 0.01099105, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.03373337, + "balance_loss_mlp": 1.02302909, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 2.030170180717444, + "language_loss": 0.72276008, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74410081, + "num_input_tokens_seen": 337164805, + "step": 15634, + "time_per_iteration": 2.5621302127838135 + }, + { + "auxiliary_loss_clip": 0.01085912, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.03082001, + "balance_loss_mlp": 1.01684415, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 1.6364693963770438, + "language_loss": 0.6868273, + "learning_rate": 3.75746733114144e-08, + "loss": 0.70797282, + "num_input_tokens_seen": 337182280, + "step": 15635, + "time_per_iteration": 2.530728340148926 + }, + { + "auxiliary_loss_clip": 0.01045845, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.03270149, + "balance_loss_mlp": 1.0151875, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.5777675332439751, + "language_loss": 0.74477422, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76548904, + "num_input_tokens_seen": 337203495, + "step": 15636, + "time_per_iteration": 2.684558153152466 + }, + { + "auxiliary_loss_clip": 0.01087687, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.03422701, + "balance_loss_mlp": 1.02253771, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 2.2904088905108533, + "language_loss": 0.82846099, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.84967637, + "num_input_tokens_seen": 337220435, + "step": 15637, + "time_per_iteration": 4.072278261184692 + }, + { + "auxiliary_loss_clip": 0.01054004, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.03498101, + "balance_loss_mlp": 1.01836419, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.6599926968090233, + "language_loss": 0.69013399, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.71096879, + "num_input_tokens_seen": 337238095, + "step": 15638, + "time_per_iteration": 2.739168405532837 + }, + { + "auxiliary_loss_clip": 0.01081138, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.03181434, + "balance_loss_mlp": 1.02324557, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.672986365625036, + "language_loss": 0.8496033, + "learning_rate": 3.727471440859498e-08, + "loss": 0.87074149, + "num_input_tokens_seen": 337256645, + "step": 15639, + "time_per_iteration": 2.67130446434021 + }, + { + "auxiliary_loss_clip": 0.01069885, + "auxiliary_loss_mlp": 0.00749307, + "balance_loss_clip": 1.02953029, + "balance_loss_mlp": 1.00025225, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.4231111160191363, + "language_loss": 0.78437859, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80257058, + "num_input_tokens_seen": 337278360, + "step": 15640, + "time_per_iteration": 2.676839828491211 + }, + { + "auxiliary_loss_clip": 0.0108754, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.03258383, + "balance_loss_mlp": 1.02014303, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.6546225337531248, + "language_loss": 0.74366581, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76485133, + "num_input_tokens_seen": 337302480, + "step": 15641, + "time_per_iteration": 2.613935708999634 + }, + { + "auxiliary_loss_clip": 0.01089547, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.03267872, + "balance_loss_mlp": 1.01776671, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 2.115942426694166, + "language_loss": 0.82531565, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84651631, + "num_input_tokens_seen": 337316600, + "step": 15642, + "time_per_iteration": 2.5014827251434326 + }, + { + "auxiliary_loss_clip": 0.01080277, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.03158903, + "balance_loss_mlp": 1.01884067, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 1.9322461916679476, + "language_loss": 0.68450671, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70559913, + "num_input_tokens_seen": 337336895, + "step": 15643, + "time_per_iteration": 2.585939884185791 + }, + { + "auxiliary_loss_clip": 0.01086457, + "auxiliary_loss_mlp": 0.01036041, + "balance_loss_clip": 1.03463817, + "balance_loss_mlp": 1.02410841, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 2.3388408790104833, + "language_loss": 0.7693941, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.79061913, + "num_input_tokens_seen": 337355105, + "step": 15644, + "time_per_iteration": 2.5692930221557617 + }, + { + "auxiliary_loss_clip": 0.01075362, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.03070009, + "balance_loss_mlp": 1.01985693, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 7.7006356463629055, + "language_loss": 0.67558134, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69663686, + "num_input_tokens_seen": 337374905, + "step": 15645, + "time_per_iteration": 2.608198642730713 + }, + { + "auxiliary_loss_clip": 0.01065876, + "auxiliary_loss_mlp": 0.00749265, + "balance_loss_clip": 1.03250968, + "balance_loss_mlp": 1.00021076, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.5272433758896946, + "language_loss": 0.70504743, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72319889, + "num_input_tokens_seen": 337397130, + "step": 15646, + "time_per_iteration": 2.662257432937622 + }, + { + "auxiliary_loss_clip": 0.01076227, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.02980447, + "balance_loss_mlp": 1.0196538, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 5.290877605882631, + "language_loss": 0.74117821, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76224566, + "num_input_tokens_seen": 337418660, + "step": 15647, + "time_per_iteration": 2.58188796043396 + }, + { + "auxiliary_loss_clip": 0.0100508, + "auxiliary_loss_mlp": 0.01005892, + "balance_loss_clip": 1.00491047, + "balance_loss_mlp": 1.00493264, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8903893269232113, + "language_loss": 0.63539755, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65550727, + "num_input_tokens_seen": 337478055, + "step": 15648, + "time_per_iteration": 3.2226433753967285 + }, + { + "auxiliary_loss_clip": 0.01093261, + "auxiliary_loss_mlp": 0.01027808, + "balance_loss_clip": 1.03315067, + "balance_loss_mlp": 1.01807439, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.4539200883463463, + "language_loss": 0.6636492, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68485987, + "num_input_tokens_seen": 337499405, + "step": 15649, + "time_per_iteration": 2.5015344619750977 + }, + { + "auxiliary_loss_clip": 0.01062654, + "auxiliary_loss_mlp": 0.01029162, + "balance_loss_clip": 1.02989483, + "balance_loss_mlp": 1.01850462, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 2.9684760142973503, + "language_loss": 0.77409023, + "learning_rate": 3.645596817637586e-08, + "loss": 0.79500836, + "num_input_tokens_seen": 337517195, + "step": 15650, + "time_per_iteration": 2.677715301513672 + }, + { + "auxiliary_loss_clip": 0.01050709, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.03342819, + "balance_loss_mlp": 1.01922321, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 1.7628633728007255, + "language_loss": 0.74179697, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76260221, + "num_input_tokens_seen": 337535245, + "step": 15651, + "time_per_iteration": 2.7254087924957275 + }, + { + "auxiliary_loss_clip": 0.01096141, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.03304625, + "balance_loss_mlp": 1.0210669, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 2.0918653963829152, + "language_loss": 0.72350705, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74479407, + "num_input_tokens_seen": 337553040, + "step": 15652, + "time_per_iteration": 2.583404302597046 + }, + { + "auxiliary_loss_clip": 0.01060738, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.03283465, + "balance_loss_mlp": 1.02238536, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.9276608262205461, + "language_loss": 0.66072273, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68167084, + "num_input_tokens_seen": 337574580, + "step": 15653, + "time_per_iteration": 2.642275094985962 + }, + { + "auxiliary_loss_clip": 0.01097379, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.03336489, + "balance_loss_mlp": 1.02070868, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 2.0532142210352817, + "language_loss": 0.7753557, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.79665387, + "num_input_tokens_seen": 337593010, + "step": 15654, + "time_per_iteration": 2.5370256900787354 + }, + { + "auxiliary_loss_clip": 0.01092223, + "auxiliary_loss_mlp": 0.01026022, + "balance_loss_clip": 1.03383803, + "balance_loss_mlp": 1.01495314, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.7072401677700646, + "language_loss": 0.70254862, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72373104, + "num_input_tokens_seen": 337616170, + "step": 15655, + "time_per_iteration": 2.695204496383667 + }, + { + "auxiliary_loss_clip": 0.01095758, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.03295779, + "balance_loss_mlp": 1.01842678, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 1.885317642093745, + "language_loss": 0.72172976, + "learning_rate": 3.601317642987944e-08, + "loss": 0.7429893, + "num_input_tokens_seen": 337635215, + "step": 15656, + "time_per_iteration": 3.9920520782470703 + }, + { + "auxiliary_loss_clip": 0.01058823, + "auxiliary_loss_mlp": 0.01026323, + "balance_loss_clip": 1.03076005, + "balance_loss_mlp": 1.0157367, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 1.8923074693960305, + "language_loss": 0.77502382, + "learning_rate": 3.593963845018377e-08, + "loss": 0.79587531, + "num_input_tokens_seen": 337654195, + "step": 15657, + "time_per_iteration": 2.5847766399383545 + }, + { + "auxiliary_loss_clip": 0.01059546, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.03128469, + "balance_loss_mlp": 1.01806951, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 2.132658994850663, + "language_loss": 0.84480989, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86569929, + "num_input_tokens_seen": 337671810, + "step": 15658, + "time_per_iteration": 2.5716073513031006 + }, + { + "auxiliary_loss_clip": 0.01103634, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.03597379, + "balance_loss_mlp": 1.01871419, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 2.0208322844681716, + "language_loss": 0.70620555, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72755748, + "num_input_tokens_seen": 337689410, + "step": 15659, + "time_per_iteration": 2.511551856994629 + }, + { + "auxiliary_loss_clip": 0.01068966, + "auxiliary_loss_mlp": 0.01039061, + "balance_loss_clip": 1.03112113, + "balance_loss_mlp": 1.02883315, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.717287750572641, + "language_loss": 0.79498625, + "learning_rate": 3.571947138643172e-08, + "loss": 0.8160665, + "num_input_tokens_seen": 337709950, + "step": 15660, + "time_per_iteration": 2.6056010723114014 + }, + { + "auxiliary_loss_clip": 0.01058984, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.02952456, + "balance_loss_mlp": 1.01663876, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.4519189041613516, + "language_loss": 0.679919, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70077777, + "num_input_tokens_seen": 337731320, + "step": 15661, + "time_per_iteration": 2.7445428371429443 + }, + { + "auxiliary_loss_clip": 0.01079146, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.03045714, + "balance_loss_mlp": 1.01799953, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.1771734130452978, + "language_loss": 0.6643219, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68540436, + "num_input_tokens_seen": 337747720, + "step": 15662, + "time_per_iteration": 3.99613094329834 + }, + { + "auxiliary_loss_clip": 0.01002776, + "auxiliary_loss_mlp": 0.01003429, + "balance_loss_clip": 1.0030117, + "balance_loss_mlp": 1.00239837, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7657800353853773, + "language_loss": 0.59235209, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61241418, + "num_input_tokens_seen": 337806930, + "step": 15663, + "time_per_iteration": 3.2070422172546387 + }, + { + "auxiliary_loss_clip": 0.01094, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.03603816, + "balance_loss_mlp": 1.02222228, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.9141889578376958, + "language_loss": 0.66913831, + "learning_rate": 3.542695811435914e-08, + "loss": 0.69042313, + "num_input_tokens_seen": 337828100, + "step": 15664, + "time_per_iteration": 2.6207165718078613 + }, + { + "auxiliary_loss_clip": 0.01073023, + "auxiliary_loss_mlp": 0.01026592, + "balance_loss_clip": 1.03388643, + "balance_loss_mlp": 1.01627445, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 2.135638446375908, + "language_loss": 0.73111498, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75211108, + "num_input_tokens_seen": 337844805, + "step": 15665, + "time_per_iteration": 2.620166063308716 + }, + { + "auxiliary_loss_clip": 0.01094574, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.03394949, + "balance_loss_mlp": 1.01751184, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 2.0839471799831437, + "language_loss": 0.64055634, + "learning_rate": 3.528114844807773e-08, + "loss": 0.6617837, + "num_input_tokens_seen": 337860490, + "step": 15666, + "time_per_iteration": 2.478273391723633 + }, + { + "auxiliary_loss_clip": 0.01060857, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.03132677, + "balance_loss_mlp": 1.01948881, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 2.428522079101771, + "language_loss": 0.78947264, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81038916, + "num_input_tokens_seen": 337878360, + "step": 15667, + "time_per_iteration": 2.6197173595428467 + }, + { + "auxiliary_loss_clip": 0.01093379, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.03197801, + "balance_loss_mlp": 1.01922941, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.6699944236698185, + "language_loss": 0.7495935, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77081728, + "num_input_tokens_seen": 337895635, + "step": 15668, + "time_per_iteration": 3.998379707336426 + }, + { + "auxiliary_loss_clip": 0.01043037, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.03069782, + "balance_loss_mlp": 1.01661766, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 2.568247576050241, + "language_loss": 0.58886409, + "learning_rate": 3.506299272306723e-08, + "loss": 0.60957062, + "num_input_tokens_seen": 337913940, + "step": 15669, + "time_per_iteration": 2.624943494796753 + }, + { + "auxiliary_loss_clip": 0.01052504, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.02927232, + "balance_loss_mlp": 1.01608407, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.7436142791329643, + "language_loss": 0.76910919, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.78989923, + "num_input_tokens_seen": 337932015, + "step": 15670, + "time_per_iteration": 2.6178431510925293 + }, + { + "auxiliary_loss_clip": 0.01097993, + "auxiliary_loss_mlp": 0.01034925, + "balance_loss_clip": 1.0343194, + "balance_loss_mlp": 1.02367127, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 2.0981220550860122, + "language_loss": 0.65489459, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67622375, + "num_input_tokens_seen": 337953345, + "step": 15671, + "time_per_iteration": 2.573028802871704 + }, + { + "auxiliary_loss_clip": 0.01068746, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.0307827, + "balance_loss_mlp": 1.01946497, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 2.659757192192983, + "language_loss": 0.7942816, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81527281, + "num_input_tokens_seen": 337973685, + "step": 15672, + "time_per_iteration": 2.6502864360809326 + }, + { + "auxiliary_loss_clip": 0.01060945, + "auxiliary_loss_mlp": 0.01033726, + "balance_loss_clip": 1.03153157, + "balance_loss_mlp": 1.02148962, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.1334824308410125, + "language_loss": 0.73501849, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75596523, + "num_input_tokens_seen": 337989175, + "step": 15673, + "time_per_iteration": 2.6140658855438232 + }, + { + "auxiliary_loss_clip": 0.01076329, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.03016269, + "balance_loss_mlp": 1.0180465, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.620975821552026, + "language_loss": 0.70163631, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72269922, + "num_input_tokens_seen": 338011800, + "step": 15674, + "time_per_iteration": 2.6024060249328613 + }, + { + "auxiliary_loss_clip": 0.01096897, + "auxiliary_loss_mlp": 0.01022523, + "balance_loss_clip": 1.03358161, + "balance_loss_mlp": 1.0121994, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.736465922871118, + "language_loss": 0.80985975, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83105391, + "num_input_tokens_seen": 338032120, + "step": 15675, + "time_per_iteration": 2.5213782787323 + }, + { + "auxiliary_loss_clip": 0.01069605, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.03214538, + "balance_loss_mlp": 1.01624954, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 2.568820085167136, + "language_loss": 0.62676406, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.64772844, + "num_input_tokens_seen": 338051880, + "step": 15676, + "time_per_iteration": 2.5904316902160645 + }, + { + "auxiliary_loss_clip": 0.01080824, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.03610253, + "balance_loss_mlp": 1.01903248, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.768278226806498, + "language_loss": 0.66866583, + "learning_rate": 3.448452279120984e-08, + "loss": 0.68977296, + "num_input_tokens_seen": 338069665, + "step": 15677, + "time_per_iteration": 4.116663217544556 + }, + { + "auxiliary_loss_clip": 0.01057479, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.02851272, + "balance_loss_mlp": 1.02126765, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.8376438987561245, + "language_loss": 0.64410317, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66501653, + "num_input_tokens_seen": 338090490, + "step": 15678, + "time_per_iteration": 2.665377616882324 + }, + { + "auxiliary_loss_clip": 0.0106791, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.03454816, + "balance_loss_mlp": 1.01435256, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.4752240277435036, + "language_loss": 0.74004292, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76097333, + "num_input_tokens_seen": 338109825, + "step": 15679, + "time_per_iteration": 2.6466639041900635 + }, + { + "auxiliary_loss_clip": 0.01079711, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.03459334, + "balance_loss_mlp": 1.02075839, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.5471174169728235, + "language_loss": 0.77710593, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79822129, + "num_input_tokens_seen": 338125790, + "step": 15680, + "time_per_iteration": 2.6154532432556152 + }, + { + "auxiliary_loss_clip": 0.01085969, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.03298974, + "balance_loss_mlp": 1.0200243, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 1.9812330846317554, + "language_loss": 0.75628299, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77745044, + "num_input_tokens_seen": 338145610, + "step": 15681, + "time_per_iteration": 2.5850019454956055 + }, + { + "auxiliary_loss_clip": 0.01069583, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.03317499, + "balance_loss_mlp": 1.02398252, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 1.7913096946506881, + "language_loss": 0.65385544, + "learning_rate": 3.412540130236086e-08, + "loss": 0.67490256, + "num_input_tokens_seen": 338165960, + "step": 15682, + "time_per_iteration": 2.606696128845215 + }, + { + "auxiliary_loss_clip": 0.01062032, + "auxiliary_loss_mlp": 0.01024891, + "balance_loss_clip": 1.03176594, + "balance_loss_mlp": 1.01422715, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 1.9311215111787878, + "language_loss": 0.76304549, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78391469, + "num_input_tokens_seen": 338187215, + "step": 15683, + "time_per_iteration": 2.702380418777466 + }, + { + "auxiliary_loss_clip": 0.01087275, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.03352475, + "balance_loss_mlp": 1.02601385, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 2.7100822702496763, + "language_loss": 0.75293803, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77419281, + "num_input_tokens_seen": 338201825, + "step": 15684, + "time_per_iteration": 2.5127267837524414 + }, + { + "auxiliary_loss_clip": 0.01093449, + "auxiliary_loss_mlp": 0.01023556, + "balance_loss_clip": 1.03224897, + "balance_loss_mlp": 1.01337588, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.9341273387792761, + "language_loss": 0.77382976, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79499984, + "num_input_tokens_seen": 338220865, + "step": 15685, + "time_per_iteration": 2.522825241088867 + }, + { + "auxiliary_loss_clip": 0.0108351, + "auxiliary_loss_mlp": 0.01027009, + "balance_loss_clip": 1.03291345, + "balance_loss_mlp": 1.01693583, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 2.0480814214789347, + "language_loss": 0.75789487, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77900004, + "num_input_tokens_seen": 338240160, + "step": 15686, + "time_per_iteration": 2.6457929611206055 + }, + { + "auxiliary_loss_clip": 0.0108669, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.03196239, + "balance_loss_mlp": 1.01757371, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 2.6594159026257373, + "language_loss": 0.80810392, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82926226, + "num_input_tokens_seen": 338259305, + "step": 15687, + "time_per_iteration": 2.528369665145874 + }, + { + "auxiliary_loss_clip": 0.01078931, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.03332651, + "balance_loss_mlp": 1.02200711, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 2.175062971497702, + "language_loss": 0.75834143, + "learning_rate": 3.369691556873011e-08, + "loss": 0.77947176, + "num_input_tokens_seen": 338274950, + "step": 15688, + "time_per_iteration": 2.52950119972229 + }, + { + "auxiliary_loss_clip": 0.01065495, + "auxiliary_loss_mlp": 0.01026053, + "balance_loss_clip": 1.03015566, + "balance_loss_mlp": 1.01415586, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.8087728344203489, + "language_loss": 0.68602562, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70694113, + "num_input_tokens_seen": 338295585, + "step": 15689, + "time_per_iteration": 2.617140531539917 + }, + { + "auxiliary_loss_clip": 0.01080341, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.03122437, + "balance_loss_mlp": 1.022488, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.7369390037331942, + "language_loss": 0.80399644, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82511795, + "num_input_tokens_seen": 338314555, + "step": 15690, + "time_per_iteration": 2.5434610843658447 + }, + { + "auxiliary_loss_clip": 0.01086241, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.03382802, + "balance_loss_mlp": 1.01686192, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 1.9174582008882877, + "language_loss": 0.59990674, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62104076, + "num_input_tokens_seen": 338336260, + "step": 15691, + "time_per_iteration": 2.585468053817749 + }, + { + "auxiliary_loss_clip": 0.01053153, + "auxiliary_loss_mlp": 0.0102497, + "balance_loss_clip": 1.03144681, + "balance_loss_mlp": 1.01369905, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.718400796751503, + "language_loss": 0.66307414, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68385535, + "num_input_tokens_seen": 338354680, + "step": 15692, + "time_per_iteration": 2.6175930500030518 + }, + { + "auxiliary_loss_clip": 0.01083498, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.03341103, + "balance_loss_mlp": 1.02221799, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.498693172456064, + "language_loss": 0.74886227, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77002788, + "num_input_tokens_seen": 338372490, + "step": 15693, + "time_per_iteration": 2.549333095550537 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.03177881, + "balance_loss_mlp": 1.02312768, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 1.5679783476023397, + "language_loss": 0.73199594, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75296235, + "num_input_tokens_seen": 338390870, + "step": 15694, + "time_per_iteration": 2.7188827991485596 + }, + { + "auxiliary_loss_clip": 0.00981838, + "auxiliary_loss_mlp": 0.00999386, + "balance_loss_clip": 1.00392652, + "balance_loss_mlp": 0.99825948, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.6997081798411987, + "language_loss": 0.50550461, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52531683, + "num_input_tokens_seen": 338453075, + "step": 15695, + "time_per_iteration": 3.265320062637329 + }, + { + "auxiliary_loss_clip": 0.01062636, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.0282526, + "balance_loss_mlp": 1.01960814, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.6559778583720424, + "language_loss": 0.65079713, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67172539, + "num_input_tokens_seen": 338471770, + "step": 15696, + "time_per_iteration": 4.2287280559539795 + }, + { + "auxiliary_loss_clip": 0.01086373, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.03334033, + "balance_loss_mlp": 1.01611543, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.7144490127324807, + "language_loss": 0.66562831, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68675888, + "num_input_tokens_seen": 338492190, + "step": 15697, + "time_per_iteration": 2.6322579383850098 + }, + { + "auxiliary_loss_clip": 0.00993782, + "auxiliary_loss_mlp": 0.01006851, + "balance_loss_clip": 1.00347424, + "balance_loss_mlp": 1.00593936, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8431556238638263, + "language_loss": 0.63182354, + "learning_rate": 3.298873795868506e-08, + "loss": 0.6518299, + "num_input_tokens_seen": 338552560, + "step": 15698, + "time_per_iteration": 3.105377435684204 + }, + { + "auxiliary_loss_clip": 0.01077979, + "auxiliary_loss_mlp": 0.01038725, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.02660775, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 2.0490718760576754, + "language_loss": 0.69954425, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72071129, + "num_input_tokens_seen": 338571770, + "step": 15699, + "time_per_iteration": 2.5822978019714355 + }, + { + "auxiliary_loss_clip": 0.01056349, + "auxiliary_loss_mlp": 0.01027553, + "balance_loss_clip": 1.03004861, + "balance_loss_mlp": 1.01718771, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 2.289097707934507, + "language_loss": 0.74396807, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76480711, + "num_input_tokens_seen": 338587310, + "step": 15700, + "time_per_iteration": 2.5656285285949707 + }, + { + "auxiliary_loss_clip": 0.0101389, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02629018, + "balance_loss_mlp": 1.02189732, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 6.712113470358545, + "language_loss": 0.70560741, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72607219, + "num_input_tokens_seen": 338606235, + "step": 15701, + "time_per_iteration": 2.737300395965576 + }, + { + "auxiliary_loss_clip": 0.01054553, + "auxiliary_loss_mlp": 0.01025425, + "balance_loss_clip": 1.03027749, + "balance_loss_mlp": 1.01436257, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 9.198094776979627, + "language_loss": 0.77802682, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.79882658, + "num_input_tokens_seen": 338624090, + "step": 15702, + "time_per_iteration": 4.1577417850494385 + }, + { + "auxiliary_loss_clip": 0.01080892, + "auxiliary_loss_mlp": 0.0103893, + "balance_loss_clip": 1.03215015, + "balance_loss_mlp": 1.02705729, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.6961098525247462, + "language_loss": 0.66475666, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68595487, + "num_input_tokens_seen": 338643695, + "step": 15703, + "time_per_iteration": 2.5601003170013428 + }, + { + "auxiliary_loss_clip": 0.0108717, + "auxiliary_loss_mlp": 0.01025223, + "balance_loss_clip": 1.03503036, + "balance_loss_mlp": 1.0139997, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 1.5753341885656489, + "language_loss": 0.73380184, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75492573, + "num_input_tokens_seen": 338664725, + "step": 15704, + "time_per_iteration": 2.6422860622406006 + }, + { + "auxiliary_loss_clip": 0.01084768, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.03406668, + "balance_loss_mlp": 1.02321446, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.8786391374915896, + "language_loss": 0.74681044, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76799977, + "num_input_tokens_seen": 338683990, + "step": 15705, + "time_per_iteration": 2.5664196014404297 + }, + { + "auxiliary_loss_clip": 0.01074875, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.03376579, + "balance_loss_mlp": 1.02215421, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 1.780765715867591, + "language_loss": 0.77152485, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79259694, + "num_input_tokens_seen": 338702025, + "step": 15706, + "time_per_iteration": 2.6197144985198975 + }, + { + "auxiliary_loss_clip": 0.01082402, + "auxiliary_loss_mlp": 0.01023056, + "balance_loss_clip": 1.03205681, + "balance_loss_mlp": 1.01316786, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.4587551543336157, + "language_loss": 0.69266677, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71372133, + "num_input_tokens_seen": 338720920, + "step": 15707, + "time_per_iteration": 2.51747465133667 + }, + { + "auxiliary_loss_clip": 0.01088169, + "auxiliary_loss_mlp": 0.01024601, + "balance_loss_clip": 1.028929, + "balance_loss_mlp": 1.01487339, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.7733032643609918, + "language_loss": 0.69259965, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71372736, + "num_input_tokens_seen": 338739590, + "step": 15708, + "time_per_iteration": 4.036776304244995 + }, + { + "auxiliary_loss_clip": 0.01086845, + "auxiliary_loss_mlp": 0.01025758, + "balance_loss_clip": 1.03431797, + "balance_loss_mlp": 1.0150826, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.3742102668440106, + "language_loss": 0.70411992, + "learning_rate": 3.221835774749748e-08, + "loss": 0.72524595, + "num_input_tokens_seen": 338757240, + "step": 15709, + "time_per_iteration": 2.5504307746887207 + }, + { + "auxiliary_loss_clip": 0.01057212, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.03511763, + "balance_loss_mlp": 1.01732445, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 2.810817982341996, + "language_loss": 0.84696555, + "learning_rate": 3.214877084074774e-08, + "loss": 0.86782116, + "num_input_tokens_seen": 338773750, + "step": 15710, + "time_per_iteration": 2.6546471118927 + }, + { + "auxiliary_loss_clip": 0.01072064, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.03479052, + "balance_loss_mlp": 1.01873004, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 3.2254548012287025, + "language_loss": 0.71386504, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73488873, + "num_input_tokens_seen": 338792115, + "step": 15711, + "time_per_iteration": 2.6584861278533936 + }, + { + "auxiliary_loss_clip": 0.0108501, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.03450453, + "balance_loss_mlp": 1.01824522, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.692699856875575, + "language_loss": 0.68956101, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71070611, + "num_input_tokens_seen": 338812480, + "step": 15712, + "time_per_iteration": 2.563392162322998 + }, + { + "auxiliary_loss_clip": 0.01089946, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.03511095, + "balance_loss_mlp": 1.02153444, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.0856472249101237, + "language_loss": 0.70678866, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.72802365, + "num_input_tokens_seen": 338829105, + "step": 15713, + "time_per_iteration": 2.5544395446777344 + }, + { + "auxiliary_loss_clip": 0.01071056, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.03058243, + "balance_loss_mlp": 1.0202502, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.7298662075544948, + "language_loss": 0.76665497, + "learning_rate": 3.187116945125212e-08, + "loss": 0.78768307, + "num_input_tokens_seen": 338850670, + "step": 15714, + "time_per_iteration": 2.6328625679016113 + }, + { + "auxiliary_loss_clip": 0.01071679, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.03553391, + "balance_loss_mlp": 1.01778352, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 1.7815343046474383, + "language_loss": 0.67265743, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69366658, + "num_input_tokens_seen": 338867795, + "step": 15715, + "time_per_iteration": 2.654661178588867 + }, + { + "auxiliary_loss_clip": 0.0106598, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.03305364, + "balance_loss_mlp": 1.01933599, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 2.590041110044937, + "language_loss": 0.74611032, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76707733, + "num_input_tokens_seen": 338887205, + "step": 15716, + "time_per_iteration": 2.690763473510742 + }, + { + "auxiliary_loss_clip": 0.01082424, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.03757465, + "balance_loss_mlp": 1.02001262, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.6601415330927745, + "language_loss": 0.62269092, + "learning_rate": 3.166375203215565e-08, + "loss": 0.6438266, + "num_input_tokens_seen": 338906130, + "step": 15717, + "time_per_iteration": 4.137066125869751 + }, + { + "auxiliary_loss_clip": 0.01082044, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.0346818, + "balance_loss_mlp": 1.01795208, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.602178221380621, + "language_loss": 0.7863881, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.80749452, + "num_input_tokens_seen": 338923045, + "step": 15718, + "time_per_iteration": 2.527433156967163 + }, + { + "auxiliary_loss_clip": 0.01014033, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 1.00416422, + "balance_loss_mlp": 1.00186145, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.6971541969351476, + "language_loss": 0.57803416, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59820294, + "num_input_tokens_seen": 338987545, + "step": 15719, + "time_per_iteration": 3.12796688079834 + }, + { + "auxiliary_loss_clip": 0.01053124, + "auxiliary_loss_mlp": 0.00749402, + "balance_loss_clip": 1.03125346, + "balance_loss_mlp": 1.00022697, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 2.052370417374617, + "language_loss": 0.75937164, + "learning_rate": 3.145700636861193e-08, + "loss": 0.77739692, + "num_input_tokens_seen": 339007830, + "step": 15720, + "time_per_iteration": 2.681025743484497 + }, + { + "auxiliary_loss_clip": 0.01081769, + "auxiliary_loss_mlp": 0.01027074, + "balance_loss_clip": 1.03195095, + "balance_loss_mlp": 1.01700675, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.789989057394472, + "language_loss": 0.7280587, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74914712, + "num_input_tokens_seen": 339028980, + "step": 15721, + "time_per_iteration": 2.5609607696533203 + }, + { + "auxiliary_loss_clip": 0.01048227, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.02960491, + "balance_loss_mlp": 1.02055037, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 1.9524083027128973, + "language_loss": 0.85164917, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87245178, + "num_input_tokens_seen": 339047950, + "step": 15722, + "time_per_iteration": 2.662947416305542 + }, + { + "auxiliary_loss_clip": 0.01003129, + "auxiliary_loss_mlp": 0.00997321, + "balance_loss_clip": 1.00417256, + "balance_loss_mlp": 0.99642068, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.8999734894259216, + "language_loss": 0.6444779, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66448241, + "num_input_tokens_seen": 339104535, + "step": 15723, + "time_per_iteration": 3.1397082805633545 + }, + { + "auxiliary_loss_clip": 0.01056838, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.03180981, + "balance_loss_mlp": 1.01608717, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 2.0438061902643976, + "language_loss": 0.72965205, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75049496, + "num_input_tokens_seen": 339122050, + "step": 15724, + "time_per_iteration": 2.6370339393615723 + }, + { + "auxiliary_loss_clip": 0.01062158, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.03010106, + "balance_loss_mlp": 1.01691413, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.4419122363990695, + "language_loss": 0.8480764, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86897182, + "num_input_tokens_seen": 339138940, + "step": 15725, + "time_per_iteration": 2.657701253890991 + }, + { + "auxiliary_loss_clip": 0.01079845, + "auxiliary_loss_mlp": 0.01023677, + "balance_loss_clip": 1.03579164, + "balance_loss_mlp": 1.01293612, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 2.5421672825543773, + "language_loss": 0.7087664, + "learning_rate": 3.104553059018822e-08, + "loss": 0.72980165, + "num_input_tokens_seen": 339158245, + "step": 15726, + "time_per_iteration": 2.648902416229248 + }, + { + "auxiliary_loss_clip": 0.01069387, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.03100741, + "balance_loss_mlp": 1.01885474, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.9919786363150087, + "language_loss": 0.60563636, + "learning_rate": 3.097721259896735e-08, + "loss": 0.62663496, + "num_input_tokens_seen": 339178200, + "step": 15727, + "time_per_iteration": 2.6467227935791016 + }, + { + "auxiliary_loss_clip": 0.01082568, + "auxiliary_loss_mlp": 0.01034113, + "balance_loss_clip": 1.03088081, + "balance_loss_mlp": 1.02357471, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.7466871591230826, + "language_loss": 0.81719053, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.83835733, + "num_input_tokens_seen": 339193950, + "step": 15728, + "time_per_iteration": 2.5959982872009277 + }, + { + "auxiliary_loss_clip": 0.00973613, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.00355268, + "balance_loss_mlp": 1.00745666, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7310526200146186, + "language_loss": 0.5899446, + "learning_rate": 3.08408006157368e-08, + "loss": 0.60976684, + "num_input_tokens_seen": 339252330, + "step": 15729, + "time_per_iteration": 3.2115752696990967 + }, + { + "auxiliary_loss_clip": 0.01094692, + "auxiliary_loss_mlp": 0.01024212, + "balance_loss_clip": 1.03243208, + "balance_loss_mlp": 1.01322114, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 2.6351972020290733, + "language_loss": 0.76631105, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78750008, + "num_input_tokens_seen": 339270325, + "step": 15730, + "time_per_iteration": 3.079432725906372 + }, + { + "auxiliary_loss_clip": 0.01072817, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.03562474, + "balance_loss_mlp": 1.01923764, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.3993887891163515, + "language_loss": 0.62690234, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64794183, + "num_input_tokens_seen": 339291980, + "step": 15731, + "time_per_iteration": 2.7190470695495605 + }, + { + "auxiliary_loss_clip": 0.01087814, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.03227699, + "balance_loss_mlp": 1.01578355, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 1.9773747413874307, + "language_loss": 0.64319277, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66434503, + "num_input_tokens_seen": 339311795, + "step": 15732, + "time_per_iteration": 2.659669876098633 + }, + { + "auxiliary_loss_clip": 0.01085608, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.03478575, + "balance_loss_mlp": 1.01720655, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 1.8184056846103054, + "language_loss": 0.84279603, + "learning_rate": 3.056887271848363e-08, + "loss": 0.86394364, + "num_input_tokens_seen": 339327745, + "step": 15733, + "time_per_iteration": 2.744471549987793 + }, + { + "auxiliary_loss_clip": 0.01081911, + "auxiliary_loss_mlp": 0.01027285, + "balance_loss_clip": 1.0319984, + "balance_loss_mlp": 1.01757598, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 2.296432838173669, + "language_loss": 0.72224188, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74333382, + "num_input_tokens_seen": 339346445, + "step": 15734, + "time_per_iteration": 2.6199491024017334 + }, + { + "auxiliary_loss_clip": 0.01079845, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03022695, + "balance_loss_mlp": 1.02089751, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.3572060317394843, + "language_loss": 0.8668344, + "learning_rate": 3.043335684570692e-08, + "loss": 0.88793147, + "num_input_tokens_seen": 339367945, + "step": 15735, + "time_per_iteration": 2.5756258964538574 + }, + { + "auxiliary_loss_clip": 0.01076859, + "auxiliary_loss_mlp": 0.01026584, + "balance_loss_clip": 1.03207302, + "balance_loss_mlp": 1.01611137, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 1.9128283231793866, + "language_loss": 0.67544401, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69647843, + "num_input_tokens_seen": 339386060, + "step": 15736, + "time_per_iteration": 4.30360221862793 + }, + { + "auxiliary_loss_clip": 0.00983619, + "auxiliary_loss_mlp": 0.01006287, + "balance_loss_clip": 1.00720072, + "balance_loss_mlp": 1.00511873, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8709465041097708, + "language_loss": 0.65263557, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67253464, + "num_input_tokens_seen": 339446695, + "step": 15737, + "time_per_iteration": 3.254063844680786 + }, + { + "auxiliary_loss_clip": 0.01013191, + "auxiliary_loss_mlp": 0.01000162, + "balance_loss_clip": 1.00563049, + "balance_loss_mlp": 0.9992258, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.8109949971383478, + "language_loss": 0.58770567, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60783917, + "num_input_tokens_seen": 339510080, + "step": 15738, + "time_per_iteration": 3.154536247253418 + }, + { + "auxiliary_loss_clip": 0.01081986, + "auxiliary_loss_mlp": 0.0102943, + "balance_loss_clip": 1.03081322, + "balance_loss_mlp": 1.01961923, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 2.1205742036017843, + "language_loss": 0.71463239, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73574656, + "num_input_tokens_seen": 339529335, + "step": 15739, + "time_per_iteration": 2.64428973197937 + }, + { + "auxiliary_loss_clip": 0.01080488, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.02983093, + "balance_loss_mlp": 1.0176692, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 3.265921553790536, + "language_loss": 0.64598459, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66708219, + "num_input_tokens_seen": 339548820, + "step": 15740, + "time_per_iteration": 2.5556578636169434 + }, + { + "auxiliary_loss_clip": 0.01072579, + "auxiliary_loss_mlp": 0.01026849, + "balance_loss_clip": 1.03264856, + "balance_loss_mlp": 1.01635838, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.7777854714170782, + "language_loss": 0.66399646, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68499076, + "num_input_tokens_seen": 339566775, + "step": 15741, + "time_per_iteration": 2.589630365371704 + }, + { + "auxiliary_loss_clip": 0.010864, + "auxiliary_loss_mlp": 0.01024109, + "balance_loss_clip": 1.03376341, + "balance_loss_mlp": 1.01363075, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 1.9931752691483813, + "language_loss": 0.75864995, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.77975512, + "num_input_tokens_seen": 339581905, + "step": 15742, + "time_per_iteration": 4.001744508743286 + }, + { + "auxiliary_loss_clip": 0.01080926, + "auxiliary_loss_mlp": 0.01028397, + "balance_loss_clip": 1.03107858, + "balance_loss_mlp": 1.01798368, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 1.7999250255500703, + "language_loss": 0.72162241, + "learning_rate": 2.989428100602187e-08, + "loss": 0.7427156, + "num_input_tokens_seen": 339599870, + "step": 15743, + "time_per_iteration": 2.496525764465332 + }, + { + "auxiliary_loss_clip": 0.01063056, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.03530359, + "balance_loss_mlp": 1.01998496, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 2.0002926240767245, + "language_loss": 0.79785681, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81879884, + "num_input_tokens_seen": 339620250, + "step": 15744, + "time_per_iteration": 2.7475669384002686 + }, + { + "auxiliary_loss_clip": 0.01074979, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.0328989, + "balance_loss_mlp": 1.02175474, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 2.0186092157595996, + "language_loss": 0.78364813, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80473042, + "num_input_tokens_seen": 339639900, + "step": 15745, + "time_per_iteration": 2.5815861225128174 + }, + { + "auxiliary_loss_clip": 0.01069743, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.03027713, + "balance_loss_mlp": 1.01831174, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.797883901032433, + "language_loss": 0.70044625, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72144121, + "num_input_tokens_seen": 339658970, + "step": 15746, + "time_per_iteration": 2.585634708404541 + }, + { + "auxiliary_loss_clip": 0.01073791, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.03416348, + "balance_loss_mlp": 1.01640308, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.3869824159199444, + "language_loss": 0.56527448, + "learning_rate": 2.962653596305964e-08, + "loss": 0.58629227, + "num_input_tokens_seen": 339675600, + "step": 15747, + "time_per_iteration": 2.5871078968048096 + }, + { + "auxiliary_loss_clip": 0.00967389, + "auxiliary_loss_mlp": 0.01001331, + "balance_loss_clip": 1.00693786, + "balance_loss_mlp": 1.00009155, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6566956229019251, + "language_loss": 0.53256011, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55224729, + "num_input_tokens_seen": 339744505, + "step": 15748, + "time_per_iteration": 4.952121734619141 + }, + { + "auxiliary_loss_clip": 0.01075275, + "auxiliary_loss_mlp": 0.01040386, + "balance_loss_clip": 1.03287745, + "balance_loss_mlp": 1.02970493, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 2.004052962217692, + "language_loss": 0.66404366, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68520021, + "num_input_tokens_seen": 339765810, + "step": 15749, + "time_per_iteration": 3.042396068572998 + }, + { + "auxiliary_loss_clip": 0.01062801, + "auxiliary_loss_mlp": 0.01028428, + "balance_loss_clip": 1.0293287, + "balance_loss_mlp": 1.01600003, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 11.634642699286129, + "language_loss": 0.76496834, + "learning_rate": 2.942651169791621e-08, + "loss": 0.78588063, + "num_input_tokens_seen": 339784125, + "step": 15750, + "time_per_iteration": 2.643441915512085 + }, + { + "auxiliary_loss_clip": 0.01085256, + "auxiliary_loss_mlp": 0.0102521, + "balance_loss_clip": 1.03340733, + "balance_loss_mlp": 1.01482105, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.8198812488746157, + "language_loss": 0.68097425, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70207894, + "num_input_tokens_seen": 339803450, + "step": 15751, + "time_per_iteration": 2.555798292160034 + }, + { + "auxiliary_loss_clip": 0.01060634, + "auxiliary_loss_mlp": 0.01025224, + "balance_loss_clip": 1.02996409, + "balance_loss_mlp": 1.01466155, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.697561542840635, + "language_loss": 0.65653092, + "learning_rate": 2.929353580532723e-08, + "loss": 0.6773895, + "num_input_tokens_seen": 339823215, + "step": 15752, + "time_per_iteration": 2.6390092372894287 + }, + { + "auxiliary_loss_clip": 0.01078988, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.03024983, + "balance_loss_mlp": 1.01959753, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.5923385648684103, + "language_loss": 0.71542573, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.7365365, + "num_input_tokens_seen": 339842230, + "step": 15753, + "time_per_iteration": 2.600745439529419 + }, + { + "auxiliary_loss_clip": 0.01097265, + "auxiliary_loss_mlp": 0.01031515, + "balance_loss_clip": 1.03200865, + "balance_loss_mlp": 1.01898587, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.8684043540845146, + "language_loss": 0.70160091, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72288871, + "num_input_tokens_seen": 339861640, + "step": 15754, + "time_per_iteration": 2.5110368728637695 + }, + { + "auxiliary_loss_clip": 0.01100109, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.0334965, + "balance_loss_mlp": 1.018677, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.7464264268406406, + "language_loss": 0.79065681, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.81195462, + "num_input_tokens_seen": 339878210, + "step": 15755, + "time_per_iteration": 2.48807692527771 + }, + { + "auxiliary_loss_clip": 0.0105753, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.03213429, + "balance_loss_mlp": 1.02081287, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.631409869474144, + "language_loss": 0.75159383, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77251208, + "num_input_tokens_seen": 339894255, + "step": 15756, + "time_per_iteration": 2.5942134857177734 + }, + { + "auxiliary_loss_clip": 0.01067381, + "auxiliary_loss_mlp": 0.01027766, + "balance_loss_clip": 1.02863765, + "balance_loss_mlp": 1.01697755, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 2.5848474536204176, + "language_loss": 0.75238287, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.77333438, + "num_input_tokens_seen": 339912425, + "step": 15757, + "time_per_iteration": 4.171384811401367 + }, + { + "auxiliary_loss_clip": 0.01078769, + "auxiliary_loss_mlp": 0.01029746, + "balance_loss_clip": 1.03233278, + "balance_loss_mlp": 1.01783681, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.179284902027879, + "language_loss": 0.79286104, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81394619, + "num_input_tokens_seen": 339929635, + "step": 15758, + "time_per_iteration": 2.5986266136169434 + }, + { + "auxiliary_loss_clip": 0.01062433, + "auxiliary_loss_mlp": 0.00749309, + "balance_loss_clip": 1.03082657, + "balance_loss_mlp": 1.00017977, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.4450545599017826, + "language_loss": 0.71808875, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.73620611, + "num_input_tokens_seen": 339951200, + "step": 15759, + "time_per_iteration": 2.6584532260894775 + }, + { + "auxiliary_loss_clip": 0.01081344, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.03449237, + "balance_loss_mlp": 1.01674461, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.5061488779979568, + "language_loss": 0.75388908, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77496636, + "num_input_tokens_seen": 339971820, + "step": 15760, + "time_per_iteration": 2.6404991149902344 + }, + { + "auxiliary_loss_clip": 0.01096628, + "auxiliary_loss_mlp": 0.00749329, + "balance_loss_clip": 1.03357899, + "balance_loss_mlp": 1.0002141, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.8087708724564033, + "language_loss": 0.72769201, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.74615157, + "num_input_tokens_seen": 339989420, + "step": 15761, + "time_per_iteration": 2.479926109313965 + }, + { + "auxiliary_loss_clip": 0.01076247, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.03626633, + "balance_loss_mlp": 1.02212691, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.1233555221175697, + "language_loss": 0.72131193, + "learning_rate": 2.863314050734722e-08, + "loss": 0.74239951, + "num_input_tokens_seen": 340006690, + "step": 15762, + "time_per_iteration": 2.6036124229431152 + }, + { + "auxiliary_loss_clip": 0.01099721, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.03277278, + "balance_loss_mlp": 1.02063882, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 1.8452474536751005, + "language_loss": 0.67133403, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69265556, + "num_input_tokens_seen": 340025480, + "step": 15763, + "time_per_iteration": 2.450410842895508 + }, + { + "auxiliary_loss_clip": 0.01096532, + "auxiliary_loss_mlp": 0.01030954, + "balance_loss_clip": 1.03237033, + "balance_loss_mlp": 1.01998675, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.8551646853940957, + "language_loss": 0.69915807, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.720433, + "num_input_tokens_seen": 340043785, + "step": 15764, + "time_per_iteration": 2.504361391067505 + }, + { + "auxiliary_loss_clip": 0.01084348, + "auxiliary_loss_mlp": 0.00749088, + "balance_loss_clip": 1.03528106, + "balance_loss_mlp": 1.00020194, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.7582938050980808, + "language_loss": 0.71210897, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.73044336, + "num_input_tokens_seen": 340064360, + "step": 15765, + "time_per_iteration": 2.574655532836914 + }, + { + "auxiliary_loss_clip": 0.01000441, + "auxiliary_loss_mlp": 0.01001147, + "balance_loss_clip": 1.00245988, + "balance_loss_mlp": 1.00021732, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8068769052095736, + "language_loss": 0.59071147, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61072731, + "num_input_tokens_seen": 340114425, + "step": 15766, + "time_per_iteration": 2.884787082672119 + }, + { + "auxiliary_loss_clip": 0.01039467, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.02932131, + "balance_loss_mlp": 1.02888811, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 2.0243797972620188, + "language_loss": 0.73923445, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76003623, + "num_input_tokens_seen": 340132200, + "step": 15767, + "time_per_iteration": 2.6170058250427246 + }, + { + "auxiliary_loss_clip": 0.01068764, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03377247, + "balance_loss_mlp": 1.01953435, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.6987742825768217, + "language_loss": 0.73383468, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75483811, + "num_input_tokens_seen": 340149175, + "step": 15768, + "time_per_iteration": 2.610447406768799 + }, + { + "auxiliary_loss_clip": 0.00983625, + "auxiliary_loss_mlp": 0.01001827, + "balance_loss_clip": 1.00404847, + "balance_loss_mlp": 1.00096881, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7335365633123254, + "language_loss": 0.55325311, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57310766, + "num_input_tokens_seen": 340208155, + "step": 15769, + "time_per_iteration": 3.19161319732666 + }, + { + "auxiliary_loss_clip": 0.01045843, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.02947402, + "balance_loss_mlp": 1.01949, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.4432438053286312, + "language_loss": 0.77340549, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79416311, + "num_input_tokens_seen": 340229275, + "step": 15770, + "time_per_iteration": 2.706575632095337 + }, + { + "auxiliary_loss_clip": 0.01081122, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.03651977, + "balance_loss_mlp": 1.02352262, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 1.9377147999355764, + "language_loss": 0.8032546, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82442224, + "num_input_tokens_seen": 340248920, + "step": 15771, + "time_per_iteration": 2.6617395877838135 + }, + { + "auxiliary_loss_clip": 0.01060946, + "auxiliary_loss_mlp": 0.01025306, + "balance_loss_clip": 1.02967858, + "balance_loss_mlp": 1.01454782, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 1.9863885087009565, + "language_loss": 0.69633693, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71719944, + "num_input_tokens_seen": 340266775, + "step": 15772, + "time_per_iteration": 2.6427507400512695 + }, + { + "auxiliary_loss_clip": 0.0107777, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.03182709, + "balance_loss_mlp": 1.01651812, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.534891397183761, + "language_loss": 0.73936617, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.76042557, + "num_input_tokens_seen": 340285295, + "step": 15773, + "time_per_iteration": 2.560702085494995 + }, + { + "auxiliary_loss_clip": 0.01063075, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.03082776, + "balance_loss_mlp": 1.02030659, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 4.330958492800273, + "language_loss": 0.62649888, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.64744616, + "num_input_tokens_seen": 340304265, + "step": 15774, + "time_per_iteration": 2.5991575717926025 + }, + { + "auxiliary_loss_clip": 0.01096837, + "auxiliary_loss_mlp": 0.01028739, + "balance_loss_clip": 1.03264475, + "balance_loss_mlp": 1.01763475, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 2.7716321320028943, + "language_loss": 0.59007406, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61132985, + "num_input_tokens_seen": 340323690, + "step": 15775, + "time_per_iteration": 2.4872331619262695 + }, + { + "auxiliary_loss_clip": 0.01077553, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.03411376, + "balance_loss_mlp": 1.01567471, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.6036779335630478, + "language_loss": 0.61755157, + "learning_rate": 2.772114638584555e-08, + "loss": 0.63859969, + "num_input_tokens_seen": 340345830, + "step": 15776, + "time_per_iteration": 4.1970508098602295 + }, + { + "auxiliary_loss_clip": 0.01066953, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.02995467, + "balance_loss_mlp": 1.01862693, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 1.7505789556158744, + "language_loss": 0.73946631, + "learning_rate": 2.765656478622458e-08, + "loss": 0.76043749, + "num_input_tokens_seen": 340365910, + "step": 15777, + "time_per_iteration": 2.5616087913513184 + }, + { + "auxiliary_loss_clip": 0.01096279, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.03580403, + "balance_loss_mlp": 1.02284658, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.147326764582507, + "language_loss": 0.72139257, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74270523, + "num_input_tokens_seen": 340383935, + "step": 15778, + "time_per_iteration": 2.537961959838867 + }, + { + "auxiliary_loss_clip": 0.01083503, + "auxiliary_loss_mlp": 0.00749124, + "balance_loss_clip": 1.03428698, + "balance_loss_mlp": 1.00017428, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 6.930546352639484, + "language_loss": 0.70026016, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.71858644, + "num_input_tokens_seen": 340402760, + "step": 15779, + "time_per_iteration": 2.5841832160949707 + }, + { + "auxiliary_loss_clip": 0.01098072, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.03433168, + "balance_loss_mlp": 1.01914048, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 6.397410894466462, + "language_loss": 0.78347814, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80476499, + "num_input_tokens_seen": 340422105, + "step": 15780, + "time_per_iteration": 2.520625591278076 + }, + { + "auxiliary_loss_clip": 0.0107291, + "auxiliary_loss_mlp": 0.00749294, + "balance_loss_clip": 1.03586006, + "balance_loss_mlp": 1.00019777, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.6232530165864927, + "language_loss": 0.6624499, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68067193, + "num_input_tokens_seen": 340441160, + "step": 15781, + "time_per_iteration": 4.124101161956787 + }, + { + "auxiliary_loss_clip": 0.01096617, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.03415203, + "balance_loss_mlp": 1.02054727, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 2.064633529728026, + "language_loss": 0.79603708, + "learning_rate": 2.733477870890999e-08, + "loss": 0.81731856, + "num_input_tokens_seen": 340458200, + "step": 15782, + "time_per_iteration": 2.5120770931243896 + }, + { + "auxiliary_loss_clip": 0.01013632, + "auxiliary_loss_mlp": 0.01002423, + "balance_loss_clip": 1.00389147, + "balance_loss_mlp": 1.00151753, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.721765797610406, + "language_loss": 0.5981434, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61830395, + "num_input_tokens_seen": 340526420, + "step": 15783, + "time_per_iteration": 3.2666616439819336 + }, + { + "auxiliary_loss_clip": 0.010849, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.03128362, + "balance_loss_mlp": 1.02008057, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.6948877177374704, + "language_loss": 0.7407555, + "learning_rate": 2.720658788656105e-08, + "loss": 0.7619195, + "num_input_tokens_seen": 340546325, + "step": 15784, + "time_per_iteration": 2.6852126121520996 + }, + { + "auxiliary_loss_clip": 0.01044675, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.03207397, + "balance_loss_mlp": 1.01745749, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 2.2251201410500236, + "language_loss": 0.69906855, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71981311, + "num_input_tokens_seen": 340565145, + "step": 15785, + "time_per_iteration": 2.7278714179992676 + }, + { + "auxiliary_loss_clip": 0.01098143, + "auxiliary_loss_mlp": 0.01026142, + "balance_loss_clip": 1.03294325, + "balance_loss_mlp": 1.01509118, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.3726185403243136, + "language_loss": 0.76013184, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78137469, + "num_input_tokens_seen": 340585465, + "step": 15786, + "time_per_iteration": 2.4958901405334473 + }, + { + "auxiliary_loss_clip": 0.01064985, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.03343153, + "balance_loss_mlp": 1.0184164, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.6255182189056865, + "language_loss": 0.78855908, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.80949318, + "num_input_tokens_seen": 340606010, + "step": 15787, + "time_per_iteration": 4.043046951293945 + }, + { + "auxiliary_loss_clip": 0.01086011, + "auxiliary_loss_mlp": 0.01027837, + "balance_loss_clip": 1.03506672, + "balance_loss_mlp": 1.01772857, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.8147462428143932, + "language_loss": 0.76315892, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78429741, + "num_input_tokens_seen": 340626135, + "step": 15788, + "time_per_iteration": 2.4864025115966797 + }, + { + "auxiliary_loss_clip": 0.01086651, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.03242886, + "balance_loss_mlp": 1.01824582, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 2.0486359820134594, + "language_loss": 0.7114315, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73259652, + "num_input_tokens_seen": 340644870, + "step": 15789, + "time_per_iteration": 2.4947354793548584 + }, + { + "auxiliary_loss_clip": 0.0104653, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.03076839, + "balance_loss_mlp": 1.01509368, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 2.014783134304538, + "language_loss": 0.73415375, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75488758, + "num_input_tokens_seen": 340663695, + "step": 15790, + "time_per_iteration": 2.5288925170898438 + }, + { + "auxiliary_loss_clip": 0.01059204, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.03016746, + "balance_loss_mlp": 1.01727247, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.8694559160395496, + "language_loss": 0.77829766, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79917705, + "num_input_tokens_seen": 340682970, + "step": 15791, + "time_per_iteration": 2.5104684829711914 + }, + { + "auxiliary_loss_clip": 0.01089212, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.03318834, + "balance_loss_mlp": 1.01725936, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 1.9946261278227753, + "language_loss": 0.73970693, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.76088756, + "num_input_tokens_seen": 340702275, + "step": 15792, + "time_per_iteration": 2.6308975219726562 + }, + { + "auxiliary_loss_clip": 0.01083409, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.03083897, + "balance_loss_mlp": 1.02572, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 2.5152896526916018, + "language_loss": 0.78302222, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80421782, + "num_input_tokens_seen": 340719060, + "step": 15793, + "time_per_iteration": 2.5094339847564697 + }, + { + "auxiliary_loss_clip": 0.01067298, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.03142953, + "balance_loss_mlp": 1.01653481, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 2.012260207780775, + "language_loss": 0.77257597, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79352045, + "num_input_tokens_seen": 340737815, + "step": 15794, + "time_per_iteration": 2.579606533050537 + }, + { + "auxiliary_loss_clip": 0.01066764, + "auxiliary_loss_mlp": 0.00749469, + "balance_loss_clip": 1.03213203, + "balance_loss_mlp": 1.00027561, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.9168475285746405, + "language_loss": 0.61029637, + "learning_rate": 2.650688769211107e-08, + "loss": 0.62845874, + "num_input_tokens_seen": 340756035, + "step": 15795, + "time_per_iteration": 2.6476845741271973 + }, + { + "auxiliary_loss_clip": 0.01084171, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.03324676, + "balance_loss_mlp": 1.01875448, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.612009540369926, + "language_loss": 0.78887814, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81002021, + "num_input_tokens_seen": 340775620, + "step": 15796, + "time_per_iteration": 2.597445487976074 + }, + { + "auxiliary_loss_clip": 0.01087347, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.03372991, + "balance_loss_mlp": 1.01489854, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 1.9930247442542637, + "language_loss": 0.75940621, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.78054661, + "num_input_tokens_seen": 340794510, + "step": 15797, + "time_per_iteration": 2.512002468109131 + }, + { + "auxiliary_loss_clip": 0.01063319, + "auxiliary_loss_mlp": 0.00749429, + "balance_loss_clip": 1.03331709, + "balance_loss_mlp": 1.00022435, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 1.9013083127448818, + "language_loss": 0.65741408, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67554152, + "num_input_tokens_seen": 340812955, + "step": 15798, + "time_per_iteration": 4.1817381381988525 + }, + { + "auxiliary_loss_clip": 0.01088553, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.03513646, + "balance_loss_mlp": 1.02005064, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 1.9812458725217381, + "language_loss": 0.77353287, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79472673, + "num_input_tokens_seen": 340829200, + "step": 15799, + "time_per_iteration": 2.5304677486419678 + }, + { + "auxiliary_loss_clip": 0.0107844, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.03161061, + "balance_loss_mlp": 1.02151132, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 2.35722510876499, + "language_loss": 0.71030211, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.73140699, + "num_input_tokens_seen": 340848035, + "step": 15800, + "time_per_iteration": 2.5434677600860596 + }, + { + "auxiliary_loss_clip": 0.01066063, + "auxiliary_loss_mlp": 0.01025104, + "balance_loss_clip": 1.0283972, + "balance_loss_mlp": 1.0139817, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.7986964794836844, + "language_loss": 0.72030365, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.74121529, + "num_input_tokens_seen": 340870025, + "step": 15801, + "time_per_iteration": 2.589901924133301 + }, + { + "auxiliary_loss_clip": 0.01085759, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.03291059, + "balance_loss_mlp": 1.01921511, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.451278835074722, + "language_loss": 0.80948734, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83064306, + "num_input_tokens_seen": 340892290, + "step": 15802, + "time_per_iteration": 2.583550453186035 + }, + { + "auxiliary_loss_clip": 0.01099532, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.03480959, + "balance_loss_mlp": 1.01915812, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 2.4593253306519776, + "language_loss": 0.6762619, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69755888, + "num_input_tokens_seen": 340912260, + "step": 15803, + "time_per_iteration": 2.5159366130828857 + }, + { + "auxiliary_loss_clip": 0.0106754, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.03082502, + "balance_loss_mlp": 1.02259302, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 1.7066393460587512, + "language_loss": 0.75964212, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.7806536, + "num_input_tokens_seen": 340928930, + "step": 15804, + "time_per_iteration": 2.5617153644561768 + }, + { + "auxiliary_loss_clip": 0.01081667, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.03412652, + "balance_loss_mlp": 1.0220691, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.6395444913562, + "language_loss": 0.73127103, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75242198, + "num_input_tokens_seen": 340946615, + "step": 15805, + "time_per_iteration": 2.51792049407959 + }, + { + "auxiliary_loss_clip": 0.01072546, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.03520393, + "balance_loss_mlp": 1.02141142, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 1.659132162204082, + "language_loss": 0.80319548, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82424843, + "num_input_tokens_seen": 340967545, + "step": 15806, + "time_per_iteration": 2.585296154022217 + }, + { + "auxiliary_loss_clip": 0.0105909, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.03184772, + "balance_loss_mlp": 1.01765275, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.4336546751035586, + "language_loss": 0.82219851, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84307718, + "num_input_tokens_seen": 340984955, + "step": 15807, + "time_per_iteration": 2.604407787322998 + }, + { + "auxiliary_loss_clip": 0.01075985, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.03016353, + "balance_loss_mlp": 1.01893377, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 1.7254375313387216, + "language_loss": 0.71369004, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73474979, + "num_input_tokens_seen": 341007300, + "step": 15808, + "time_per_iteration": 2.5642807483673096 + }, + { + "auxiliary_loss_clip": 0.010861, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.0330267, + "balance_loss_mlp": 1.0172472, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.497396899430949, + "language_loss": 0.7006126, + "learning_rate": 2.562945671948058e-08, + "loss": 0.7217499, + "num_input_tokens_seen": 341026695, + "step": 15809, + "time_per_iteration": 2.5611743927001953 + }, + { + "auxiliary_loss_clip": 0.01070345, + "auxiliary_loss_mlp": 0.01023885, + "balance_loss_clip": 1.03163004, + "balance_loss_mlp": 1.01318622, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 3.8957332108065934, + "language_loss": 0.75736248, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77830482, + "num_input_tokens_seen": 341047080, + "step": 15810, + "time_per_iteration": 2.597517967224121 + }, + { + "auxiliary_loss_clip": 0.01060807, + "auxiliary_loss_mlp": 0.01038285, + "balance_loss_clip": 1.02967536, + "balance_loss_mlp": 1.02642941, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.6668218283870946, + "language_loss": 0.79899567, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.81998658, + "num_input_tokens_seen": 341067310, + "step": 15811, + "time_per_iteration": 2.59727144241333 + }, + { + "auxiliary_loss_clip": 0.01068456, + "auxiliary_loss_mlp": 0.0103127, + "balance_loss_clip": 1.03114879, + "balance_loss_mlp": 1.01945591, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 2.4901744178095084, + "language_loss": 0.69661474, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.71761203, + "num_input_tokens_seen": 341085110, + "step": 15812, + "time_per_iteration": 2.614914894104004 + }, + { + "auxiliary_loss_clip": 0.01057116, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.03075361, + "balance_loss_mlp": 1.02330172, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 2.4174624236775193, + "language_loss": 0.65590727, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67683345, + "num_input_tokens_seen": 341103190, + "step": 15813, + "time_per_iteration": 2.6126883029937744 + }, + { + "auxiliary_loss_clip": 0.01088215, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.03340077, + "balance_loss_mlp": 1.02464008, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.4446426826515657, + "language_loss": 0.70397973, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72521776, + "num_input_tokens_seen": 341125695, + "step": 15814, + "time_per_iteration": 2.552661895751953 + }, + { + "auxiliary_loss_clip": 0.01084284, + "auxiliary_loss_mlp": 0.01026312, + "balance_loss_clip": 1.03293204, + "balance_loss_mlp": 1.01632881, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 2.592355137403011, + "language_loss": 0.63177395, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65287995, + "num_input_tokens_seen": 341143930, + "step": 15815, + "time_per_iteration": 2.6080219745635986 + }, + { + "auxiliary_loss_clip": 0.01072969, + "auxiliary_loss_mlp": 0.01027128, + "balance_loss_clip": 1.03205681, + "balance_loss_mlp": 1.01672697, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.7273739730500786, + "language_loss": 0.58373046, + "learning_rate": 2.519624364862061e-08, + "loss": 0.60473144, + "num_input_tokens_seen": 341164280, + "step": 15816, + "time_per_iteration": 4.17775559425354 + }, + { + "auxiliary_loss_clip": 0.01095026, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.03249812, + "balance_loss_mlp": 1.02652335, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.6365673794035354, + "language_loss": 0.73662031, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75794256, + "num_input_tokens_seen": 341183670, + "step": 15817, + "time_per_iteration": 2.5140843391418457 + }, + { + "auxiliary_loss_clip": 0.0107712, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.03289652, + "balance_loss_mlp": 1.02203298, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.573641656478967, + "language_loss": 0.60048842, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62160707, + "num_input_tokens_seen": 341201900, + "step": 15818, + "time_per_iteration": 2.609074831008911 + }, + { + "auxiliary_loss_clip": 0.01099488, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.0362072, + "balance_loss_mlp": 1.0202086, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.6865237730750342, + "language_loss": 0.69204473, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71335518, + "num_input_tokens_seen": 341218340, + "step": 15819, + "time_per_iteration": 2.4596993923187256 + }, + { + "auxiliary_loss_clip": 0.01057189, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.03441489, + "balance_loss_mlp": 1.01609993, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.725674921907994, + "language_loss": 0.74352562, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76436806, + "num_input_tokens_seen": 341235885, + "step": 15820, + "time_per_iteration": 2.663043260574341 + }, + { + "auxiliary_loss_clip": 0.01076137, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.03204393, + "balance_loss_mlp": 1.02262759, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 2.439166053014001, + "language_loss": 0.78288579, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.8039875, + "num_input_tokens_seen": 341255280, + "step": 15821, + "time_per_iteration": 2.5584757328033447 + }, + { + "auxiliary_loss_clip": 0.01055443, + "auxiliary_loss_mlp": 0.01029707, + "balance_loss_clip": 1.03062594, + "balance_loss_mlp": 1.01864457, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 2.108953409503155, + "language_loss": 0.71228755, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73313904, + "num_input_tokens_seen": 341279055, + "step": 15822, + "time_per_iteration": 4.242429733276367 + }, + { + "auxiliary_loss_clip": 0.01083823, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.03414202, + "balance_loss_mlp": 1.02347183, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 2.6653832927148735, + "language_loss": 0.66308308, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68425971, + "num_input_tokens_seen": 341298560, + "step": 15823, + "time_per_iteration": 2.5798864364624023 + }, + { + "auxiliary_loss_clip": 0.01077921, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.03293085, + "balance_loss_mlp": 1.01695061, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 2.089748653463631, + "language_loss": 0.77086294, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79191959, + "num_input_tokens_seen": 341316650, + "step": 15824, + "time_per_iteration": 2.6002540588378906 + }, + { + "auxiliary_loss_clip": 0.01099471, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.03269303, + "balance_loss_mlp": 1.01713479, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 2.536038990845858, + "language_loss": 0.73749751, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.7587778, + "num_input_tokens_seen": 341336185, + "step": 15825, + "time_per_iteration": 2.561180830001831 + }, + { + "auxiliary_loss_clip": 0.01014706, + "auxiliary_loss_mlp": 0.0100221, + "balance_loss_clip": 1.00765502, + "balance_loss_mlp": 1.00123203, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8573452689084196, + "language_loss": 0.53368461, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55385381, + "num_input_tokens_seen": 341395795, + "step": 15826, + "time_per_iteration": 3.045325994491577 + }, + { + "auxiliary_loss_clip": 0.0107409, + "auxiliary_loss_mlp": 0.01036389, + "balance_loss_clip": 1.03354919, + "balance_loss_mlp": 1.02502847, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 1.7951222441396293, + "language_loss": 0.7252447, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74634951, + "num_input_tokens_seen": 341415675, + "step": 15827, + "time_per_iteration": 4.170046329498291 + }, + { + "auxiliary_loss_clip": 0.01075931, + "auxiliary_loss_mlp": 0.01029155, + "balance_loss_clip": 1.03283048, + "balance_loss_mlp": 1.0178721, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.7889989592159752, + "language_loss": 0.74381709, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.7648679, + "num_input_tokens_seen": 341432990, + "step": 15828, + "time_per_iteration": 2.5849192142486572 + }, + { + "auxiliary_loss_clip": 0.01054564, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.03240192, + "balance_loss_mlp": 1.02476573, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.6710263401228072, + "language_loss": 0.73338485, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75428003, + "num_input_tokens_seen": 341454100, + "step": 15829, + "time_per_iteration": 2.747913122177124 + }, + { + "auxiliary_loss_clip": 0.01078888, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.0307982, + "balance_loss_mlp": 1.01905406, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 3.154799268337768, + "language_loss": 0.61180419, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.6328994, + "num_input_tokens_seen": 341472955, + "step": 15830, + "time_per_iteration": 2.5451948642730713 + }, + { + "auxiliary_loss_clip": 0.01084146, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.03439569, + "balance_loss_mlp": 1.01917601, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 2.503056914053349, + "language_loss": 0.72820556, + "learning_rate": 2.428028693179729e-08, + "loss": 0.74936026, + "num_input_tokens_seen": 341490165, + "step": 15831, + "time_per_iteration": 2.4893360137939453 + }, + { + "auxiliary_loss_clip": 0.01039632, + "auxiliary_loss_mlp": 0.01024032, + "balance_loss_clip": 1.02822244, + "balance_loss_mlp": 1.01399446, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.8316521174299463, + "language_loss": 0.65294838, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67358506, + "num_input_tokens_seen": 341508055, + "step": 15832, + "time_per_iteration": 2.6242690086364746 + }, + { + "auxiliary_loss_clip": 0.01079564, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.03419852, + "balance_loss_mlp": 1.02130044, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.950952648254223, + "language_loss": 0.78075033, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80187237, + "num_input_tokens_seen": 341526155, + "step": 15833, + "time_per_iteration": 2.5585107803344727 + }, + { + "auxiliary_loss_clip": 0.01056771, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.0316956, + "balance_loss_mlp": 1.02058268, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.414844280842349, + "language_loss": 0.75176209, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.77264345, + "num_input_tokens_seen": 341540450, + "step": 15834, + "time_per_iteration": 2.596245765686035 + }, + { + "auxiliary_loss_clip": 0.01087914, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.03654587, + "balance_loss_mlp": 1.02232766, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 2.232726583931392, + "language_loss": 0.76406658, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78528994, + "num_input_tokens_seen": 341557865, + "step": 15835, + "time_per_iteration": 2.5298361778259277 + }, + { + "auxiliary_loss_clip": 0.01072152, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.03068113, + "balance_loss_mlp": 1.01961851, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 2.005779085117484, + "language_loss": 0.6601001, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68113452, + "num_input_tokens_seen": 341573890, + "step": 15836, + "time_per_iteration": 2.5526745319366455 + }, + { + "auxiliary_loss_clip": 0.0106267, + "auxiliary_loss_mlp": 0.01022529, + "balance_loss_clip": 1.03261983, + "balance_loss_mlp": 1.01159143, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.82490004716472, + "language_loss": 0.70365345, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72450542, + "num_input_tokens_seen": 341593770, + "step": 15837, + "time_per_iteration": 4.157652139663696 + }, + { + "auxiliary_loss_clip": 0.01099598, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03407764, + "balance_loss_mlp": 1.01937246, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 2.1524989437476347, + "language_loss": 0.73645914, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75777602, + "num_input_tokens_seen": 341612065, + "step": 15838, + "time_per_iteration": 2.5558114051818848 + }, + { + "auxiliary_loss_clip": 0.01063394, + "auxiliary_loss_mlp": 0.01025723, + "balance_loss_clip": 1.03240371, + "balance_loss_mlp": 1.01458263, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.7996896032740612, + "language_loss": 0.78249609, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80338728, + "num_input_tokens_seen": 341631365, + "step": 15839, + "time_per_iteration": 2.6684720516204834 + }, + { + "auxiliary_loss_clip": 0.01072251, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.03631866, + "balance_loss_mlp": 1.01781929, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.5108299076400369, + "language_loss": 0.80285686, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82386774, + "num_input_tokens_seen": 341650300, + "step": 15840, + "time_per_iteration": 2.648725748062134 + }, + { + "auxiliary_loss_clip": 0.01067225, + "auxiliary_loss_mlp": 0.01026687, + "balance_loss_clip": 1.03074002, + "balance_loss_mlp": 1.01728129, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 5.236612825389791, + "language_loss": 0.73001373, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75095284, + "num_input_tokens_seen": 341667680, + "step": 15841, + "time_per_iteration": 2.6730101108551025 + }, + { + "auxiliary_loss_clip": 0.01069535, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.03271449, + "balance_loss_mlp": 1.01502681, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.7861710351605506, + "language_loss": 0.78981972, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81076562, + "num_input_tokens_seen": 341685760, + "step": 15842, + "time_per_iteration": 2.654362678527832 + }, + { + "auxiliary_loss_clip": 0.0107505, + "auxiliary_loss_mlp": 0.01031469, + "balance_loss_clip": 1.03493583, + "balance_loss_mlp": 1.02047217, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 1.9206986227830978, + "language_loss": 0.72383589, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74490112, + "num_input_tokens_seen": 341705300, + "step": 15843, + "time_per_iteration": 2.6402556896209717 + }, + { + "auxiliary_loss_clip": 0.01067896, + "auxiliary_loss_mlp": 0.00749331, + "balance_loss_clip": 1.03366351, + "balance_loss_mlp": 1.00018001, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.6435709654636308, + "language_loss": 0.78271085, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80088311, + "num_input_tokens_seen": 341724565, + "step": 15844, + "time_per_iteration": 2.592944383621216 + }, + { + "auxiliary_loss_clip": 0.01060988, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.03205848, + "balance_loss_mlp": 1.02043664, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 2.740538318878731, + "language_loss": 0.70063174, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72158659, + "num_input_tokens_seen": 341743605, + "step": 15845, + "time_per_iteration": 2.6563894748687744 + }, + { + "auxiliary_loss_clip": 0.01061521, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.03345478, + "balance_loss_mlp": 1.02376652, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.5342049099757329, + "language_loss": 0.75436252, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77532178, + "num_input_tokens_seen": 341763475, + "step": 15846, + "time_per_iteration": 2.6474809646606445 + }, + { + "auxiliary_loss_clip": 0.01067491, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.03321338, + "balance_loss_mlp": 1.01757276, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 3.721027330913343, + "language_loss": 0.77984285, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80080318, + "num_input_tokens_seen": 341781265, + "step": 15847, + "time_per_iteration": 2.6040449142456055 + }, + { + "auxiliary_loss_clip": 0.01053004, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.03080308, + "balance_loss_mlp": 1.01986957, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 1.7720156742155504, + "language_loss": 0.77935112, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80018294, + "num_input_tokens_seen": 341798825, + "step": 15848, + "time_per_iteration": 2.5800299644470215 + }, + { + "auxiliary_loss_clip": 0.01075015, + "auxiliary_loss_mlp": 0.01037796, + "balance_loss_clip": 1.03207433, + "balance_loss_mlp": 1.02563047, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.533318655651997, + "language_loss": 0.7212624, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74239051, + "num_input_tokens_seen": 341819480, + "step": 15849, + "time_per_iteration": 2.6202619075775146 + }, + { + "auxiliary_loss_clip": 0.01092767, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.0351696, + "balance_loss_mlp": 1.02219141, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 4.429077608451213, + "language_loss": 0.75093555, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77220398, + "num_input_tokens_seen": 341838035, + "step": 15850, + "time_per_iteration": 2.534403085708618 + }, + { + "auxiliary_loss_clip": 0.01066934, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.03086972, + "balance_loss_mlp": 1.01924205, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 1.8687540978918895, + "language_loss": 0.72323847, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74420488, + "num_input_tokens_seen": 341855895, + "step": 15851, + "time_per_iteration": 2.620664119720459 + }, + { + "auxiliary_loss_clip": 0.01071231, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.03237057, + "balance_loss_mlp": 1.01792765, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 1.7655263185637153, + "language_loss": 0.79468858, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.81568307, + "num_input_tokens_seen": 341875240, + "step": 15852, + "time_per_iteration": 2.5655276775360107 + }, + { + "auxiliary_loss_clip": 0.01085532, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.03174448, + "balance_loss_mlp": 1.01975405, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.5709777909751619, + "language_loss": 0.597947, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61911559, + "num_input_tokens_seen": 341901020, + "step": 15853, + "time_per_iteration": 2.820840835571289 + }, + { + "auxiliary_loss_clip": 0.01073049, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.03191733, + "balance_loss_mlp": 1.01938915, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.793235823167618, + "language_loss": 0.72368079, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74470913, + "num_input_tokens_seen": 341919365, + "step": 15854, + "time_per_iteration": 2.57082462310791 + }, + { + "auxiliary_loss_clip": 0.01070772, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.03050995, + "balance_loss_mlp": 1.01609111, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.0961115466318607, + "language_loss": 0.67635751, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69733244, + "num_input_tokens_seen": 341939985, + "step": 15855, + "time_per_iteration": 2.6157174110412598 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.03287625, + "balance_loss_mlp": 1.01722407, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 5.37516513818163, + "language_loss": 0.76453525, + "learning_rate": 2.279115591613556e-08, + "loss": 0.785779, + "num_input_tokens_seen": 341959255, + "step": 15856, + "time_per_iteration": 2.573927640914917 + }, + { + "auxiliary_loss_clip": 0.01066799, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.02898383, + "balance_loss_mlp": 1.02415633, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.6931934172061642, + "language_loss": 0.77392888, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.79494488, + "num_input_tokens_seen": 341977205, + "step": 15857, + "time_per_iteration": 4.073966026306152 + }, + { + "auxiliary_loss_clip": 0.01013738, + "auxiliary_loss_mlp": 0.01002186, + "balance_loss_clip": 1.00358391, + "balance_loss_mlp": 1.00124454, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7127064914267759, + "language_loss": 0.62619579, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64635503, + "num_input_tokens_seen": 342038545, + "step": 15858, + "time_per_iteration": 3.0868477821350098 + }, + { + "auxiliary_loss_clip": 0.01044714, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.0305264, + "balance_loss_mlp": 1.01710331, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.6088801030015554, + "language_loss": 0.56933779, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.59005952, + "num_input_tokens_seen": 342058195, + "step": 15859, + "time_per_iteration": 2.6114416122436523 + }, + { + "auxiliary_loss_clip": 0.01093704, + "auxiliary_loss_mlp": 0.01028985, + "balance_loss_clip": 1.03312469, + "balance_loss_mlp": 1.01907873, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 3.140894310125841, + "language_loss": 0.81988478, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.84111172, + "num_input_tokens_seen": 342075025, + "step": 15860, + "time_per_iteration": 2.4954991340637207 + }, + { + "auxiliary_loss_clip": 0.01047031, + "auxiliary_loss_mlp": 0.00749223, + "balance_loss_clip": 1.03014112, + "balance_loss_mlp": 1.00028372, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.9420267066572245, + "language_loss": 0.667732, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68569452, + "num_input_tokens_seen": 342094595, + "step": 15861, + "time_per_iteration": 2.6627907752990723 + }, + { + "auxiliary_loss_clip": 0.01084735, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.03176212, + "balance_loss_mlp": 1.02321076, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 1.8859566128660703, + "language_loss": 0.65828919, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67948115, + "num_input_tokens_seen": 342115970, + "step": 15862, + "time_per_iteration": 4.070696830749512 + }, + { + "auxiliary_loss_clip": 0.01055122, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.03149772, + "balance_loss_mlp": 1.02084064, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.6973631812978982, + "language_loss": 0.67422801, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69509333, + "num_input_tokens_seen": 342134080, + "step": 15863, + "time_per_iteration": 2.62886118888855 + }, + { + "auxiliary_loss_clip": 0.01072844, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.03023505, + "balance_loss_mlp": 1.0174675, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 4.671515934775716, + "language_loss": 0.78327954, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80429143, + "num_input_tokens_seen": 342154725, + "step": 15864, + "time_per_iteration": 2.656440258026123 + }, + { + "auxiliary_loss_clip": 0.01065629, + "auxiliary_loss_mlp": 0.0102527, + "balance_loss_clip": 1.0334779, + "balance_loss_mlp": 1.01454067, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 2.6738480912588294, + "language_loss": 0.59750175, + "learning_rate": 2.226653824047586e-08, + "loss": 0.61841071, + "num_input_tokens_seen": 342172275, + "step": 15865, + "time_per_iteration": 2.646580457687378 + }, + { + "auxiliary_loss_clip": 0.01052218, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.03003359, + "balance_loss_mlp": 1.02425909, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.679075946777119, + "language_loss": 0.69722134, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.71810889, + "num_input_tokens_seen": 342190880, + "step": 15866, + "time_per_iteration": 2.57942533493042 + }, + { + "auxiliary_loss_clip": 0.01069241, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.03014541, + "balance_loss_mlp": 1.02005243, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.7827191997782137, + "language_loss": 0.84976137, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87078083, + "num_input_tokens_seen": 342208165, + "step": 15867, + "time_per_iteration": 4.095515727996826 + }, + { + "auxiliary_loss_clip": 0.01012909, + "auxiliary_loss_mlp": 0.01001966, + "balance_loss_clip": 1.00532484, + "balance_loss_mlp": 1.00104165, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7554640122446504, + "language_loss": 0.61776567, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63791442, + "num_input_tokens_seen": 342277110, + "step": 15868, + "time_per_iteration": 3.1131696701049805 + }, + { + "auxiliary_loss_clip": 0.01062154, + "auxiliary_loss_mlp": 0.01024836, + "balance_loss_clip": 1.03317738, + "balance_loss_mlp": 1.01355302, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 1.9177246930966032, + "language_loss": 0.59673786, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.61760783, + "num_input_tokens_seen": 342294695, + "step": 15869, + "time_per_iteration": 2.6195242404937744 + }, + { + "auxiliary_loss_clip": 0.0105345, + "auxiliary_loss_mlp": 0.00749687, + "balance_loss_clip": 1.03016496, + "balance_loss_mlp": 1.00030577, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 3.0362919977573775, + "language_loss": 0.71168435, + "learning_rate": 2.197770872795579e-08, + "loss": 0.72971565, + "num_input_tokens_seen": 342314970, + "step": 15870, + "time_per_iteration": 2.6652069091796875 + }, + { + "auxiliary_loss_clip": 0.01057167, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.02976871, + "balance_loss_mlp": 1.01411176, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 2.819984156078271, + "language_loss": 0.7678448, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78866583, + "num_input_tokens_seen": 342334255, + "step": 15871, + "time_per_iteration": 2.612327814102173 + }, + { + "auxiliary_loss_clip": 0.01083136, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.03200459, + "balance_loss_mlp": 1.018646, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 2.0200429021611823, + "language_loss": 0.58748496, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60861987, + "num_input_tokens_seen": 342354730, + "step": 15872, + "time_per_iteration": 2.629384994506836 + }, + { + "auxiliary_loss_clip": 0.01071379, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.03095126, + "balance_loss_mlp": 1.01758182, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 2.4988863928004506, + "language_loss": 0.74880111, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.76981556, + "num_input_tokens_seen": 342374565, + "step": 15873, + "time_per_iteration": 2.5773794651031494 + }, + { + "auxiliary_loss_clip": 0.01098937, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.03479052, + "balance_loss_mlp": 1.01761127, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.9070864246457044, + "language_loss": 0.62703502, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.6483168, + "num_input_tokens_seen": 342394590, + "step": 15874, + "time_per_iteration": 2.5990607738494873 + }, + { + "auxiliary_loss_clip": 0.01071172, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.0304265, + "balance_loss_mlp": 1.02140188, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 1.9015078480437573, + "language_loss": 0.8961156, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91715252, + "num_input_tokens_seen": 342410445, + "step": 15875, + "time_per_iteration": 2.581068754196167 + }, + { + "auxiliary_loss_clip": 0.01101291, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.03462517, + "balance_loss_mlp": 1.02064526, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 2.1227572561133616, + "language_loss": 0.68077838, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.70211655, + "num_input_tokens_seen": 342430970, + "step": 15876, + "time_per_iteration": 2.562861919403076 + }, + { + "auxiliary_loss_clip": 0.01086352, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.03299141, + "balance_loss_mlp": 1.01742315, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 1.7197660500639318, + "language_loss": 0.69220114, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.71335483, + "num_input_tokens_seen": 342449505, + "step": 15877, + "time_per_iteration": 4.03439474105835 + }, + { + "auxiliary_loss_clip": 0.01056198, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.03184056, + "balance_loss_mlp": 1.01783478, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.5832890338794607, + "language_loss": 0.70727706, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.72813451, + "num_input_tokens_seen": 342470390, + "step": 15878, + "time_per_iteration": 2.6226391792297363 + }, + { + "auxiliary_loss_clip": 0.010942, + "auxiliary_loss_mlp": 0.01026408, + "balance_loss_clip": 1.03217137, + "balance_loss_mlp": 1.01545858, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.3225280521598268, + "language_loss": 0.68079114, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70199728, + "num_input_tokens_seen": 342492560, + "step": 15879, + "time_per_iteration": 2.546762704849243 + }, + { + "auxiliary_loss_clip": 0.01060139, + "auxiliary_loss_mlp": 0.0074923, + "balance_loss_clip": 1.03046095, + "balance_loss_mlp": 1.00027049, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 1.7988951403271074, + "language_loss": 0.85112303, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.8692168, + "num_input_tokens_seen": 342512315, + "step": 15880, + "time_per_iteration": 2.711880683898926 + }, + { + "auxiliary_loss_clip": 0.01029385, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.03009319, + "balance_loss_mlp": 1.01741242, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.8938018654269346, + "language_loss": 0.71525288, + "learning_rate": 2.134888478151753e-08, + "loss": 0.7358411, + "num_input_tokens_seen": 342533060, + "step": 15881, + "time_per_iteration": 2.9948747158050537 + }, + { + "auxiliary_loss_clip": 0.01086375, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.03424048, + "balance_loss_mlp": 1.01868486, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.9397093413231776, + "language_loss": 0.71685064, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73800874, + "num_input_tokens_seen": 342550830, + "step": 15882, + "time_per_iteration": 2.7355096340179443 + }, + { + "auxiliary_loss_clip": 0.01076352, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.03307152, + "balance_loss_mlp": 1.01812792, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 1.8551691910253432, + "language_loss": 0.65959442, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.6806457, + "num_input_tokens_seen": 342575070, + "step": 15883, + "time_per_iteration": 2.9364852905273438 + }, + { + "auxiliary_loss_clip": 0.01088395, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.03544617, + "balance_loss_mlp": 1.01760101, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.204920805747603, + "language_loss": 0.7774775, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.79865229, + "num_input_tokens_seen": 342592215, + "step": 15884, + "time_per_iteration": 2.55975604057312 + }, + { + "auxiliary_loss_clip": 0.01099283, + "auxiliary_loss_mlp": 0.01027307, + "balance_loss_clip": 1.03354168, + "balance_loss_mlp": 1.01596379, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.6447833531388925, + "language_loss": 0.78105497, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.8023209, + "num_input_tokens_seen": 342610030, + "step": 15885, + "time_per_iteration": 2.5032477378845215 + }, + { + "auxiliary_loss_clip": 0.01098095, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.03325164, + "balance_loss_mlp": 1.01877236, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.701195691100154, + "language_loss": 0.70243621, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72371221, + "num_input_tokens_seen": 342626475, + "step": 15886, + "time_per_iteration": 2.5765671730041504 + }, + { + "auxiliary_loss_clip": 0.01063829, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.03215241, + "balance_loss_mlp": 1.02001905, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 2.328411792244638, + "language_loss": 0.72252262, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.743487, + "num_input_tokens_seen": 342646645, + "step": 15887, + "time_per_iteration": 2.6778621673583984 + }, + { + "auxiliary_loss_clip": 0.01070487, + "auxiliary_loss_mlp": 0.01026209, + "balance_loss_clip": 1.03111339, + "balance_loss_mlp": 1.01582599, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 2.5635048117213204, + "language_loss": 0.5649991, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.58596611, + "num_input_tokens_seen": 342663615, + "step": 15888, + "time_per_iteration": 2.564934492111206 + }, + { + "auxiliary_loss_clip": 0.01013848, + "auxiliary_loss_mlp": 0.01004842, + "balance_loss_clip": 1.00415599, + "balance_loss_mlp": 1.00388288, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.705907019058006, + "language_loss": 0.57864761, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59883451, + "num_input_tokens_seen": 342728275, + "step": 15889, + "time_per_iteration": 3.116203546524048 + }, + { + "auxiliary_loss_clip": 0.01097693, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.0318377, + "balance_loss_mlp": 1.01501608, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.4328451959212198, + "language_loss": 0.67172164, + "learning_rate": 2.084114508877466e-08, + "loss": 0.6929642, + "num_input_tokens_seen": 342748860, + "step": 15890, + "time_per_iteration": 2.571819543838501 + }, + { + "auxiliary_loss_clip": 0.01098226, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.03493941, + "balance_loss_mlp": 1.01792872, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.4327100144362852, + "language_loss": 0.73965561, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.7609241, + "num_input_tokens_seen": 342769705, + "step": 15891, + "time_per_iteration": 2.504126787185669 + }, + { + "auxiliary_loss_clip": 0.01056878, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.02865624, + "balance_loss_mlp": 1.01922131, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 1.7457132074262431, + "language_loss": 0.78110659, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80196291, + "num_input_tokens_seen": 342787000, + "step": 15892, + "time_per_iteration": 2.618650436401367 + }, + { + "auxiliary_loss_clip": 0.01095614, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.03313828, + "balance_loss_mlp": 1.02048206, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.5165158794209697, + "language_loss": 0.69516557, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.71643794, + "num_input_tokens_seen": 342807795, + "step": 15893, + "time_per_iteration": 2.526536703109741 + }, + { + "auxiliary_loss_clip": 0.01078215, + "auxiliary_loss_mlp": 0.00749248, + "balance_loss_clip": 1.03626895, + "balance_loss_mlp": 1.00021517, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 1.8088914193122867, + "language_loss": 0.65753704, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67581165, + "num_input_tokens_seen": 342825490, + "step": 15894, + "time_per_iteration": 2.5596394538879395 + }, + { + "auxiliary_loss_clip": 0.01086508, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.03232932, + "balance_loss_mlp": 1.0213511, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 8.942549515164727, + "language_loss": 0.81853372, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83972824, + "num_input_tokens_seen": 342844965, + "step": 15895, + "time_per_iteration": 2.5553436279296875 + }, + { + "auxiliary_loss_clip": 0.01072317, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.03251863, + "balance_loss_mlp": 1.017802, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 1.4440209610138353, + "language_loss": 0.72487247, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74588287, + "num_input_tokens_seen": 342865915, + "step": 15896, + "time_per_iteration": 2.6478936672210693 + }, + { + "auxiliary_loss_clip": 0.01095026, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.032583, + "balance_loss_mlp": 1.02172482, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.8605874331625596, + "language_loss": 0.79446918, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81574249, + "num_input_tokens_seen": 342884000, + "step": 15897, + "time_per_iteration": 4.018922805786133 + }, + { + "auxiliary_loss_clip": 0.01068557, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.02887273, + "balance_loss_mlp": 1.01937056, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.6297896618229013, + "language_loss": 0.72704613, + "learning_rate": 2.03949242614303e-08, + "loss": 0.74804598, + "num_input_tokens_seen": 342903095, + "step": 15898, + "time_per_iteration": 2.6366055011749268 + }, + { + "auxiliary_loss_clip": 0.00995482, + "auxiliary_loss_mlp": 0.01012988, + "balance_loss_clip": 1.00523496, + "balance_loss_mlp": 1.01210618, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8979428903828994, + "language_loss": 0.52324617, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54333085, + "num_input_tokens_seen": 342958155, + "step": 15899, + "time_per_iteration": 3.109239101409912 + }, + { + "auxiliary_loss_clip": 0.01090677, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.03424239, + "balance_loss_mlp": 1.01868606, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.4063102107106973, + "language_loss": 0.68665075, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70786679, + "num_input_tokens_seen": 342972500, + "step": 15900, + "time_per_iteration": 2.480391263961792 + }, + { + "auxiliary_loss_clip": 0.01087259, + "auxiliary_loss_mlp": 0.0074944, + "balance_loss_clip": 1.03274012, + "balance_loss_mlp": 1.000283, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 2.091823448156698, + "language_loss": 0.83105421, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.84942114, + "num_input_tokens_seen": 342989035, + "step": 15901, + "time_per_iteration": 2.5444490909576416 + }, + { + "auxiliary_loss_clip": 0.00996678, + "auxiliary_loss_mlp": 0.00999285, + "balance_loss_clip": 1.00957525, + "balance_loss_mlp": 0.99828988, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7112949532055104, + "language_loss": 0.54259109, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56255078, + "num_input_tokens_seen": 343051675, + "step": 15902, + "time_per_iteration": 4.672415733337402 + }, + { + "auxiliary_loss_clip": 0.01074663, + "auxiliary_loss_mlp": 0.01027716, + "balance_loss_clip": 1.03448582, + "balance_loss_mlp": 1.01850104, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.7773516994586516, + "language_loss": 0.85275745, + "learning_rate": 2.01184758473425e-08, + "loss": 0.8737812, + "num_input_tokens_seen": 343068895, + "step": 15903, + "time_per_iteration": 2.601123571395874 + }, + { + "auxiliary_loss_clip": 0.01067055, + "auxiliary_loss_mlp": 0.00749504, + "balance_loss_clip": 1.03040969, + "balance_loss_mlp": 1.00023568, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 2.1995369269845253, + "language_loss": 0.79971373, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.81787932, + "num_input_tokens_seen": 343087115, + "step": 15904, + "time_per_iteration": 2.558952569961548 + }, + { + "auxiliary_loss_clip": 0.01085696, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.03244007, + "balance_loss_mlp": 1.01905358, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 2.741108430052367, + "language_loss": 0.59833789, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.61950171, + "num_input_tokens_seen": 343105575, + "step": 15905, + "time_per_iteration": 2.5646114349365234 + }, + { + "auxiliary_loss_clip": 0.01084291, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.03252363, + "balance_loss_mlp": 1.02013445, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 2.361559511479195, + "language_loss": 0.70315349, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72430062, + "num_input_tokens_seen": 343123025, + "step": 15906, + "time_per_iteration": 2.544618606567383 + }, + { + "auxiliary_loss_clip": 0.01040223, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.02945316, + "balance_loss_mlp": 1.01775885, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.6484551891777337, + "language_loss": 0.71124315, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73193854, + "num_input_tokens_seen": 343141625, + "step": 15907, + "time_per_iteration": 4.136334657669067 + }, + { + "auxiliary_loss_clip": 0.01053578, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.03012919, + "balance_loss_mlp": 1.01682496, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 1.9452526980334897, + "language_loss": 0.70303965, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72385406, + "num_input_tokens_seen": 343161300, + "step": 15908, + "time_per_iteration": 2.675745964050293 + }, + { + "auxiliary_loss_clip": 0.01074706, + "auxiliary_loss_mlp": 0.00749353, + "balance_loss_clip": 1.0334847, + "balance_loss_mlp": 1.00027287, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.7838030253798047, + "language_loss": 0.82536352, + "learning_rate": 1.978921532427802e-08, + "loss": 0.84360409, + "num_input_tokens_seen": 343177815, + "step": 15909, + "time_per_iteration": 2.565053701400757 + }, + { + "auxiliary_loss_clip": 0.0108533, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.03193176, + "balance_loss_mlp": 1.02254915, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 2.000114218822085, + "language_loss": 0.67322081, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69440341, + "num_input_tokens_seen": 343198140, + "step": 15910, + "time_per_iteration": 2.5595815181732178 + }, + { + "auxiliary_loss_clip": 0.01089729, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.03480029, + "balance_loss_mlp": 1.01981783, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 4.0524128512868725, + "language_loss": 0.74400628, + "learning_rate": 1.968006251276444e-08, + "loss": 0.7652123, + "num_input_tokens_seen": 343218280, + "step": 15911, + "time_per_iteration": 2.6137237548828125 + }, + { + "auxiliary_loss_clip": 0.01086641, + "auxiliary_loss_mlp": 0.01026851, + "balance_loss_clip": 1.03305078, + "balance_loss_mlp": 1.01601481, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 1.8614060572398583, + "language_loss": 0.69321823, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71435314, + "num_input_tokens_seen": 343236850, + "step": 15912, + "time_per_iteration": 2.5039618015289307 + }, + { + "auxiliary_loss_clip": 0.01072544, + "auxiliary_loss_mlp": 0.01035721, + "balance_loss_clip": 1.03150368, + "balance_loss_mlp": 1.02412224, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 3.926678865872829, + "language_loss": 0.72642994, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74751258, + "num_input_tokens_seen": 343253065, + "step": 15913, + "time_per_iteration": 2.570737600326538 + }, + { + "auxiliary_loss_clip": 0.01017276, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.03230727, + "balance_loss_mlp": 1.01819456, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 2.1673204902961003, + "language_loss": 0.73867965, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75913835, + "num_input_tokens_seen": 343270330, + "step": 15914, + "time_per_iteration": 2.6911394596099854 + }, + { + "auxiliary_loss_clip": 0.01095605, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.03324926, + "balance_loss_mlp": 1.01592207, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.4542023636637404, + "language_loss": 0.67278731, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.6940099, + "num_input_tokens_seen": 343289625, + "step": 15915, + "time_per_iteration": 2.493820905685425 + }, + { + "auxiliary_loss_clip": 0.01083923, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.03328347, + "balance_loss_mlp": 1.01513839, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 2.431588628221164, + "language_loss": 0.63987291, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66097003, + "num_input_tokens_seen": 343309200, + "step": 15916, + "time_per_iteration": 2.558995246887207 + }, + { + "auxiliary_loss_clip": 0.01091056, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.03226876, + "balance_loss_mlp": 1.01720822, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 3.196777824262175, + "language_loss": 0.80744916, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82863474, + "num_input_tokens_seen": 343326270, + "step": 15917, + "time_per_iteration": 4.081795930862427 + }, + { + "auxiliary_loss_clip": 0.0106422, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.03162789, + "balance_loss_mlp": 1.01971853, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 2.1759886783341162, + "language_loss": 0.72883898, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.7497927, + "num_input_tokens_seen": 343344430, + "step": 15918, + "time_per_iteration": 2.5325770378112793 + }, + { + "auxiliary_loss_clip": 0.00994914, + "auxiliary_loss_mlp": 0.01010547, + "balance_loss_clip": 1.00460339, + "balance_loss_mlp": 1.00959337, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6322777441323099, + "language_loss": 0.53100497, + "learning_rate": 1.924645518878032e-08, + "loss": 0.5510596, + "num_input_tokens_seen": 343416155, + "step": 15919, + "time_per_iteration": 3.2658445835113525 + }, + { + "auxiliary_loss_clip": 0.01093572, + "auxiliary_loss_mlp": 0.01034004, + "balance_loss_clip": 1.03675497, + "balance_loss_mlp": 1.02220821, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 2.9233580736895233, + "language_loss": 0.75590944, + "learning_rate": 1.919259224843972e-08, + "loss": 0.7771852, + "num_input_tokens_seen": 343431715, + "step": 15920, + "time_per_iteration": 2.688533306121826 + }, + { + "auxiliary_loss_clip": 0.01062737, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.03368425, + "balance_loss_mlp": 1.0183692, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 2.0149750311436687, + "language_loss": 0.79323006, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.81415647, + "num_input_tokens_seen": 343450425, + "step": 15921, + "time_per_iteration": 2.6101224422454834 + }, + { + "auxiliary_loss_clip": 0.01089347, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.03165472, + "balance_loss_mlp": 1.01596689, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 2.157779535433634, + "language_loss": 0.5124774, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53364885, + "num_input_tokens_seen": 343470445, + "step": 15922, + "time_per_iteration": 2.6746487617492676 + }, + { + "auxiliary_loss_clip": 0.01039572, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.02821589, + "balance_loss_mlp": 1.02577555, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.2932126356427722, + "language_loss": 0.83794689, + "learning_rate": 1.903145411006557e-08, + "loss": 0.85873264, + "num_input_tokens_seen": 343485200, + "step": 15923, + "time_per_iteration": 2.7236766815185547 + }, + { + "auxiliary_loss_clip": 0.01067723, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.02903867, + "balance_loss_mlp": 1.01871884, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.4972374716382546, + "language_loss": 0.75151479, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77248216, + "num_input_tokens_seen": 343505080, + "step": 15924, + "time_per_iteration": 2.6595535278320312 + }, + { + "auxiliary_loss_clip": 0.010754, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.03178501, + "balance_loss_mlp": 1.01931882, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 2.5643643832260143, + "language_loss": 0.86407423, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88513094, + "num_input_tokens_seen": 343523995, + "step": 15925, + "time_per_iteration": 2.6350114345550537 + }, + { + "auxiliary_loss_clip": 0.01066373, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.03275716, + "balance_loss_mlp": 1.02041006, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 1.9372778739908192, + "language_loss": 0.75680494, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77778745, + "num_input_tokens_seen": 343542015, + "step": 15926, + "time_per_iteration": 2.66843318939209 + }, + { + "auxiliary_loss_clip": 0.01073238, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.03498709, + "balance_loss_mlp": 1.01699281, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.6239880047838762, + "language_loss": 0.77567565, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79667801, + "num_input_tokens_seen": 343561680, + "step": 15927, + "time_per_iteration": 2.6188783645629883 + }, + { + "auxiliary_loss_clip": 0.01050437, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.03254521, + "balance_loss_mlp": 1.01732028, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 1.985203735705543, + "language_loss": 0.68958509, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.71038824, + "num_input_tokens_seen": 343585290, + "step": 15928, + "time_per_iteration": 2.781287908554077 + }, + { + "auxiliary_loss_clip": 0.01078426, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.0352428, + "balance_loss_mlp": 1.0199765, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.973107863566822, + "language_loss": 0.82100379, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84209925, + "num_input_tokens_seen": 343604045, + "step": 15929, + "time_per_iteration": 2.6558444499969482 + }, + { + "auxiliary_loss_clip": 0.01061583, + "auxiliary_loss_mlp": 0.01040875, + "balance_loss_clip": 1.03050041, + "balance_loss_mlp": 1.02800035, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.4160486829157002, + "language_loss": 0.7212218, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74224639, + "num_input_tokens_seen": 343626595, + "step": 15930, + "time_per_iteration": 2.654127359390259 + }, + { + "auxiliary_loss_clip": 0.01020254, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.02753901, + "balance_loss_mlp": 1.02201581, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.4917542303220628, + "language_loss": 0.62295783, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.6435017, + "num_input_tokens_seen": 343646195, + "step": 15931, + "time_per_iteration": 2.711019992828369 + }, + { + "auxiliary_loss_clip": 0.01094629, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.03355968, + "balance_loss_mlp": 1.01647258, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.9061140639552983, + "language_loss": 0.6936394, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71485287, + "num_input_tokens_seen": 343663665, + "step": 15932, + "time_per_iteration": 2.5077977180480957 + }, + { + "auxiliary_loss_clip": 0.01067062, + "auxiliary_loss_mlp": 0.01037254, + "balance_loss_clip": 1.0316186, + "balance_loss_mlp": 1.02454638, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 2.046219838705307, + "language_loss": 0.75683624, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77787936, + "num_input_tokens_seen": 343682145, + "step": 15933, + "time_per_iteration": 2.620588541030884 + }, + { + "auxiliary_loss_clip": 0.00990435, + "auxiliary_loss_mlp": 0.00998189, + "balance_loss_clip": 1.00940037, + "balance_loss_mlp": 0.99702072, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7174574666648506, + "language_loss": 0.57263696, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.5925231, + "num_input_tokens_seen": 343744685, + "step": 15934, + "time_per_iteration": 3.2742807865142822 + }, + { + "auxiliary_loss_clip": 0.0102258, + "auxiliary_loss_mlp": 0.00746589, + "balance_loss_clip": 1.00293601, + "balance_loss_mlp": 0.99979669, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9289644862851222, + "language_loss": 0.6597122, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67740393, + "num_input_tokens_seen": 343801835, + "step": 15935, + "time_per_iteration": 3.0122947692871094 + }, + { + "auxiliary_loss_clip": 0.01006988, + "auxiliary_loss_mlp": 0.01001584, + "balance_loss_clip": 1.01244831, + "balance_loss_mlp": 1.0006659, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7973881645976755, + "language_loss": 0.57044327, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59052902, + "num_input_tokens_seen": 343861515, + "step": 15936, + "time_per_iteration": 3.1288676261901855 + }, + { + "auxiliary_loss_clip": 0.01036349, + "auxiliary_loss_mlp": 0.01027289, + "balance_loss_clip": 1.02987909, + "balance_loss_mlp": 1.01600599, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.935312371925224, + "language_loss": 0.785465, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80610144, + "num_input_tokens_seen": 343881240, + "step": 15937, + "time_per_iteration": 4.4174582958221436 + }, + { + "auxiliary_loss_clip": 0.01082131, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.03236389, + "balance_loss_mlp": 1.01896691, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 1.4680893567018913, + "language_loss": 0.68585873, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70698559, + "num_input_tokens_seen": 343900885, + "step": 15938, + "time_per_iteration": 2.595301628112793 + }, + { + "auxiliary_loss_clip": 0.01064593, + "auxiliary_loss_mlp": 0.01026429, + "balance_loss_clip": 1.03215814, + "balance_loss_mlp": 1.01562285, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 3.8973049435120997, + "language_loss": 0.65719879, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67810893, + "num_input_tokens_seen": 343918460, + "step": 15939, + "time_per_iteration": 2.6356687545776367 + }, + { + "auxiliary_loss_clip": 0.01066168, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.03140557, + "balance_loss_mlp": 1.02002358, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.478998331265312, + "language_loss": 0.73669064, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.7576651, + "num_input_tokens_seen": 343938030, + "step": 15940, + "time_per_iteration": 2.6671760082244873 + }, + { + "auxiliary_loss_clip": 0.01097425, + "auxiliary_loss_mlp": 0.01028356, + "balance_loss_clip": 1.03370357, + "balance_loss_mlp": 1.01673865, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 1.808262329340643, + "language_loss": 0.72841918, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.749677, + "num_input_tokens_seen": 343956635, + "step": 15941, + "time_per_iteration": 2.504146099090576 + }, + { + "auxiliary_loss_clip": 0.0107522, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.03271425, + "balance_loss_mlp": 1.02279866, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.7382803936805358, + "language_loss": 0.7145142, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73560417, + "num_input_tokens_seen": 343976625, + "step": 15942, + "time_per_iteration": 4.327058792114258 + }, + { + "auxiliary_loss_clip": 0.01097792, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.03321099, + "balance_loss_mlp": 1.02054715, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.5412622561551008, + "language_loss": 0.72248054, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74378359, + "num_input_tokens_seen": 343997790, + "step": 15943, + "time_per_iteration": 2.649091958999634 + }, + { + "auxiliary_loss_clip": 0.01087601, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.03293419, + "balance_loss_mlp": 1.02245021, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.665297389327276, + "language_loss": 0.68566453, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70687926, + "num_input_tokens_seen": 344016935, + "step": 15944, + "time_per_iteration": 2.519805431365967 + }, + { + "auxiliary_loss_clip": 0.01096036, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.03211653, + "balance_loss_mlp": 1.02116251, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 2.1097657716377567, + "language_loss": 0.65970677, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.68099582, + "num_input_tokens_seen": 344035590, + "step": 15945, + "time_per_iteration": 2.470088243484497 + }, + { + "auxiliary_loss_clip": 0.00959184, + "auxiliary_loss_mlp": 0.01001291, + "balance_loss_clip": 1.00981772, + "balance_loss_mlp": 1.0002718, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7532945439737698, + "language_loss": 0.61903805, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63864279, + "num_input_tokens_seen": 344100845, + "step": 15946, + "time_per_iteration": 4.885860204696655 + }, + { + "auxiliary_loss_clip": 0.01095436, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.03361714, + "balance_loss_mlp": 1.01878476, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.5952898268161322, + "language_loss": 0.75291872, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.77416968, + "num_input_tokens_seen": 344121780, + "step": 15947, + "time_per_iteration": 2.769087314605713 + }, + { + "auxiliary_loss_clip": 0.01066081, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.02877998, + "balance_loss_mlp": 1.01627719, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.1890569062906184, + "language_loss": 0.69634604, + "learning_rate": 1.771493294473747e-08, + "loss": 0.71727431, + "num_input_tokens_seen": 344140150, + "step": 15948, + "time_per_iteration": 2.756413459777832 + }, + { + "auxiliary_loss_clip": 0.01045813, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.03195882, + "balance_loss_mlp": 1.01964164, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 1.9283473116825303, + "language_loss": 0.78472084, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80548084, + "num_input_tokens_seen": 344158200, + "step": 15949, + "time_per_iteration": 2.6594338417053223 + }, + { + "auxiliary_loss_clip": 0.01099038, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.03503549, + "balance_loss_mlp": 1.02001715, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.8192281377600152, + "language_loss": 0.68664163, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70794916, + "num_input_tokens_seen": 344174720, + "step": 15950, + "time_per_iteration": 2.5285308361053467 + }, + { + "auxiliary_loss_clip": 0.01077472, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.03364491, + "balance_loss_mlp": 1.0196296, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.7580318050321966, + "language_loss": 0.85948658, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88056213, + "num_input_tokens_seen": 344192580, + "step": 15951, + "time_per_iteration": 2.6257193088531494 + }, + { + "auxiliary_loss_clip": 0.010725, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.03320003, + "balance_loss_mlp": 1.02689338, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 3.705136774226044, + "language_loss": 0.79725981, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.81837672, + "num_input_tokens_seen": 344210345, + "step": 15952, + "time_per_iteration": 2.655820369720459 + }, + { + "auxiliary_loss_clip": 0.01086688, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.03487444, + "balance_loss_mlp": 1.01928759, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.687080313433538, + "language_loss": 0.69892466, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.72009921, + "num_input_tokens_seen": 344229540, + "step": 15953, + "time_per_iteration": 2.614718198776245 + }, + { + "auxiliary_loss_clip": 0.01033427, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.02982819, + "balance_loss_mlp": 1.0239861, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 3.179489319992173, + "language_loss": 0.57489192, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.5956012, + "num_input_tokens_seen": 344247830, + "step": 15954, + "time_per_iteration": 2.835036277770996 + }, + { + "auxiliary_loss_clip": 0.01087484, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.0331167, + "balance_loss_mlp": 1.02168989, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.048423984011895, + "language_loss": 0.73748219, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.75869423, + "num_input_tokens_seen": 344267760, + "step": 15955, + "time_per_iteration": 2.9202122688293457 + }, + { + "auxiliary_loss_clip": 0.01075925, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.0323236, + "balance_loss_mlp": 1.02404714, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 1.8048541294565374, + "language_loss": 0.62318265, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.64429951, + "num_input_tokens_seen": 344284905, + "step": 15956, + "time_per_iteration": 2.577221393585205 + }, + { + "auxiliary_loss_clip": 0.01058826, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.0328207, + "balance_loss_mlp": 1.01714206, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.8607713104455275, + "language_loss": 0.59614998, + "learning_rate": 1.725248447997507e-08, + "loss": 0.61702168, + "num_input_tokens_seen": 344302025, + "step": 15957, + "time_per_iteration": 4.238770008087158 + }, + { + "auxiliary_loss_clip": 0.010612, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.0329411, + "balance_loss_mlp": 1.02203071, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.9297150014166462, + "language_loss": 0.74047673, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76142722, + "num_input_tokens_seen": 344321935, + "step": 15958, + "time_per_iteration": 2.7429919242858887 + }, + { + "auxiliary_loss_clip": 0.01073046, + "auxiliary_loss_mlp": 0.00749278, + "balance_loss_clip": 1.02984428, + "balance_loss_mlp": 1.00019538, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.6132481623435355, + "language_loss": 0.74854994, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76677322, + "num_input_tokens_seen": 344340405, + "step": 15959, + "time_per_iteration": 2.580548048019409 + }, + { + "auxiliary_loss_clip": 0.01079445, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.03180647, + "balance_loss_mlp": 1.01639271, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 2.113205376398453, + "language_loss": 0.65055084, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67162251, + "num_input_tokens_seen": 344359925, + "step": 15960, + "time_per_iteration": 2.524507761001587 + }, + { + "auxiliary_loss_clip": 0.01094946, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.03378642, + "balance_loss_mlp": 1.02179039, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.6425343012445113, + "language_loss": 0.77908236, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80036008, + "num_input_tokens_seen": 344379100, + "step": 15961, + "time_per_iteration": 2.5494577884674072 + }, + { + "auxiliary_loss_clip": 0.01049752, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.03387022, + "balance_loss_mlp": 1.0153327, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 2.4175141059291354, + "language_loss": 0.76003319, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78079259, + "num_input_tokens_seen": 344396895, + "step": 15962, + "time_per_iteration": 2.6993165016174316 + }, + { + "auxiliary_loss_clip": 0.01078918, + "auxiliary_loss_mlp": 0.0103326, + "balance_loss_clip": 1.03450394, + "balance_loss_mlp": 1.021703, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 2.0257308040075124, + "language_loss": 0.716295, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.7374168, + "num_input_tokens_seen": 344415115, + "step": 15963, + "time_per_iteration": 2.6574013233184814 + }, + { + "auxiliary_loss_clip": 0.01069036, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.03492141, + "balance_loss_mlp": 1.02224326, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.7961725401828237, + "language_loss": 0.73993993, + "learning_rate": 1.689701268270527e-08, + "loss": 0.7609576, + "num_input_tokens_seen": 344435185, + "step": 15964, + "time_per_iteration": 2.7272679805755615 + }, + { + "auxiliary_loss_clip": 0.00981709, + "auxiliary_loss_mlp": 0.01004533, + "balance_loss_clip": 1.00278902, + "balance_loss_mlp": 1.00361526, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.8847338712829186, + "language_loss": 0.57590854, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59577096, + "num_input_tokens_seen": 344488950, + "step": 15965, + "time_per_iteration": 3.170031785964966 + }, + { + "auxiliary_loss_clip": 0.01097389, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.03314507, + "balance_loss_mlp": 1.01923931, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 1.6503402019633746, + "language_loss": 0.7884053, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.80967641, + "num_input_tokens_seen": 344506740, + "step": 15966, + "time_per_iteration": 2.590571403503418 + }, + { + "auxiliary_loss_clip": 0.01067465, + "auxiliary_loss_mlp": 0.01026393, + "balance_loss_clip": 1.02804685, + "balance_loss_mlp": 1.01524079, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.6287362111094752, + "language_loss": 0.79174638, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81268501, + "num_input_tokens_seen": 344526670, + "step": 15967, + "time_per_iteration": 2.6065478324890137 + }, + { + "auxiliary_loss_clip": 0.01035513, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.02949357, + "balance_loss_mlp": 1.01789522, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 1.9510617091673386, + "language_loss": 0.80747634, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82813972, + "num_input_tokens_seen": 344541995, + "step": 15968, + "time_per_iteration": 2.634939432144165 + }, + { + "auxiliary_loss_clip": 0.0104824, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.03221536, + "balance_loss_mlp": 1.02425575, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.716033490300887, + "language_loss": 0.66814923, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.68900555, + "num_input_tokens_seen": 344559980, + "step": 15969, + "time_per_iteration": 2.6296467781066895 + }, + { + "auxiliary_loss_clip": 0.01082254, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.03214979, + "balance_loss_mlp": 1.02305329, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 3.0381756240283275, + "language_loss": 0.79955506, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.82071435, + "num_input_tokens_seen": 344577765, + "step": 15970, + "time_per_iteration": 2.512429714202881 + }, + { + "auxiliary_loss_clip": 0.01084876, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03421104, + "balance_loss_mlp": 1.02021217, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.9987722256034346, + "language_loss": 0.77361679, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79477811, + "num_input_tokens_seen": 344597650, + "step": 15971, + "time_per_iteration": 2.5841219425201416 + }, + { + "auxiliary_loss_clip": 0.01079019, + "auxiliary_loss_mlp": 0.01024902, + "balance_loss_clip": 1.03187776, + "balance_loss_mlp": 1.01336861, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 1.9367779926839572, + "language_loss": 0.67330754, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69434673, + "num_input_tokens_seen": 344613580, + "step": 15972, + "time_per_iteration": 2.5290229320526123 + }, + { + "auxiliary_loss_clip": 0.01084682, + "auxiliary_loss_mlp": 0.00749341, + "balance_loss_clip": 1.0321188, + "balance_loss_mlp": 1.00020683, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.1270699980733467, + "language_loss": 0.76541251, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78375274, + "num_input_tokens_seen": 344626910, + "step": 15973, + "time_per_iteration": 2.543642282485962 + }, + { + "auxiliary_loss_clip": 0.01055036, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.03627539, + "balance_loss_mlp": 1.02634382, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.9574965826265756, + "language_loss": 0.69412446, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71506631, + "num_input_tokens_seen": 344644330, + "step": 15974, + "time_per_iteration": 2.6753599643707275 + }, + { + "auxiliary_loss_clip": 0.01086296, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.03155529, + "balance_loss_mlp": 1.01640904, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.7223139230919164, + "language_loss": 0.68109703, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70223755, + "num_input_tokens_seen": 344663910, + "step": 15975, + "time_per_iteration": 2.498751401901245 + }, + { + "auxiliary_loss_clip": 0.01094352, + "auxiliary_loss_mlp": 0.01026799, + "balance_loss_clip": 1.0336709, + "balance_loss_mlp": 1.01604068, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 1.970989015096897, + "language_loss": 0.55604374, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.57725531, + "num_input_tokens_seen": 344682320, + "step": 15976, + "time_per_iteration": 2.5377871990203857 + }, + { + "auxiliary_loss_clip": 0.01067199, + "auxiliary_loss_mlp": 0.01024092, + "balance_loss_clip": 1.02979815, + "balance_loss_mlp": 1.01382196, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 1.9798673104123923, + "language_loss": 0.6849544, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70586729, + "num_input_tokens_seen": 344701355, + "step": 15977, + "time_per_iteration": 4.130517482757568 + }, + { + "auxiliary_loss_clip": 0.01085369, + "auxiliary_loss_mlp": 0.01037133, + "balance_loss_clip": 1.03177583, + "balance_loss_mlp": 1.0258441, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 2.5252442104828914, + "language_loss": 0.82058954, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84181452, + "num_input_tokens_seen": 344717980, + "step": 15978, + "time_per_iteration": 2.534358024597168 + }, + { + "auxiliary_loss_clip": 0.01088094, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.03246641, + "balance_loss_mlp": 1.01905918, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.4751953400354854, + "language_loss": 0.82959807, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85078502, + "num_input_tokens_seen": 344733480, + "step": 15979, + "time_per_iteration": 2.557035207748413 + }, + { + "auxiliary_loss_clip": 0.01097143, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.03314435, + "balance_loss_mlp": 1.02450848, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.7202372563184178, + "language_loss": 0.79941058, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82073021, + "num_input_tokens_seen": 344752130, + "step": 15980, + "time_per_iteration": 2.5369787216186523 + }, + { + "auxiliary_loss_clip": 0.01088068, + "auxiliary_loss_mlp": 0.01027105, + "balance_loss_clip": 1.03283381, + "balance_loss_mlp": 1.01596522, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 1.8780371302960355, + "language_loss": 0.68360651, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70475817, + "num_input_tokens_seen": 344771195, + "step": 15981, + "time_per_iteration": 2.611327886581421 + }, + { + "auxiliary_loss_clip": 0.01082144, + "auxiliary_loss_mlp": 0.00749277, + "balance_loss_clip": 1.03130841, + "balance_loss_mlp": 1.00024223, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.3796523276503954, + "language_loss": 0.69504297, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71335715, + "num_input_tokens_seen": 344793150, + "step": 15982, + "time_per_iteration": 4.146455764770508 + }, + { + "auxiliary_loss_clip": 0.00996613, + "auxiliary_loss_mlp": 0.01009829, + "balance_loss_clip": 1.00849485, + "balance_loss_mlp": 1.00876808, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6756905732096924, + "language_loss": 0.53285998, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55292439, + "num_input_tokens_seen": 344852855, + "step": 15983, + "time_per_iteration": 3.277796983718872 + }, + { + "auxiliary_loss_clip": 0.01097926, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.03491426, + "balance_loss_mlp": 1.02079654, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.328210849625335, + "language_loss": 0.67460376, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.69590026, + "num_input_tokens_seen": 344869830, + "step": 15984, + "time_per_iteration": 2.4869039058685303 + }, + { + "auxiliary_loss_clip": 0.01062842, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.03369093, + "balance_loss_mlp": 1.01930451, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.682758804044, + "language_loss": 0.67353177, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69445407, + "num_input_tokens_seen": 344888905, + "step": 15985, + "time_per_iteration": 2.650475025177002 + }, + { + "auxiliary_loss_clip": 0.01098403, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03394198, + "balance_loss_mlp": 1.01637006, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 1.8302695388990267, + "language_loss": 0.78895718, + "learning_rate": 1.580380726142283e-08, + "loss": 0.81021565, + "num_input_tokens_seen": 344907160, + "step": 15986, + "time_per_iteration": 3.9730610847473145 + }, + { + "auxiliary_loss_clip": 0.01050404, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.03508568, + "balance_loss_mlp": 1.01822627, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 2.231844199764483, + "language_loss": 0.63835335, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.65915936, + "num_input_tokens_seen": 344922400, + "step": 15987, + "time_per_iteration": 2.721353769302368 + }, + { + "auxiliary_loss_clip": 0.01094558, + "auxiliary_loss_mlp": 0.01024877, + "balance_loss_clip": 1.03405595, + "balance_loss_mlp": 1.01495242, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.8080302324876163, + "language_loss": 0.66775864, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.68895298, + "num_input_tokens_seen": 344941910, + "step": 15988, + "time_per_iteration": 2.517808675765991 + }, + { + "auxiliary_loss_clip": 0.01085758, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.03245687, + "balance_loss_mlp": 1.02573812, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 1.7307808264530113, + "language_loss": 0.74710178, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76832277, + "num_input_tokens_seen": 344960020, + "step": 15989, + "time_per_iteration": 2.507582902908325 + }, + { + "auxiliary_loss_clip": 0.01003388, + "auxiliary_loss_mlp": 0.00999802, + "balance_loss_clip": 1.00370717, + "balance_loss_mlp": 0.99866313, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8308306049316938, + "language_loss": 0.63123548, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65126741, + "num_input_tokens_seen": 345018290, + "step": 15990, + "time_per_iteration": 3.0530478954315186 + }, + { + "auxiliary_loss_clip": 0.01085587, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.03275084, + "balance_loss_mlp": 1.01806176, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 2.531751805991998, + "language_loss": 0.77530944, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.7964536, + "num_input_tokens_seen": 345040235, + "step": 15991, + "time_per_iteration": 2.588459014892578 + }, + { + "auxiliary_loss_clip": 0.01101541, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.03324175, + "balance_loss_mlp": 1.01842082, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.845824193252007, + "language_loss": 0.84753329, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.8688525, + "num_input_tokens_seen": 345054540, + "step": 15992, + "time_per_iteration": 2.530886173248291 + }, + { + "auxiliary_loss_clip": 0.01073266, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.03118241, + "balance_loss_mlp": 1.01631379, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 1.8087529215445257, + "language_loss": 0.72176063, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74277174, + "num_input_tokens_seen": 345074035, + "step": 15993, + "time_per_iteration": 2.628767967224121 + }, + { + "auxiliary_loss_clip": 0.01058244, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.03158677, + "balance_loss_mlp": 1.01857841, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.7055104142806143, + "language_loss": 0.68072563, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70160878, + "num_input_tokens_seen": 345099270, + "step": 15994, + "time_per_iteration": 2.6916682720184326 + }, + { + "auxiliary_loss_clip": 0.0105996, + "auxiliary_loss_mlp": 0.01028685, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.01776004, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.7380415758226804, + "language_loss": 0.84739935, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86828578, + "num_input_tokens_seen": 345116975, + "step": 15995, + "time_per_iteration": 2.6698410511016846 + }, + { + "auxiliary_loss_clip": 0.01090134, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.03436291, + "balance_loss_mlp": 1.01787436, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.7989870665548944, + "language_loss": 0.76038682, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78158236, + "num_input_tokens_seen": 345133645, + "step": 15996, + "time_per_iteration": 2.526416778564453 + }, + { + "auxiliary_loss_clip": 0.01071888, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.03165948, + "balance_loss_mlp": 1.018327, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 1.8737036270732328, + "language_loss": 0.76946926, + "learning_rate": 1.52708595287494e-08, + "loss": 0.7904864, + "num_input_tokens_seen": 345150740, + "step": 15997, + "time_per_iteration": 2.48587703704834 + }, + { + "auxiliary_loss_clip": 0.01093116, + "auxiliary_loss_mlp": 0.00749163, + "balance_loss_clip": 1.03265023, + "balance_loss_mlp": 1.00017798, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.6278970237465862, + "language_loss": 0.67514491, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69356763, + "num_input_tokens_seen": 345170365, + "step": 15998, + "time_per_iteration": 4.133033990859985 + }, + { + "auxiliary_loss_clip": 0.01061885, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.02689052, + "balance_loss_mlp": 1.01710272, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.6780774117845254, + "language_loss": 0.72853011, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.749439, + "num_input_tokens_seen": 345188930, + "step": 15999, + "time_per_iteration": 2.5450804233551025 + }, + { + "auxiliary_loss_clip": 0.01067224, + "auxiliary_loss_mlp": 0.01025555, + "balance_loss_clip": 1.03028464, + "balance_loss_mlp": 1.01560748, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 1.9470384652712889, + "language_loss": 0.65765423, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67858195, + "num_input_tokens_seen": 345209615, + "step": 16000, + "time_per_iteration": 2.5896518230438232 + }, + { + "auxiliary_loss_clip": 0.01061404, + "auxiliary_loss_mlp": 0.01025541, + "balance_loss_clip": 1.03000236, + "balance_loss_mlp": 1.0138824, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 8.705841686228043, + "language_loss": 0.757025, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.7778945, + "num_input_tokens_seen": 345229175, + "step": 16001, + "time_per_iteration": 2.595925807952881 + }, + { + "auxiliary_loss_clip": 0.01083139, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.03191698, + "balance_loss_mlp": 1.01749945, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.8758350458243191, + "language_loss": 0.68521965, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70633882, + "num_input_tokens_seen": 345247815, + "step": 16002, + "time_per_iteration": 2.491647720336914 + }, + { + "auxiliary_loss_clip": 0.01085868, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.03451467, + "balance_loss_mlp": 1.01800597, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.6380367108871556, + "language_loss": 0.64477795, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66592532, + "num_input_tokens_seen": 345269935, + "step": 16003, + "time_per_iteration": 2.572051525115967 + }, + { + "auxiliary_loss_clip": 0.01046615, + "auxiliary_loss_mlp": 0.0103967, + "balance_loss_clip": 1.03292537, + "balance_loss_mlp": 1.0295316, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.7816481627951928, + "language_loss": 0.76091576, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78177857, + "num_input_tokens_seen": 345288310, + "step": 16004, + "time_per_iteration": 2.6098790168762207 + }, + { + "auxiliary_loss_clip": 0.01084797, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.03315616, + "balance_loss_mlp": 1.01782095, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 2.0286561621903805, + "language_loss": 0.79763752, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81877196, + "num_input_tokens_seen": 345306615, + "step": 16005, + "time_per_iteration": 2.5777275562286377 + }, + { + "auxiliary_loss_clip": 0.01082769, + "auxiliary_loss_mlp": 0.01027395, + "balance_loss_clip": 1.03183198, + "balance_loss_mlp": 1.0171783, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 2.424502538866884, + "language_loss": 0.68126136, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.70236301, + "num_input_tokens_seen": 345331935, + "step": 16006, + "time_per_iteration": 2.848362922668457 + }, + { + "auxiliary_loss_clip": 0.01063513, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.03090072, + "balance_loss_mlp": 1.01996887, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.5536440211821032, + "language_loss": 0.78397435, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80491471, + "num_input_tokens_seen": 345351510, + "step": 16007, + "time_per_iteration": 2.6026504039764404 + }, + { + "auxiliary_loss_clip": 0.01099843, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.03519011, + "balance_loss_mlp": 1.01942277, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 2.2397775677553926, + "language_loss": 0.67715335, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.69846332, + "num_input_tokens_seen": 345367750, + "step": 16008, + "time_per_iteration": 2.4668753147125244 + }, + { + "auxiliary_loss_clip": 0.01078084, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.03556919, + "balance_loss_mlp": 1.0202527, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.4210415123979976, + "language_loss": 0.73209524, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75319999, + "num_input_tokens_seen": 345384790, + "step": 16009, + "time_per_iteration": 2.6030101776123047 + }, + { + "auxiliary_loss_clip": 0.01082905, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.0314002, + "balance_loss_mlp": 1.01750302, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 2.80039952351979, + "language_loss": 0.75470614, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77581739, + "num_input_tokens_seen": 345403390, + "step": 16010, + "time_per_iteration": 2.703737258911133 + }, + { + "auxiliary_loss_clip": 0.0109145, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.03609967, + "balance_loss_mlp": 1.01941121, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.9193574945141358, + "language_loss": 0.69680059, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71804869, + "num_input_tokens_seen": 345418685, + "step": 16011, + "time_per_iteration": 2.4547011852264404 + }, + { + "auxiliary_loss_clip": 0.0108497, + "auxiliary_loss_mlp": 0.01029299, + "balance_loss_clip": 1.03386164, + "balance_loss_mlp": 1.01907659, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.7432861899769925, + "language_loss": 0.68365169, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70479435, + "num_input_tokens_seen": 345442380, + "step": 16012, + "time_per_iteration": 2.984013319015503 + }, + { + "auxiliary_loss_clip": 0.01076311, + "auxiliary_loss_mlp": 0.01038809, + "balance_loss_clip": 1.03137016, + "balance_loss_mlp": 1.02569628, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 1.9929326945157781, + "language_loss": 0.72349155, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74464273, + "num_input_tokens_seen": 345463815, + "step": 16013, + "time_per_iteration": 2.6589596271514893 + }, + { + "auxiliary_loss_clip": 0.01065114, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.03301311, + "balance_loss_mlp": 1.01950383, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.446737989281684, + "language_loss": 0.63221544, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65318, + "num_input_tokens_seen": 345484525, + "step": 16014, + "time_per_iteration": 2.7902426719665527 + }, + { + "auxiliary_loss_clip": 0.01071597, + "auxiliary_loss_mlp": 0.01024554, + "balance_loss_clip": 1.03271174, + "balance_loss_mlp": 1.01544595, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 2.110910619641734, + "language_loss": 0.7177155, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73867702, + "num_input_tokens_seen": 345508295, + "step": 16015, + "time_per_iteration": 2.747267484664917 + }, + { + "auxiliary_loss_clip": 0.01056864, + "auxiliary_loss_mlp": 0.01028243, + "balance_loss_clip": 1.0286411, + "balance_loss_mlp": 1.01716292, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 1.9337182784408151, + "language_loss": 0.76808834, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.78893942, + "num_input_tokens_seen": 345525155, + "step": 16016, + "time_per_iteration": 2.600250005722046 + }, + { + "auxiliary_loss_clip": 0.01022667, + "auxiliary_loss_mlp": 0.01000925, + "balance_loss_clip": 1.00290751, + "balance_loss_mlp": 1.00001264, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8129406255862098, + "language_loss": 0.63153124, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65176713, + "num_input_tokens_seen": 345578905, + "step": 16017, + "time_per_iteration": 4.453543186187744 + }, + { + "auxiliary_loss_clip": 0.01081227, + "auxiliary_loss_mlp": 0.01026603, + "balance_loss_clip": 1.03543091, + "balance_loss_mlp": 1.01577902, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 1.9444454566268135, + "language_loss": 0.66349173, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68457007, + "num_input_tokens_seen": 345598965, + "step": 16018, + "time_per_iteration": 2.646132230758667 + }, + { + "auxiliary_loss_clip": 0.01046729, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03230202, + "balance_loss_mlp": 1.01874042, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 1.9301100091719223, + "language_loss": 0.79251993, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.8132838, + "num_input_tokens_seen": 345617945, + "step": 16019, + "time_per_iteration": 2.626027822494507 + }, + { + "auxiliary_loss_clip": 0.01056626, + "auxiliary_loss_mlp": 0.01026621, + "balance_loss_clip": 1.0295645, + "balance_loss_mlp": 1.01683366, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.4749809150861695, + "language_loss": 0.71998477, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.74081725, + "num_input_tokens_seen": 345637920, + "step": 16020, + "time_per_iteration": 2.7358314990997314 + }, + { + "auxiliary_loss_clip": 0.01075212, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.03397489, + "balance_loss_mlp": 1.01529908, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.6479061460926023, + "language_loss": 0.76902688, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.79003453, + "num_input_tokens_seen": 345656195, + "step": 16021, + "time_per_iteration": 2.6841671466827393 + }, + { + "auxiliary_loss_clip": 0.01063156, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.03173614, + "balance_loss_mlp": 1.0175302, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 2.4985209939320825, + "language_loss": 0.64922917, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.67016709, + "num_input_tokens_seen": 345676700, + "step": 16022, + "time_per_iteration": 2.6625070571899414 + }, + { + "auxiliary_loss_clip": 0.01064651, + "auxiliary_loss_mlp": 0.01033752, + "balance_loss_clip": 1.0291481, + "balance_loss_mlp": 1.02288055, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 1.8749014471613883, + "language_loss": 0.73262942, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75361347, + "num_input_tokens_seen": 345696725, + "step": 16023, + "time_per_iteration": 4.177541255950928 + }, + { + "auxiliary_loss_clip": 0.01078309, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.03100169, + "balance_loss_mlp": 1.01889801, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 2.253985753486907, + "language_loss": 0.81510717, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83619982, + "num_input_tokens_seen": 345716245, + "step": 16024, + "time_per_iteration": 2.595808744430542 + }, + { + "auxiliary_loss_clip": 0.01090419, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.03392422, + "balance_loss_mlp": 1.02005219, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 1.4114288033805045, + "language_loss": 0.81553864, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83675742, + "num_input_tokens_seen": 345739060, + "step": 16025, + "time_per_iteration": 2.6253139972686768 + }, + { + "auxiliary_loss_clip": 0.01088523, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.03246641, + "balance_loss_mlp": 1.01728654, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 1.7639858380294053, + "language_loss": 0.76345801, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.7846272, + "num_input_tokens_seen": 345758325, + "step": 16026, + "time_per_iteration": 2.557335376739502 + }, + { + "auxiliary_loss_clip": 0.01061387, + "auxiliary_loss_mlp": 0.00749712, + "balance_loss_clip": 1.03114939, + "balance_loss_mlp": 1.00030696, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.7564955914081282, + "language_loss": 0.63249052, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65060145, + "num_input_tokens_seen": 345778530, + "step": 16027, + "time_per_iteration": 4.193771839141846 + }, + { + "auxiliary_loss_clip": 0.01085465, + "auxiliary_loss_mlp": 0.01029671, + "balance_loss_clip": 1.03290403, + "balance_loss_mlp": 1.01801777, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 1.768718526996373, + "language_loss": 0.87499684, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.8961482, + "num_input_tokens_seen": 345796535, + "step": 16028, + "time_per_iteration": 2.5032172203063965 + }, + { + "auxiliary_loss_clip": 0.00986673, + "auxiliary_loss_mlp": 0.00999009, + "balance_loss_clip": 1.01282883, + "balance_loss_mlp": 0.99803776, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.7707119889515102, + "language_loss": 0.53153551, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55139232, + "num_input_tokens_seen": 345859700, + "step": 16029, + "time_per_iteration": 3.1791296005249023 + }, + { + "auxiliary_loss_clip": 0.0109699, + "auxiliary_loss_mlp": 0.0102779, + "balance_loss_clip": 1.03279603, + "balance_loss_mlp": 1.01686406, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 3.865686564457258, + "language_loss": 0.73618877, + "learning_rate": 1.372666546129797e-08, + "loss": 0.75743657, + "num_input_tokens_seen": 345878760, + "step": 16030, + "time_per_iteration": 2.510859966278076 + }, + { + "auxiliary_loss_clip": 0.01069118, + "auxiliary_loss_mlp": 0.010285, + "balance_loss_clip": 1.03200579, + "balance_loss_mlp": 1.01823044, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 1.6226796335268847, + "language_loss": 0.65764713, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.67862332, + "num_input_tokens_seen": 345900445, + "step": 16031, + "time_per_iteration": 2.685682773590088 + }, + { + "auxiliary_loss_clip": 0.01011805, + "auxiliary_loss_mlp": 0.00746571, + "balance_loss_clip": 1.00266743, + "balance_loss_mlp": 0.99975157, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8320445947732347, + "language_loss": 0.6077075, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62529129, + "num_input_tokens_seen": 345961020, + "step": 16032, + "time_per_iteration": 3.1372718811035156 + }, + { + "auxiliary_loss_clip": 0.01077925, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.03136444, + "balance_loss_mlp": 1.01816511, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.718668042324733, + "language_loss": 0.66093814, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68199313, + "num_input_tokens_seen": 345980210, + "step": 16033, + "time_per_iteration": 2.6175360679626465 + }, + { + "auxiliary_loss_clip": 0.01037716, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.03144419, + "balance_loss_mlp": 1.01790261, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.6397614553858888, + "language_loss": 0.65385938, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67452282, + "num_input_tokens_seen": 345998280, + "step": 16034, + "time_per_iteration": 2.700754404067993 + }, + { + "auxiliary_loss_clip": 0.0104983, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.03025424, + "balance_loss_mlp": 1.01891327, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.5090122595555417, + "language_loss": 0.74297369, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.76377308, + "num_input_tokens_seen": 346015545, + "step": 16035, + "time_per_iteration": 2.6675214767456055 + }, + { + "auxiliary_loss_clip": 0.01099646, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.03682065, + "balance_loss_mlp": 1.01524353, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 1.9367897064680986, + "language_loss": 0.82034159, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84160221, + "num_input_tokens_seen": 346034055, + "step": 16036, + "time_per_iteration": 2.542847156524658 + }, + { + "auxiliary_loss_clip": 0.01065789, + "auxiliary_loss_mlp": 0.01030036, + "balance_loss_clip": 1.03073025, + "balance_loss_mlp": 1.01878905, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 2.0266390758460373, + "language_loss": 0.69594085, + "learning_rate": 1.340965177371789e-08, + "loss": 0.7168991, + "num_input_tokens_seen": 346054130, + "step": 16037, + "time_per_iteration": 2.622985601425171 + }, + { + "auxiliary_loss_clip": 0.01096679, + "auxiliary_loss_mlp": 0.01024405, + "balance_loss_clip": 1.03290653, + "balance_loss_mlp": 1.01383734, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.7897377694340524, + "language_loss": 0.62830776, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.64951861, + "num_input_tokens_seen": 346072990, + "step": 16038, + "time_per_iteration": 4.02214503288269 + }, + { + "auxiliary_loss_clip": 0.01061987, + "auxiliary_loss_mlp": 0.00749737, + "balance_loss_clip": 1.03193796, + "balance_loss_mlp": 1.00028849, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 1.6123482462048844, + "language_loss": 0.71011806, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.72823524, + "num_input_tokens_seen": 346093745, + "step": 16039, + "time_per_iteration": 2.6859872341156006 + }, + { + "auxiliary_loss_clip": 0.01052303, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.03055263, + "balance_loss_mlp": 1.01934576, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.040550818715832, + "language_loss": 0.73259711, + "learning_rate": 1.327491870605657e-08, + "loss": 0.75342673, + "num_input_tokens_seen": 346110115, + "step": 16040, + "time_per_iteration": 2.669255256652832 + }, + { + "auxiliary_loss_clip": 0.01087365, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.03254986, + "balance_loss_mlp": 1.02131987, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 1.8363704148487034, + "language_loss": 0.73345655, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75465864, + "num_input_tokens_seen": 346127165, + "step": 16041, + "time_per_iteration": 2.5547432899475098 + }, + { + "auxiliary_loss_clip": 0.01068445, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.03135109, + "balance_loss_mlp": 1.02100766, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 2.130848095558395, + "language_loss": 0.7178849, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.73888344, + "num_input_tokens_seen": 346145950, + "step": 16042, + "time_per_iteration": 2.6051785945892334 + }, + { + "auxiliary_loss_clip": 0.01056524, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.03034508, + "balance_loss_mlp": 1.01897407, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.8659949901150688, + "language_loss": 0.81030631, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83117259, + "num_input_tokens_seen": 346165005, + "step": 16043, + "time_per_iteration": 2.643906593322754 + }, + { + "auxiliary_loss_clip": 0.01067069, + "auxiliary_loss_mlp": 0.01027046, + "balance_loss_clip": 1.0315969, + "balance_loss_mlp": 1.0167222, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.4504974780359259, + "language_loss": 0.71768653, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73862767, + "num_input_tokens_seen": 346185095, + "step": 16044, + "time_per_iteration": 2.600520610809326 + }, + { + "auxiliary_loss_clip": 0.0106902, + "auxiliary_loss_mlp": 0.01025488, + "balance_loss_clip": 1.029778, + "balance_loss_mlp": 1.01435947, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 1.7308458496973824, + "language_loss": 0.70076954, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72171462, + "num_input_tokens_seen": 346202580, + "step": 16045, + "time_per_iteration": 2.5915608406066895 + }, + { + "auxiliary_loss_clip": 0.01025722, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.02956712, + "balance_loss_mlp": 1.02191567, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 2.1153505759900115, + "language_loss": 0.75182235, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.7724216, + "num_input_tokens_seen": 346219395, + "step": 16046, + "time_per_iteration": 2.6641952991485596 + }, + { + "auxiliary_loss_clip": 0.01090267, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.03399539, + "balance_loss_mlp": 1.02016473, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.6591067472219454, + "language_loss": 0.62498248, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64620697, + "num_input_tokens_seen": 346239715, + "step": 16047, + "time_per_iteration": 2.5737545490264893 + }, + { + "auxiliary_loss_clip": 0.01078352, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.03688121, + "balance_loss_mlp": 1.02053094, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.9291012385010153, + "language_loss": 0.69245124, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71354973, + "num_input_tokens_seen": 346258500, + "step": 16048, + "time_per_iteration": 2.6177947521209717 + }, + { + "auxiliary_loss_clip": 0.01087505, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.03266644, + "balance_loss_mlp": 1.01796865, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 1.7689335391505367, + "language_loss": 0.63932037, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66049141, + "num_input_tokens_seen": 346279110, + "step": 16049, + "time_per_iteration": 2.642047882080078 + }, + { + "auxiliary_loss_clip": 0.01088695, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.03505373, + "balance_loss_mlp": 1.01941538, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 3.4882612370792576, + "language_loss": 0.70565742, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.72684813, + "num_input_tokens_seen": 346297860, + "step": 16050, + "time_per_iteration": 2.56319260597229 + }, + { + "auxiliary_loss_clip": 0.01084846, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.03019547, + "balance_loss_mlp": 1.02458715, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 2.6148849094341884, + "language_loss": 0.69934386, + "learning_rate": 1.278669873970606e-08, + "loss": 0.7205655, + "num_input_tokens_seen": 346319860, + "step": 16051, + "time_per_iteration": 2.731876850128174 + }, + { + "auxiliary_loss_clip": 0.01013091, + "auxiliary_loss_mlp": 0.01004917, + "balance_loss_clip": 1.00341499, + "balance_loss_mlp": 1.00401711, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8433955697759277, + "language_loss": 0.59170705, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61188704, + "num_input_tokens_seen": 346379025, + "step": 16052, + "time_per_iteration": 3.106844902038574 + }, + { + "auxiliary_loss_clip": 0.01092284, + "auxiliary_loss_mlp": 0.01025969, + "balance_loss_clip": 1.03096902, + "balance_loss_mlp": 1.01516294, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.8928788448213796, + "language_loss": 0.74844861, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76963115, + "num_input_tokens_seen": 346402250, + "step": 16053, + "time_per_iteration": 2.6301097869873047 + }, + { + "auxiliary_loss_clip": 0.0107075, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.03287053, + "balance_loss_mlp": 1.01891553, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.1411877622996234, + "language_loss": 0.68705845, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70806682, + "num_input_tokens_seen": 346419555, + "step": 16054, + "time_per_iteration": 2.6128482818603516 + }, + { + "auxiliary_loss_clip": 0.01081419, + "auxiliary_loss_mlp": 0.00749295, + "balance_loss_clip": 1.03516126, + "balance_loss_mlp": 1.00023246, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.4111876905613039, + "language_loss": 0.62024879, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.63855594, + "num_input_tokens_seen": 346441245, + "step": 16055, + "time_per_iteration": 2.633415699005127 + }, + { + "auxiliary_loss_clip": 0.01055478, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.03088605, + "balance_loss_mlp": 1.02503872, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.7954579791780514, + "language_loss": 0.76505661, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.78597105, + "num_input_tokens_seen": 346460065, + "step": 16056, + "time_per_iteration": 2.612031936645508 + }, + { + "auxiliary_loss_clip": 0.01066941, + "auxiliary_loss_mlp": 0.01025207, + "balance_loss_clip": 1.03042042, + "balance_loss_mlp": 1.01465654, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.6274270165003133, + "language_loss": 0.7149775, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73589897, + "num_input_tokens_seen": 346478005, + "step": 16057, + "time_per_iteration": 4.061199903488159 + }, + { + "auxiliary_loss_clip": 0.01094092, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.03231668, + "balance_loss_mlp": 1.01974261, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 2.1725616182633365, + "language_loss": 0.71627539, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.73751771, + "num_input_tokens_seen": 346497575, + "step": 16058, + "time_per_iteration": 2.4997777938842773 + }, + { + "auxiliary_loss_clip": 0.01084692, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.03264415, + "balance_loss_mlp": 1.02240264, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.4589851627671027, + "language_loss": 0.73966902, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76084787, + "num_input_tokens_seen": 346520000, + "step": 16059, + "time_per_iteration": 2.633542776107788 + }, + { + "auxiliary_loss_clip": 0.01076826, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.03170598, + "balance_loss_mlp": 1.02053916, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 1.8398408794354195, + "language_loss": 0.73921788, + "learning_rate": 1.239402791721722e-08, + "loss": 0.76030183, + "num_input_tokens_seen": 346541605, + "step": 16060, + "time_per_iteration": 2.822148084640503 + }, + { + "auxiliary_loss_clip": 0.0107288, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.03253758, + "balance_loss_mlp": 1.01745629, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.6349075184988726, + "language_loss": 0.766958, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78796077, + "num_input_tokens_seen": 346560955, + "step": 16061, + "time_per_iteration": 2.6272194385528564 + }, + { + "auxiliary_loss_clip": 0.01005567, + "auxiliary_loss_mlp": 0.01000705, + "balance_loss_clip": 1.00514114, + "balance_loss_mlp": 0.99973315, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7263700903969963, + "language_loss": 0.64165097, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66171366, + "num_input_tokens_seen": 346621615, + "step": 16062, + "time_per_iteration": 4.669877290725708 + }, + { + "auxiliary_loss_clip": 0.01040849, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.02652061, + "balance_loss_mlp": 1.0165447, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.347723429836096, + "language_loss": 0.93350351, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95417649, + "num_input_tokens_seen": 346637460, + "step": 16063, + "time_per_iteration": 2.6316895484924316 + }, + { + "auxiliary_loss_clip": 0.01085603, + "auxiliary_loss_mlp": 0.01031005, + "balance_loss_clip": 1.03288603, + "balance_loss_mlp": 1.0198406, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 2.065215711345234, + "language_loss": 0.82129645, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84246254, + "num_input_tokens_seen": 346655625, + "step": 16064, + "time_per_iteration": 2.5481858253479004 + }, + { + "auxiliary_loss_clip": 0.01082558, + "auxiliary_loss_mlp": 0.00749175, + "balance_loss_clip": 1.03505158, + "balance_loss_mlp": 1.0002501, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 1.536966142223168, + "language_loss": 0.84248567, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86080301, + "num_input_tokens_seen": 346675220, + "step": 16065, + "time_per_iteration": 2.6419641971588135 + }, + { + "auxiliary_loss_clip": 0.01071977, + "auxiliary_loss_mlp": 0.01027816, + "balance_loss_clip": 1.03102708, + "balance_loss_mlp": 1.01664031, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.694178838026426, + "language_loss": 0.67297745, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69397539, + "num_input_tokens_seen": 346694710, + "step": 16066, + "time_per_iteration": 2.6170871257781982 + }, + { + "auxiliary_loss_clip": 0.01096146, + "auxiliary_loss_mlp": 0.01023688, + "balance_loss_clip": 1.03259301, + "balance_loss_mlp": 1.01301265, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 1.9290365210239007, + "language_loss": 0.82198036, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84317875, + "num_input_tokens_seen": 346712645, + "step": 16067, + "time_per_iteration": 4.034431219100952 + }, + { + "auxiliary_loss_clip": 0.01074914, + "auxiliary_loss_mlp": 0.01025879, + "balance_loss_clip": 1.03292644, + "balance_loss_mlp": 1.01491189, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 1.9416945603184794, + "language_loss": 0.69333255, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71434045, + "num_input_tokens_seen": 346732375, + "step": 16068, + "time_per_iteration": 2.583305835723877 + }, + { + "auxiliary_loss_clip": 0.0107352, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.0307858, + "balance_loss_mlp": 1.01886463, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.8062370738506905, + "language_loss": 0.68321609, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70423448, + "num_input_tokens_seen": 346750430, + "step": 16069, + "time_per_iteration": 2.546034812927246 + }, + { + "auxiliary_loss_clip": 0.01077913, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.03519535, + "balance_loss_mlp": 1.01756668, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.848674928540221, + "language_loss": 0.88807225, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.90913439, + "num_input_tokens_seen": 346768455, + "step": 16070, + "time_per_iteration": 2.5756161212921143 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.0364902, + "balance_loss_mlp": 1.02072501, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 2.666757741479711, + "language_loss": 0.77610636, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.7974329, + "num_input_tokens_seen": 346786530, + "step": 16071, + "time_per_iteration": 2.50141978263855 + }, + { + "auxiliary_loss_clip": 0.01069088, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.03046048, + "balance_loss_mlp": 1.01962709, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 2.036176051374521, + "language_loss": 0.65777159, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.67878425, + "num_input_tokens_seen": 346804635, + "step": 16072, + "time_per_iteration": 2.5322301387786865 + }, + { + "auxiliary_loss_clip": 0.01089367, + "auxiliary_loss_mlp": 0.01026558, + "balance_loss_clip": 1.03422391, + "balance_loss_mlp": 1.01606131, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.603069649135929, + "language_loss": 0.77485335, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79601252, + "num_input_tokens_seen": 346823070, + "step": 16073, + "time_per_iteration": 2.5797674655914307 + }, + { + "auxiliary_loss_clip": 0.01101582, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.03480983, + "balance_loss_mlp": 1.02128816, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 2.4829628649555118, + "language_loss": 0.75729489, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.77863818, + "num_input_tokens_seen": 346841180, + "step": 16074, + "time_per_iteration": 2.503758192062378 + }, + { + "auxiliary_loss_clip": 0.01074938, + "auxiliary_loss_mlp": 0.01028486, + "balance_loss_clip": 1.0330199, + "balance_loss_mlp": 1.01727474, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.6675502878347457, + "language_loss": 0.75646555, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77749979, + "num_input_tokens_seen": 346864250, + "step": 16075, + "time_per_iteration": 2.6318187713623047 + }, + { + "auxiliary_loss_clip": 0.01067477, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.03420782, + "balance_loss_mlp": 1.01870394, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 1.8259124385805583, + "language_loss": 0.79005527, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81102145, + "num_input_tokens_seen": 346881955, + "step": 16076, + "time_per_iteration": 2.670978307723999 + }, + { + "auxiliary_loss_clip": 0.01079716, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.03650153, + "balance_loss_mlp": 1.02710485, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.8865135338154975, + "language_loss": 0.72200453, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74318802, + "num_input_tokens_seen": 346900445, + "step": 16077, + "time_per_iteration": 2.6512293815612793 + }, + { + "auxiliary_loss_clip": 0.01079248, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.03065705, + "balance_loss_mlp": 1.01980901, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.6812016506993146, + "language_loss": 0.59262502, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61373013, + "num_input_tokens_seen": 346920135, + "step": 16078, + "time_per_iteration": 2.5870065689086914 + }, + { + "auxiliary_loss_clip": 0.0108792, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.03330338, + "balance_loss_mlp": 1.01960194, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.8466431243258639, + "language_loss": 0.7241478, + "learning_rate": 1.158510609718899e-08, + "loss": 0.7453388, + "num_input_tokens_seen": 346940450, + "step": 16079, + "time_per_iteration": 4.07953143119812 + }, + { + "auxiliary_loss_clip": 0.01081444, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.03138947, + "balance_loss_mlp": 1.01677287, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.6231657283686394, + "language_loss": 0.72146904, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74255347, + "num_input_tokens_seen": 346960935, + "step": 16080, + "time_per_iteration": 2.594324827194214 + }, + { + "auxiliary_loss_clip": 0.01059636, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02908468, + "balance_loss_mlp": 1.02030635, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 2.1113279958317617, + "language_loss": 0.74266481, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.76358271, + "num_input_tokens_seen": 346980100, + "step": 16081, + "time_per_iteration": 2.566253423690796 + }, + { + "auxiliary_loss_clip": 0.01066285, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.02953291, + "balance_loss_mlp": 1.01587868, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.617413560538083, + "language_loss": 0.67233145, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69326603, + "num_input_tokens_seen": 347001250, + "step": 16082, + "time_per_iteration": 2.6445987224578857 + }, + { + "auxiliary_loss_clip": 0.01049212, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.02902699, + "balance_loss_mlp": 1.02190626, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 2.03476570111835, + "language_loss": 0.76820827, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78904426, + "num_input_tokens_seen": 347022975, + "step": 16083, + "time_per_iteration": 2.6877899169921875 + }, + { + "auxiliary_loss_clip": 0.01048165, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.03021383, + "balance_loss_mlp": 1.01780009, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 3.1638827600019197, + "language_loss": 0.79196638, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81274003, + "num_input_tokens_seen": 347038780, + "step": 16084, + "time_per_iteration": 2.6411521434783936 + }, + { + "auxiliary_loss_clip": 0.01089316, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.03228951, + "balance_loss_mlp": 1.01743007, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 3.263775955001632, + "language_loss": 0.68025863, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.70144761, + "num_input_tokens_seen": 347056705, + "step": 16085, + "time_per_iteration": 2.454496145248413 + }, + { + "auxiliary_loss_clip": 0.01078475, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.03435636, + "balance_loss_mlp": 1.01793122, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 1.9409378203016685, + "language_loss": 0.68582845, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.70691311, + "num_input_tokens_seen": 347075710, + "step": 16086, + "time_per_iteration": 2.6535933017730713 + }, + { + "auxiliary_loss_clip": 0.01081803, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.03170586, + "balance_loss_mlp": 1.02154589, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.6577752594595756, + "language_loss": 0.78204566, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80319345, + "num_input_tokens_seen": 347092325, + "step": 16087, + "time_per_iteration": 2.504192352294922 + }, + { + "auxiliary_loss_clip": 0.01066112, + "auxiliary_loss_mlp": 0.01025476, + "balance_loss_clip": 1.0306977, + "balance_loss_mlp": 1.01469982, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.8346118820791641, + "language_loss": 0.71271545, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73363131, + "num_input_tokens_seen": 347110595, + "step": 16088, + "time_per_iteration": 2.580796480178833 + }, + { + "auxiliary_loss_clip": 0.01095475, + "auxiliary_loss_mlp": 0.00749336, + "balance_loss_clip": 1.03416312, + "balance_loss_mlp": 1.00020623, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.3906063875997776, + "language_loss": 0.70215642, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72060454, + "num_input_tokens_seen": 347131625, + "step": 16089, + "time_per_iteration": 2.542369842529297 + }, + { + "auxiliary_loss_clip": 0.01099476, + "auxiliary_loss_mlp": 0.0103122, + "balance_loss_clip": 1.03415918, + "balance_loss_mlp": 1.02058673, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.804466994896894, + "language_loss": 0.74933851, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.7706455, + "num_input_tokens_seen": 347147910, + "step": 16090, + "time_per_iteration": 2.460352659225464 + }, + { + "auxiliary_loss_clip": 0.01077476, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.03401387, + "balance_loss_mlp": 1.01837325, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.957597425566987, + "language_loss": 0.68716407, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.70823157, + "num_input_tokens_seen": 347168805, + "step": 16091, + "time_per_iteration": 2.65474796295166 + }, + { + "auxiliary_loss_clip": 0.01095636, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.03276658, + "balance_loss_mlp": 1.02083945, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.88134198401637, + "language_loss": 0.7727747, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79405153, + "num_input_tokens_seen": 347189455, + "step": 16092, + "time_per_iteration": 2.505357027053833 + }, + { + "auxiliary_loss_clip": 0.01097571, + "auxiliary_loss_mlp": 0.01026666, + "balance_loss_clip": 1.03483427, + "balance_loss_mlp": 1.01641369, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.8452329672895327, + "language_loss": 0.76683235, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78807473, + "num_input_tokens_seen": 347206030, + "step": 16093, + "time_per_iteration": 2.467764139175415 + }, + { + "auxiliary_loss_clip": 0.01071253, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.03241897, + "balance_loss_mlp": 1.0156486, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.5736876216228641, + "language_loss": 0.69111276, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71210045, + "num_input_tokens_seen": 347226250, + "step": 16094, + "time_per_iteration": 2.5962579250335693 + }, + { + "auxiliary_loss_clip": 0.01088732, + "auxiliary_loss_mlp": 0.01027037, + "balance_loss_clip": 1.03372765, + "balance_loss_mlp": 1.01651073, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.520896906235884, + "language_loss": 0.75964814, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.78080583, + "num_input_tokens_seen": 347247350, + "step": 16095, + "time_per_iteration": 2.5927438735961914 + }, + { + "auxiliary_loss_clip": 0.01102545, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.03551078, + "balance_loss_mlp": 1.02289915, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 1.7091241006706583, + "language_loss": 0.70156634, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72293699, + "num_input_tokens_seen": 347266870, + "step": 16096, + "time_per_iteration": 2.5533618927001953 + }, + { + "auxiliary_loss_clip": 0.01076593, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.03291976, + "balance_loss_mlp": 1.01509821, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 1.9505298103114463, + "language_loss": 0.71888775, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73991418, + "num_input_tokens_seen": 347290120, + "step": 16097, + "time_per_iteration": 4.305117845535278 + }, + { + "auxiliary_loss_clip": 0.01096545, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.03386164, + "balance_loss_mlp": 1.01996636, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.628725131967938, + "language_loss": 0.78094876, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80222166, + "num_input_tokens_seen": 347308785, + "step": 16098, + "time_per_iteration": 2.50516676902771 + }, + { + "auxiliary_loss_clip": 0.01061651, + "auxiliary_loss_mlp": 0.01028666, + "balance_loss_clip": 1.03296781, + "balance_loss_mlp": 1.01844943, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 1.7850793664649418, + "language_loss": 0.90788478, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92878795, + "num_input_tokens_seen": 347326375, + "step": 16099, + "time_per_iteration": 2.56679368019104 + }, + { + "auxiliary_loss_clip": 0.01086025, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.03209901, + "balance_loss_mlp": 1.01885843, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 1.694455853650918, + "language_loss": 0.65928292, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68044275, + "num_input_tokens_seen": 347348250, + "step": 16100, + "time_per_iteration": 2.662705183029175 + }, + { + "auxiliary_loss_clip": 0.01059806, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.03550851, + "balance_loss_mlp": 1.01858115, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.4606420040561274, + "language_loss": 0.73410529, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.755, + "num_input_tokens_seen": 347367400, + "step": 16101, + "time_per_iteration": 2.612586736679077 + }, + { + "auxiliary_loss_clip": 0.01074174, + "auxiliary_loss_mlp": 0.01027986, + "balance_loss_clip": 1.03222597, + "balance_loss_mlp": 1.01688743, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.6566516898400174, + "language_loss": 0.73675328, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75777489, + "num_input_tokens_seen": 347387600, + "step": 16102, + "time_per_iteration": 4.0924859046936035 + }, + { + "auxiliary_loss_clip": 0.01064066, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.03684735, + "balance_loss_mlp": 1.01997459, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 1.7450703798818, + "language_loss": 0.77433169, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79529375, + "num_input_tokens_seen": 347406915, + "step": 16103, + "time_per_iteration": 2.662287950515747 + }, + { + "auxiliary_loss_clip": 0.01072857, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.03111708, + "balance_loss_mlp": 1.01658821, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.5940568086935973, + "language_loss": 0.80462283, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82562101, + "num_input_tokens_seen": 347425140, + "step": 16104, + "time_per_iteration": 2.6198582649230957 + }, + { + "auxiliary_loss_clip": 0.01074373, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.0288583, + "balance_loss_mlp": 1.02135253, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.769821330572125, + "language_loss": 0.77587563, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79693174, + "num_input_tokens_seen": 347446350, + "step": 16105, + "time_per_iteration": 2.56687593460083 + }, + { + "auxiliary_loss_clip": 0.00992206, + "auxiliary_loss_mlp": 0.00999456, + "balance_loss_clip": 1.00436044, + "balance_loss_mlp": 0.99849075, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8144102457591084, + "language_loss": 0.56748116, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58739781, + "num_input_tokens_seen": 347510135, + "step": 16106, + "time_per_iteration": 3.230252265930176 + }, + { + "auxiliary_loss_clip": 0.0099609, + "auxiliary_loss_mlp": 0.00999672, + "balance_loss_clip": 1.00531793, + "balance_loss_mlp": 0.9987424, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.96433624961883, + "language_loss": 0.61611581, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63607347, + "num_input_tokens_seen": 347562505, + "step": 16107, + "time_per_iteration": 4.612572908401489 + }, + { + "auxiliary_loss_clip": 0.01086952, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.03325272, + "balance_loss_mlp": 1.02143145, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.0852220208914587, + "language_loss": 0.74048585, + "learning_rate": 1.040291854638875e-08, + "loss": 0.7616964, + "num_input_tokens_seen": 347579150, + "step": 16108, + "time_per_iteration": 2.576066017150879 + }, + { + "auxiliary_loss_clip": 0.01079327, + "auxiliary_loss_mlp": 0.01025039, + "balance_loss_clip": 1.03194761, + "balance_loss_mlp": 1.01302886, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.2602545538068717, + "language_loss": 0.57130277, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59234643, + "num_input_tokens_seen": 347596705, + "step": 16109, + "time_per_iteration": 2.5406954288482666 + }, + { + "auxiliary_loss_clip": 0.01012654, + "auxiliary_loss_mlp": 0.01000286, + "balance_loss_clip": 1.00245857, + "balance_loss_mlp": 0.99937451, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.7192157151464942, + "language_loss": 0.54287875, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56300807, + "num_input_tokens_seen": 347661870, + "step": 16110, + "time_per_iteration": 3.094003677368164 + }, + { + "auxiliary_loss_clip": 0.01024242, + "auxiliary_loss_mlp": 0.01041275, + "balance_loss_clip": 1.02842867, + "balance_loss_mlp": 1.02781582, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.4685585291110372, + "language_loss": 0.62494326, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64559841, + "num_input_tokens_seen": 347684295, + "step": 16111, + "time_per_iteration": 2.8124475479125977 + }, + { + "auxiliary_loss_clip": 0.01072828, + "auxiliary_loss_mlp": 0.01025756, + "balance_loss_clip": 1.03115904, + "balance_loss_mlp": 1.01603484, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 3.1747631943331407, + "language_loss": 0.74844962, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76943552, + "num_input_tokens_seen": 347702585, + "step": 16112, + "time_per_iteration": 2.611210584640503 + }, + { + "auxiliary_loss_clip": 0.01086179, + "auxiliary_loss_mlp": 0.01025393, + "balance_loss_clip": 1.03383279, + "balance_loss_mlp": 1.01543319, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 2.1403743473584664, + "language_loss": 0.6681478, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68926352, + "num_input_tokens_seen": 347721810, + "step": 16113, + "time_per_iteration": 2.531627893447876 + }, + { + "auxiliary_loss_clip": 0.01013374, + "auxiliary_loss_mlp": 0.00999574, + "balance_loss_clip": 1.00396252, + "balance_loss_mlp": 0.99863857, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.7002544870890998, + "language_loss": 0.56516588, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58529544, + "num_input_tokens_seen": 347782330, + "step": 16114, + "time_per_iteration": 3.1160638332366943 + }, + { + "auxiliary_loss_clip": 0.01068978, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.03252304, + "balance_loss_mlp": 1.02228534, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 2.0839982245329645, + "language_loss": 0.82733381, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84835804, + "num_input_tokens_seen": 347794835, + "step": 16115, + "time_per_iteration": 2.52382493019104 + }, + { + "auxiliary_loss_clip": 0.01081303, + "auxiliary_loss_mlp": 0.01026325, + "balance_loss_clip": 1.03291833, + "balance_loss_mlp": 1.01626933, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.5443118581036823, + "language_loss": 0.71830457, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.73938084, + "num_input_tokens_seen": 347814320, + "step": 16116, + "time_per_iteration": 2.539306163787842 + }, + { + "auxiliary_loss_clip": 0.01055862, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.03188086, + "balance_loss_mlp": 1.02199161, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 2.150380605030342, + "language_loss": 0.76201808, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.78291553, + "num_input_tokens_seen": 347832125, + "step": 16117, + "time_per_iteration": 2.577376127243042 + }, + { + "auxiliary_loss_clip": 0.01097515, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.03260481, + "balance_loss_mlp": 1.01831961, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 1.8506834521779876, + "language_loss": 0.76652795, + "learning_rate": 1.000997769426548e-08, + "loss": 0.78780234, + "num_input_tokens_seen": 347850765, + "step": 16118, + "time_per_iteration": 2.574392795562744 + }, + { + "auxiliary_loss_clip": 0.01071914, + "auxiliary_loss_mlp": 0.00749392, + "balance_loss_clip": 1.03186321, + "balance_loss_mlp": 1.00029314, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.5732188488439558, + "language_loss": 0.78010702, + "learning_rate": 9.971098618001272e-09, + "loss": 0.79832017, + "num_input_tokens_seen": 347870125, + "step": 16119, + "time_per_iteration": 4.200732469558716 + }, + { + "auxiliary_loss_clip": 0.01043283, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.02780938, + "balance_loss_mlp": 1.02247548, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.4504388441868672, + "language_loss": 0.75635588, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77712107, + "num_input_tokens_seen": 347890615, + "step": 16120, + "time_per_iteration": 2.7171850204467773 + }, + { + "auxiliary_loss_clip": 0.010862, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.03271604, + "balance_loss_mlp": 1.01560569, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.7247827165960006, + "language_loss": 0.69804549, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71916759, + "num_input_tokens_seen": 347908685, + "step": 16121, + "time_per_iteration": 2.6115195751190186 + }, + { + "auxiliary_loss_clip": 0.01084613, + "auxiliary_loss_mlp": 0.01026179, + "balance_loss_clip": 1.03226018, + "balance_loss_mlp": 1.015414, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 1.9479846087116657, + "language_loss": 0.69139087, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71249878, + "num_input_tokens_seen": 347926385, + "step": 16122, + "time_per_iteration": 2.6174721717834473 + }, + { + "auxiliary_loss_clip": 0.01050315, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.02762151, + "balance_loss_mlp": 1.02072406, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 2.094681406597338, + "language_loss": 0.7592119, + "learning_rate": 9.81633694859907e-09, + "loss": 0.78003728, + "num_input_tokens_seen": 347945290, + "step": 16123, + "time_per_iteration": 2.6309545040130615 + }, + { + "auxiliary_loss_clip": 0.01056146, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.02857447, + "balance_loss_mlp": 1.02219188, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.7943100464145811, + "language_loss": 0.74505323, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76596498, + "num_input_tokens_seen": 347966330, + "step": 16124, + "time_per_iteration": 2.6128456592559814 + }, + { + "auxiliary_loss_clip": 0.01085632, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.0322175, + "balance_loss_mlp": 1.01873875, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 2.1034361467027756, + "language_loss": 0.74614888, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76729888, + "num_input_tokens_seen": 347982590, + "step": 16125, + "time_per_iteration": 2.627652883529663 + }, + { + "auxiliary_loss_clip": 0.01013664, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.00388384, + "balance_loss_mlp": 1.00310194, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8904313419482749, + "language_loss": 0.61459923, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63477576, + "num_input_tokens_seen": 348043310, + "step": 16126, + "time_per_iteration": 3.047579526901245 + }, + { + "auxiliary_loss_clip": 0.01086297, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.03499067, + "balance_loss_mlp": 1.02510369, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.6419757809886204, + "language_loss": 0.75068259, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77189803, + "num_input_tokens_seen": 348062200, + "step": 16127, + "time_per_iteration": 2.543062925338745 + }, + { + "auxiliary_loss_clip": 0.01051418, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.03056169, + "balance_loss_mlp": 1.01963222, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.5640777983978387, + "language_loss": 0.69384694, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71467388, + "num_input_tokens_seen": 348080685, + "step": 16128, + "time_per_iteration": 2.6527512073516846 + }, + { + "auxiliary_loss_clip": 0.01063612, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.03279924, + "balance_loss_mlp": 1.02226162, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.5253566142850006, + "language_loss": 0.65366405, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67463315, + "num_input_tokens_seen": 348102500, + "step": 16129, + "time_per_iteration": 2.762164831161499 + }, + { + "auxiliary_loss_clip": 0.01066349, + "auxiliary_loss_mlp": 0.0102743, + "balance_loss_clip": 1.03456402, + "balance_loss_mlp": 1.01637959, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 1.910007235646492, + "language_loss": 0.62805569, + "learning_rate": 9.548409599691166e-09, + "loss": 0.64899349, + "num_input_tokens_seen": 348122515, + "step": 16130, + "time_per_iteration": 2.676786184310913 + }, + { + "auxiliary_loss_clip": 0.01086056, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.0324533, + "balance_loss_mlp": 1.01670647, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 3.4184751785040333, + "language_loss": 0.69851428, + "learning_rate": 9.510436165056867e-09, + "loss": 0.71965474, + "num_input_tokens_seen": 348138775, + "step": 16131, + "time_per_iteration": 2.4960250854492188 + }, + { + "auxiliary_loss_clip": 0.01098186, + "auxiliary_loss_mlp": 0.00749259, + "balance_loss_clip": 1.03304648, + "balance_loss_mlp": 1.00019646, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 2.5155860602875166, + "language_loss": 0.7667771, + "learning_rate": 9.472538209986058e-09, + "loss": 0.7852515, + "num_input_tokens_seen": 348157115, + "step": 16132, + "time_per_iteration": 2.5319907665252686 + }, + { + "auxiliary_loss_clip": 0.01063873, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.03321218, + "balance_loss_mlp": 1.02149844, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 6.707039150180905, + "language_loss": 0.78956676, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81053817, + "num_input_tokens_seen": 348173035, + "step": 16133, + "time_per_iteration": 2.6293258666992188 + }, + { + "auxiliary_loss_clip": 0.0106638, + "auxiliary_loss_mlp": 0.01026283, + "balance_loss_clip": 1.03237247, + "balance_loss_mlp": 1.01650822, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.5764690969443982, + "language_loss": 0.64772481, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66865146, + "num_input_tokens_seen": 348192960, + "step": 16134, + "time_per_iteration": 2.6069681644439697 + }, + { + "auxiliary_loss_clip": 0.0107004, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.02957582, + "balance_loss_mlp": 1.01900196, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 2.1833566627100667, + "language_loss": 0.8088128, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82981485, + "num_input_tokens_seen": 348212805, + "step": 16135, + "time_per_iteration": 2.633025646209717 + }, + { + "auxiliary_loss_clip": 0.01088883, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.03325796, + "balance_loss_mlp": 1.0205431, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 1.8981272824000157, + "language_loss": 0.73027956, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75149322, + "num_input_tokens_seen": 348232900, + "step": 16136, + "time_per_iteration": 2.5872604846954346 + }, + { + "auxiliary_loss_clip": 0.01096815, + "auxiliary_loss_mlp": 0.01028451, + "balance_loss_clip": 1.03445768, + "balance_loss_mlp": 1.01854455, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.7224718459349124, + "language_loss": 0.76180393, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78305656, + "num_input_tokens_seen": 348253065, + "step": 16137, + "time_per_iteration": 4.132816314697266 + }, + { + "auxiliary_loss_clip": 0.0098927, + "auxiliary_loss_mlp": 0.01000462, + "balance_loss_clip": 1.01436496, + "balance_loss_mlp": 0.99930602, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.7902872223346011, + "language_loss": 0.54908973, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56898707, + "num_input_tokens_seen": 348316075, + "step": 16138, + "time_per_iteration": 3.306811571121216 + }, + { + "auxiliary_loss_clip": 0.01074091, + "auxiliary_loss_mlp": 0.01027409, + "balance_loss_clip": 1.03030705, + "balance_loss_mlp": 1.01696038, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 1.9611201332919894, + "language_loss": 0.70839357, + "learning_rate": 9.209366072632007e-09, + "loss": 0.7294085, + "num_input_tokens_seen": 348337605, + "step": 16139, + "time_per_iteration": 2.739424467086792 + }, + { + "auxiliary_loss_clip": 0.01089899, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.0352335, + "balance_loss_mlp": 1.01682568, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 2.0787199567110664, + "language_loss": 0.72439873, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74557924, + "num_input_tokens_seen": 348359430, + "step": 16140, + "time_per_iteration": 2.561612844467163 + }, + { + "auxiliary_loss_clip": 0.010904, + "auxiliary_loss_mlp": 0.00749404, + "balance_loss_clip": 1.03431678, + "balance_loss_mlp": 1.00019288, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.3795042677852902, + "language_loss": 0.68144226, + "learning_rate": 9.13485343089504e-09, + "loss": 0.69984031, + "num_input_tokens_seen": 348377890, + "step": 16141, + "time_per_iteration": 2.505716562271118 + }, + { + "auxiliary_loss_clip": 0.01081445, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.03125572, + "balance_loss_mlp": 1.01814997, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 1.9255863637337973, + "language_loss": 0.68724036, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70833755, + "num_input_tokens_seen": 348396550, + "step": 16142, + "time_per_iteration": 4.074741840362549 + }, + { + "auxiliary_loss_clip": 0.010396, + "auxiliary_loss_mlp": 0.01027957, + "balance_loss_clip": 1.0308609, + "balance_loss_mlp": 1.01698935, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.7370543878506701, + "language_loss": 0.5578838, + "learning_rate": 9.060642764378457e-09, + "loss": 0.5785594, + "num_input_tokens_seen": 348417120, + "step": 16143, + "time_per_iteration": 2.6917152404785156 + }, + { + "auxiliary_loss_clip": 0.01087554, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03391147, + "balance_loss_mlp": 1.02161252, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 3.425686229229677, + "language_loss": 0.67718363, + "learning_rate": 9.023650675347382e-09, + "loss": 0.6983785, + "num_input_tokens_seen": 348437750, + "step": 16144, + "time_per_iteration": 2.6669161319732666 + }, + { + "auxiliary_loss_clip": 0.01084016, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.03267455, + "balance_loss_mlp": 1.02511835, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.937910014533363, + "language_loss": 0.71886492, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74005938, + "num_input_tokens_seen": 348460935, + "step": 16145, + "time_per_iteration": 2.6482391357421875 + }, + { + "auxiliary_loss_clip": 0.01073885, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.03115892, + "balance_loss_mlp": 1.01819158, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 3.264382843399244, + "language_loss": 0.79876983, + "learning_rate": 8.949892992753395e-09, + "loss": 0.81980455, + "num_input_tokens_seen": 348474480, + "step": 16146, + "time_per_iteration": 2.519718647003174 + }, + { + "auxiliary_loss_clip": 0.0099425, + "auxiliary_loss_mlp": 0.00997689, + "balance_loss_clip": 1.00688457, + "balance_loss_mlp": 0.99674171, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.7683560554971265, + "language_loss": 0.54539144, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56531084, + "num_input_tokens_seen": 348541220, + "step": 16147, + "time_per_iteration": 4.7934863567352295 + }, + { + "auxiliary_loss_clip": 0.01057162, + "auxiliary_loss_mlp": 0.00749599, + "balance_loss_clip": 1.02978313, + "balance_loss_mlp": 1.00021958, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 2.5240432598324802, + "language_loss": 0.61332583, + "learning_rate": 8.876437313434682e-09, + "loss": 0.63139343, + "num_input_tokens_seen": 348559230, + "step": 16148, + "time_per_iteration": 2.650998115539551 + }, + { + "auxiliary_loss_clip": 0.01061045, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.03260469, + "balance_loss_mlp": 1.02650881, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.9286534401927367, + "language_loss": 0.73674393, + "learning_rate": 8.839822728487155e-09, + "loss": 0.7577256, + "num_input_tokens_seen": 348577850, + "step": 16149, + "time_per_iteration": 2.6426382064819336 + }, + { + "auxiliary_loss_clip": 0.01083692, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02969384, + "balance_loss_mlp": 1.02369869, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 4.4858802765794215, + "language_loss": 0.74901855, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77020276, + "num_input_tokens_seen": 348598345, + "step": 16150, + "time_per_iteration": 2.7389883995056152 + }, + { + "auxiliary_loss_clip": 0.01081862, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.03470933, + "balance_loss_mlp": 1.01682281, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 2.129017512148437, + "language_loss": 0.74112481, + "learning_rate": 8.766820074958214e-09, + "loss": 0.76224399, + "num_input_tokens_seen": 348616300, + "step": 16151, + "time_per_iteration": 2.6574387550354004 + }, + { + "auxiliary_loss_clip": 0.01083454, + "auxiliary_loss_mlp": 0.01026122, + "balance_loss_clip": 1.03210294, + "balance_loss_mlp": 1.01532745, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 2.0633751150106403, + "language_loss": 0.74704838, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76814413, + "num_input_tokens_seen": 348633845, + "step": 16152, + "time_per_iteration": 2.613007068634033 + }, + { + "auxiliary_loss_clip": 0.01063124, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.03320158, + "balance_loss_mlp": 1.0179615, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 2.0493053530941476, + "language_loss": 0.67250091, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69342101, + "num_input_tokens_seen": 348653070, + "step": 16153, + "time_per_iteration": 2.790492057800293 + }, + { + "auxiliary_loss_clip": 0.01035405, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.03029656, + "balance_loss_mlp": 1.01766896, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.535958631104409, + "language_loss": 0.70587862, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72650653, + "num_input_tokens_seen": 348672145, + "step": 16154, + "time_per_iteration": 2.7169349193573 + }, + { + "auxiliary_loss_clip": 0.01049459, + "auxiliary_loss_mlp": 0.01030503, + "balance_loss_clip": 1.03608227, + "balance_loss_mlp": 1.01804543, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 1.6876696764494266, + "language_loss": 0.81046915, + "learning_rate": 8.621720872059812e-09, + "loss": 0.83126879, + "num_input_tokens_seen": 348690615, + "step": 16155, + "time_per_iteration": 2.7922558784484863 + }, + { + "auxiliary_loss_clip": 0.01087439, + "auxiliary_loss_mlp": 0.00749613, + "balance_loss_clip": 1.03415477, + "balance_loss_mlp": 1.00024545, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 1.948570364711604, + "language_loss": 0.67562604, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69399655, + "num_input_tokens_seen": 348708665, + "step": 16156, + "time_per_iteration": 2.53129243850708 + }, + { + "auxiliary_loss_clip": 0.01086927, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.03176475, + "balance_loss_mlp": 1.02292991, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.629692898881463, + "language_loss": 0.90848553, + "learning_rate": 8.54962434469919e-09, + "loss": 0.92969346, + "num_input_tokens_seen": 348726105, + "step": 16157, + "time_per_iteration": 2.5800981521606445 + }, + { + "auxiliary_loss_clip": 0.01059204, + "auxiliary_loss_mlp": 0.00749291, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 1.00028157, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 1.8770931628842287, + "language_loss": 0.72816455, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74624956, + "num_input_tokens_seen": 348743360, + "step": 16158, + "time_per_iteration": 4.125634670257568 + }, + { + "auxiliary_loss_clip": 0.01047035, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.03196871, + "balance_loss_mlp": 1.02338266, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 1.9534084505341, + "language_loss": 0.60151291, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62232733, + "num_input_tokens_seen": 348759045, + "step": 16159, + "time_per_iteration": 2.6499953269958496 + }, + { + "auxiliary_loss_clip": 0.01092203, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_clip": 1.03267121, + "balance_loss_mlp": 1.01731968, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.788194218926793, + "language_loss": 0.78914881, + "learning_rate": 8.44204592704112e-09, + "loss": 0.81034297, + "num_input_tokens_seen": 348779910, + "step": 16160, + "time_per_iteration": 2.634012222290039 + }, + { + "auxiliary_loss_clip": 0.0102217, + "auxiliary_loss_mlp": 0.00999864, + "balance_loss_clip": 1.00237846, + "balance_loss_mlp": 0.99898833, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7842023839767408, + "language_loss": 0.54269004, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56291032, + "num_input_tokens_seen": 348838995, + "step": 16161, + "time_per_iteration": 3.085731029510498 + }, + { + "auxiliary_loss_clip": 0.0108462, + "auxiliary_loss_mlp": 0.00749071, + "balance_loss_clip": 1.03462243, + "balance_loss_mlp": 1.00019777, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.8146139706761648, + "language_loss": 0.71713954, + "learning_rate": 8.3707045800554e-09, + "loss": 0.73547643, + "num_input_tokens_seen": 348858090, + "step": 16162, + "time_per_iteration": 2.549302816390991 + }, + { + "auxiliary_loss_clip": 0.01059508, + "auxiliary_loss_mlp": 0.01027123, + "balance_loss_clip": 1.02962613, + "balance_loss_mlp": 1.0158813, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.6120010108149658, + "language_loss": 0.78759158, + "learning_rate": 8.335147190060787e-09, + "loss": 0.80845785, + "num_input_tokens_seen": 348877885, + "step": 16163, + "time_per_iteration": 2.6217472553253174 + }, + { + "auxiliary_loss_clip": 0.01072725, + "auxiliary_loss_mlp": 0.01026588, + "balance_loss_clip": 1.03379762, + "balance_loss_mlp": 1.01637197, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 1.7602059161009167, + "language_loss": 0.73093069, + "learning_rate": 8.299665324196903e-09, + "loss": 0.7519238, + "num_input_tokens_seen": 348897720, + "step": 16164, + "time_per_iteration": 2.609502077102661 + }, + { + "auxiliary_loss_clip": 0.01030338, + "auxiliary_loss_mlp": 0.01042861, + "balance_loss_clip": 1.02800107, + "balance_loss_mlp": 1.02927768, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.1551530152357388, + "language_loss": 0.84116852, + "learning_rate": 8.264258983809114e-09, + "loss": 0.86190045, + "num_input_tokens_seen": 348915410, + "step": 16165, + "time_per_iteration": 2.6492905616760254 + }, + { + "auxiliary_loss_clip": 0.01057156, + "auxiliary_loss_mlp": 0.01024804, + "balance_loss_clip": 1.03041041, + "balance_loss_mlp": 1.01558936, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.6942689956415453, + "language_loss": 0.79142201, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81224155, + "num_input_tokens_seen": 348934335, + "step": 16166, + "time_per_iteration": 2.6614129543304443 + }, + { + "auxiliary_loss_clip": 0.01072055, + "auxiliary_loss_mlp": 0.01026025, + "balance_loss_clip": 1.03480268, + "balance_loss_mlp": 1.0156002, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.7595175476308418, + "language_loss": 0.70365292, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72463375, + "num_input_tokens_seen": 348952405, + "step": 16167, + "time_per_iteration": 2.658714771270752 + }, + { + "auxiliary_loss_clip": 0.01068274, + "auxiliary_loss_mlp": 0.01028808, + "balance_loss_clip": 1.03531694, + "balance_loss_mlp": 1.01881862, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.4450475933666254, + "language_loss": 0.75522012, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77619094, + "num_input_tokens_seen": 348973580, + "step": 16168, + "time_per_iteration": 2.609637498855591 + }, + { + "auxiliary_loss_clip": 0.01036105, + "auxiliary_loss_mlp": 0.01047503, + "balance_loss_clip": 1.02919054, + "balance_loss_mlp": 1.0334959, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.535462891235847, + "language_loss": 0.72582597, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74666202, + "num_input_tokens_seen": 348992035, + "step": 16169, + "time_per_iteration": 2.661428928375244 + }, + { + "auxiliary_loss_clip": 0.01059941, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.02904534, + "balance_loss_mlp": 1.01767898, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 1.7990195624839813, + "language_loss": 0.57578886, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59669024, + "num_input_tokens_seen": 349013160, + "step": 16170, + "time_per_iteration": 2.6463654041290283 + }, + { + "auxiliary_loss_clip": 0.01065112, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.03153872, + "balance_loss_mlp": 1.01660013, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.6939817029890358, + "language_loss": 0.71955383, + "learning_rate": 8.053407051471062e-09, + "loss": 0.74048734, + "num_input_tokens_seen": 349033485, + "step": 16171, + "time_per_iteration": 2.5801658630371094 + }, + { + "auxiliary_loss_clip": 0.0105965, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.03080285, + "balance_loss_mlp": 1.02322268, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.6084353388838362, + "language_loss": 0.68233383, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70327103, + "num_input_tokens_seen": 349051705, + "step": 16172, + "time_per_iteration": 2.5780556201934814 + }, + { + "auxiliary_loss_clip": 0.01079028, + "auxiliary_loss_mlp": 0.01026411, + "balance_loss_clip": 1.03116322, + "balance_loss_mlp": 1.01565838, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 2.279589950662394, + "language_loss": 0.8596648, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88071918, + "num_input_tokens_seen": 349070825, + "step": 16173, + "time_per_iteration": 2.5985946655273438 + }, + { + "auxiliary_loss_clip": 0.01040039, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.02726746, + "balance_loss_mlp": 1.01714063, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 2.965046836500776, + "language_loss": 0.6434983, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66420799, + "num_input_tokens_seen": 349089730, + "step": 16174, + "time_per_iteration": 2.61629056930542 + }, + { + "auxiliary_loss_clip": 0.01083192, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.03174758, + "balance_loss_mlp": 1.01762021, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.542764555396235, + "language_loss": 0.77901679, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80012846, + "num_input_tokens_seen": 349111315, + "step": 16175, + "time_per_iteration": 2.57033109664917 + }, + { + "auxiliary_loss_clip": 0.01086885, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.03290153, + "balance_loss_mlp": 1.02097356, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 6.972474534638901, + "language_loss": 0.56969976, + "learning_rate": 7.879774302919307e-09, + "loss": 0.59090012, + "num_input_tokens_seen": 349129495, + "step": 16176, + "time_per_iteration": 2.539174795150757 + }, + { + "auxiliary_loss_clip": 0.01075907, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.03432286, + "balance_loss_mlp": 1.01781344, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.014544502356102, + "language_loss": 0.72321278, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74425256, + "num_input_tokens_seen": 349148850, + "step": 16177, + "time_per_iteration": 4.059208154678345 + }, + { + "auxiliary_loss_clip": 0.01075046, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.03121209, + "balance_loss_mlp": 1.01666367, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.792412050585799, + "language_loss": 0.68621826, + "learning_rate": 7.810849984090984e-09, + "loss": 0.7072413, + "num_input_tokens_seen": 349167620, + "step": 16178, + "time_per_iteration": 2.762122631072998 + }, + { + "auxiliary_loss_clip": 0.01040989, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.02877831, + "balance_loss_mlp": 1.0191015, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 2.1260181413205648, + "language_loss": 0.67343229, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69414598, + "num_input_tokens_seen": 349185845, + "step": 16179, + "time_per_iteration": 2.9366230964660645 + }, + { + "auxiliary_loss_clip": 0.01074233, + "auxiliary_loss_mlp": 0.00749255, + "balance_loss_clip": 1.03406835, + "balance_loss_mlp": 1.00025046, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 4.838317894031954, + "language_loss": 0.76798189, + "learning_rate": 7.742227841308624e-09, + "loss": 0.78621686, + "num_input_tokens_seen": 349204525, + "step": 16180, + "time_per_iteration": 2.6226422786712646 + }, + { + "auxiliary_loss_clip": 0.0108635, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.03153443, + "balance_loss_mlp": 1.02119923, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 6.108435696912855, + "language_loss": 0.76168227, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78286755, + "num_input_tokens_seen": 349228075, + "step": 16181, + "time_per_iteration": 2.6162548065185547 + }, + { + "auxiliary_loss_clip": 0.01096656, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.03358626, + "balance_loss_mlp": 1.02113426, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.6194983034239254, + "language_loss": 0.63673222, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65801656, + "num_input_tokens_seen": 349246990, + "step": 16182, + "time_per_iteration": 3.975081443786621 + }, + { + "auxiliary_loss_clip": 0.01024403, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.0363822, + "balance_loss_mlp": 1.02334499, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 5.423114250119774, + "language_loss": 0.62084782, + "learning_rate": 7.639861229977507e-09, + "loss": 0.6414454, + "num_input_tokens_seen": 349265890, + "step": 16183, + "time_per_iteration": 3.0133869647979736 + }, + { + "auxiliary_loss_clip": 0.01074559, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.03233302, + "balance_loss_mlp": 1.02180564, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.7502538648473291, + "language_loss": 0.78275919, + "learning_rate": 7.605890125470527e-09, + "loss": 0.8038348, + "num_input_tokens_seen": 349285275, + "step": 16184, + "time_per_iteration": 2.8315813541412354 + }, + { + "auxiliary_loss_clip": 0.01053192, + "auxiliary_loss_mlp": 0.01029779, + "balance_loss_clip": 1.02783394, + "balance_loss_mlp": 1.01856112, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.3357294737682053, + "language_loss": 0.79931128, + "learning_rate": 7.571994572747709e-09, + "loss": 0.82014096, + "num_input_tokens_seen": 349301515, + "step": 16185, + "time_per_iteration": 2.606964349746704 + }, + { + "auxiliary_loss_clip": 0.01063619, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.03073347, + "balance_loss_mlp": 1.01818967, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 2.713620359466558, + "language_loss": 0.77932334, + "learning_rate": 7.538174573094469e-09, + "loss": 0.80025041, + "num_input_tokens_seen": 349319590, + "step": 16186, + "time_per_iteration": 2.622171401977539 + }, + { + "auxiliary_loss_clip": 0.01074156, + "auxiliary_loss_mlp": 0.01024149, + "balance_loss_clip": 1.03376269, + "balance_loss_mlp": 1.0136528, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.589353361747046, + "language_loss": 0.65436202, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67534506, + "num_input_tokens_seen": 349339230, + "step": 16187, + "time_per_iteration": 4.129780292510986 + }, + { + "auxiliary_loss_clip": 0.01065877, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.02886438, + "balance_loss_mlp": 1.02441752, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.7386412590286495, + "language_loss": 0.80576193, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82678246, + "num_input_tokens_seen": 349361155, + "step": 16188, + "time_per_iteration": 2.6843011379241943 + }, + { + "auxiliary_loss_clip": 0.01058883, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.03028584, + "balance_loss_mlp": 1.01680136, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.7120681110155265, + "language_loss": 0.77891421, + "learning_rate": 7.437167905363084e-09, + "loss": 0.79977179, + "num_input_tokens_seen": 349379335, + "step": 16189, + "time_per_iteration": 2.686610221862793 + }, + { + "auxiliary_loss_clip": 0.01078739, + "auxiliary_loss_mlp": 0.01029628, + "balance_loss_clip": 1.03067529, + "balance_loss_mlp": 1.01835084, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.7481137049238602, + "language_loss": 0.50756592, + "learning_rate": 7.403650130784367e-09, + "loss": 0.52864957, + "num_input_tokens_seen": 349401575, + "step": 16190, + "time_per_iteration": 2.7963550090789795 + }, + { + "auxiliary_loss_clip": 0.01086846, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.03379154, + "balance_loss_mlp": 1.01530874, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 2.0707835999011763, + "language_loss": 0.80945194, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83058184, + "num_input_tokens_seen": 349420650, + "step": 16191, + "time_per_iteration": 2.567591667175293 + }, + { + "auxiliary_loss_clip": 0.01078839, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.03020906, + "balance_loss_mlp": 1.01793766, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.7571160003277198, + "language_loss": 0.82959133, + "learning_rate": 7.336841261255111e-09, + "loss": 0.85066152, + "num_input_tokens_seen": 349436830, + "step": 16192, + "time_per_iteration": 2.538712978363037 + }, + { + "auxiliary_loss_clip": 0.01034486, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.03323507, + "balance_loss_mlp": 1.02252674, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.7464240687691244, + "language_loss": 0.74805176, + "learning_rate": 7.303550168837658e-09, + "loss": 0.76873851, + "num_input_tokens_seen": 349454325, + "step": 16193, + "time_per_iteration": 2.7165768146514893 + }, + { + "auxiliary_loss_clip": 0.01068875, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.03211737, + "balance_loss_mlp": 1.01921737, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 2.6796966366426433, + "language_loss": 0.85020685, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87118602, + "num_input_tokens_seen": 349470230, + "step": 16194, + "time_per_iteration": 2.6661875247955322 + }, + { + "auxiliary_loss_clip": 0.0105631, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.03068614, + "balance_loss_mlp": 1.02297473, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.5554124737533925, + "language_loss": 0.75834286, + "learning_rate": 7.237194675009828e-09, + "loss": 0.77924281, + "num_input_tokens_seen": 349486250, + "step": 16195, + "time_per_iteration": 2.595916986465454 + }, + { + "auxiliary_loss_clip": 0.01001025, + "auxiliary_loss_mlp": 0.01001923, + "balance_loss_clip": 1.00943327, + "balance_loss_mlp": 1.00107038, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7075395840225317, + "language_loss": 0.52467889, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54470837, + "num_input_tokens_seen": 349545865, + "step": 16196, + "time_per_iteration": 3.101112127304077 + }, + { + "auxiliary_loss_clip": 0.01073606, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.03383493, + "balance_loss_mlp": 1.01702642, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 1.5923011598779413, + "language_loss": 0.76372612, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78473842, + "num_input_tokens_seen": 349566080, + "step": 16197, + "time_per_iteration": 4.285274267196655 + }, + { + "auxiliary_loss_clip": 0.01099304, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.03298843, + "balance_loss_mlp": 1.01627421, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 1.8636885597516926, + "language_loss": 0.67978942, + "learning_rate": 7.13822818063492e-09, + "loss": 0.70105487, + "num_input_tokens_seen": 349585665, + "step": 16198, + "time_per_iteration": 2.6352601051330566 + }, + { + "auxiliary_loss_clip": 0.01095766, + "auxiliary_loss_mlp": 0.0102689, + "balance_loss_clip": 1.03202653, + "balance_loss_mlp": 1.01517749, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.7896879805417467, + "language_loss": 0.77580273, + "learning_rate": 7.10539048654768e-09, + "loss": 0.79702926, + "num_input_tokens_seen": 349605125, + "step": 16199, + "time_per_iteration": 2.628673553466797 + }, + { + "auxiliary_loss_clip": 0.01074018, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.03381526, + "balance_loss_mlp": 1.02092683, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 1.702381264409414, + "language_loss": 0.79490501, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81596196, + "num_input_tokens_seen": 349623360, + "step": 16200, + "time_per_iteration": 2.582662343978882 + }, + { + "auxiliary_loss_clip": 0.0104629, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.03372884, + "balance_loss_mlp": 1.0215745, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.0089874133927235, + "language_loss": 0.68937504, + "learning_rate": 7.039941811905592e-09, + "loss": 0.7101686, + "num_input_tokens_seen": 349644390, + "step": 16201, + "time_per_iteration": 2.8454368114471436 + }, + { + "auxiliary_loss_clip": 0.01063704, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.02993894, + "balance_loss_mlp": 1.01954961, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 5.3927610645663915, + "language_loss": 0.72771311, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.7486515, + "num_input_tokens_seen": 349663200, + "step": 16202, + "time_per_iteration": 2.6738312244415283 + }, + { + "auxiliary_loss_clip": 0.01074491, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.03254414, + "balance_loss_mlp": 1.01784873, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.715141323991782, + "language_loss": 0.72793961, + "learning_rate": 6.974795430241265e-09, + "loss": 0.74897939, + "num_input_tokens_seen": 349681975, + "step": 16203, + "time_per_iteration": 2.5855493545532227 + }, + { + "auxiliary_loss_clip": 0.01097444, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.0333066, + "balance_loss_mlp": 1.02126956, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.7716898186286243, + "language_loss": 0.77423, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79552674, + "num_input_tokens_seen": 349701185, + "step": 16204, + "time_per_iteration": 2.502258062362671 + }, + { + "auxiliary_loss_clip": 0.01079559, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.03434229, + "balance_loss_mlp": 1.02132893, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 1.902218544975718, + "language_loss": 0.79385072, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81497741, + "num_input_tokens_seen": 349720360, + "step": 16205, + "time_per_iteration": 2.672224760055542 + }, + { + "auxiliary_loss_clip": 0.0109576, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.03337979, + "balance_loss_mlp": 1.01959562, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.642654707825353, + "language_loss": 0.7449435, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76620483, + "num_input_tokens_seen": 349741040, + "step": 16206, + "time_per_iteration": 2.6157190799713135 + }, + { + "auxiliary_loss_clip": 0.01040991, + "auxiliary_loss_mlp": 0.01028648, + "balance_loss_clip": 1.03174925, + "balance_loss_mlp": 1.01731086, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.328695182146863, + "language_loss": 0.8395505, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.8602469, + "num_input_tokens_seen": 349758895, + "step": 16207, + "time_per_iteration": 2.7291929721832275 + }, + { + "auxiliary_loss_clip": 0.01085111, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.03263509, + "balance_loss_mlp": 1.02000999, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.602998107864985, + "language_loss": 0.70842886, + "learning_rate": 6.813252072591425e-09, + "loss": 0.72958577, + "num_input_tokens_seen": 349779740, + "step": 16208, + "time_per_iteration": 2.656049966812134 + }, + { + "auxiliary_loss_clip": 0.01056149, + "auxiliary_loss_mlp": 0.0102457, + "balance_loss_clip": 1.03004277, + "balance_loss_mlp": 1.01521254, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 3.1831318111297535, + "language_loss": 0.7719413, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79274857, + "num_input_tokens_seen": 349796820, + "step": 16209, + "time_per_iteration": 2.5760836601257324 + }, + { + "auxiliary_loss_clip": 0.01056902, + "auxiliary_loss_mlp": 0.00749668, + "balance_loss_clip": 1.02887309, + "balance_loss_mlp": 1.00027394, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.820613934385211, + "language_loss": 0.79262286, + "learning_rate": 6.749163793864144e-09, + "loss": 0.81068861, + "num_input_tokens_seen": 349816550, + "step": 16210, + "time_per_iteration": 2.6385552883148193 + }, + { + "auxiliary_loss_clip": 0.01068264, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.03076553, + "balance_loss_mlp": 1.02063513, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.6428474974074327, + "language_loss": 0.77934307, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80034578, + "num_input_tokens_seen": 349834350, + "step": 16211, + "time_per_iteration": 2.5861339569091797 + }, + { + "auxiliary_loss_clip": 0.01061695, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.03137302, + "balance_loss_mlp": 1.01805735, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.0360603360841645, + "language_loss": 0.78403497, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80495214, + "num_input_tokens_seen": 349853460, + "step": 16212, + "time_per_iteration": 2.6641855239868164 + }, + { + "auxiliary_loss_clip": 0.0106346, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.03172755, + "balance_loss_mlp": 1.01885641, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.4944724367128617, + "language_loss": 0.8009569, + "learning_rate": 6.653598260829118e-09, + "loss": 0.8218928, + "num_input_tokens_seen": 349874830, + "step": 16213, + "time_per_iteration": 2.698568105697632 + }, + { + "auxiliary_loss_clip": 0.01051058, + "auxiliary_loss_mlp": 0.01025432, + "balance_loss_clip": 1.02972376, + "balance_loss_mlp": 1.01442266, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 3.0239083863125478, + "language_loss": 0.6606586, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68142349, + "num_input_tokens_seen": 349893690, + "step": 16214, + "time_per_iteration": 2.6754679679870605 + }, + { + "auxiliary_loss_clip": 0.01084272, + "auxiliary_loss_mlp": 0.01027487, + "balance_loss_clip": 1.03808212, + "balance_loss_mlp": 1.01587582, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.5462171085770735, + "language_loss": 0.74120998, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76232755, + "num_input_tokens_seen": 349912480, + "step": 16215, + "time_per_iteration": 2.585998773574829 + }, + { + "auxiliary_loss_clip": 0.01044113, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03037739, + "balance_loss_mlp": 1.01944184, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.659960073612748, + "language_loss": 0.67297131, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69371802, + "num_input_tokens_seen": 349932470, + "step": 16216, + "time_per_iteration": 4.296860694885254 + }, + { + "auxiliary_loss_clip": 0.01036683, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.0273174, + "balance_loss_mlp": 1.01799095, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 2.0626745458999274, + "language_loss": 0.71828866, + "learning_rate": 6.527235786226937e-09, + "loss": 0.73895669, + "num_input_tokens_seen": 349949060, + "step": 16217, + "time_per_iteration": 2.6847994327545166 + }, + { + "auxiliary_loss_clip": 0.01063805, + "auxiliary_loss_mlp": 0.01025439, + "balance_loss_clip": 1.03317785, + "balance_loss_mlp": 1.01438856, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.53579224685382, + "language_loss": 0.78361368, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80450606, + "num_input_tokens_seen": 349968010, + "step": 16218, + "time_per_iteration": 2.640259265899658 + }, + { + "auxiliary_loss_clip": 0.01068987, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.03151667, + "balance_loss_mlp": 1.01867294, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.2755072663266542, + "language_loss": 0.77162474, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79261088, + "num_input_tokens_seen": 349985270, + "step": 16219, + "time_per_iteration": 2.6105995178222656 + }, + { + "auxiliary_loss_clip": 0.01077302, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.033144, + "balance_loss_mlp": 1.02177024, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.9126240330696502, + "language_loss": 0.81331456, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83441472, + "num_input_tokens_seen": 350003935, + "step": 16220, + "time_per_iteration": 2.601850748062134 + }, + { + "auxiliary_loss_clip": 0.01094761, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.03249073, + "balance_loss_mlp": 1.01921511, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 2.0650659238157796, + "language_loss": 0.75309318, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77433705, + "num_input_tokens_seen": 350023595, + "step": 16221, + "time_per_iteration": 2.5200164318084717 + }, + { + "auxiliary_loss_clip": 0.01059553, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.02949572, + "balance_loss_mlp": 1.01908267, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.76071089268571, + "language_loss": 0.66433871, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68523365, + "num_input_tokens_seen": 350045920, + "step": 16222, + "time_per_iteration": 4.156033992767334 + }, + { + "auxiliary_loss_clip": 0.01083943, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.031847, + "balance_loss_mlp": 1.01693416, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 3.4570035918377644, + "language_loss": 0.88418257, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.9052968, + "num_input_tokens_seen": 350063925, + "step": 16223, + "time_per_iteration": 2.639431953430176 + }, + { + "auxiliary_loss_clip": 0.0103365, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.02931774, + "balance_loss_mlp": 1.0228622, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.7750678141920189, + "language_loss": 0.74644887, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76712036, + "num_input_tokens_seen": 350080900, + "step": 16224, + "time_per_iteration": 2.6625828742980957 + }, + { + "auxiliary_loss_clip": 0.01003601, + "auxiliary_loss_mlp": 0.01004251, + "balance_loss_clip": 1.00352323, + "balance_loss_mlp": 1.00330305, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.816549726533117, + "language_loss": 0.59270674, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61278522, + "num_input_tokens_seen": 350144550, + "step": 16225, + "time_per_iteration": 3.0820000171661377 + }, + { + "auxiliary_loss_clip": 0.01063709, + "auxiliary_loss_mlp": 0.00749205, + "balance_loss_clip": 1.03292704, + "balance_loss_mlp": 1.00026226, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 2.068027349896723, + "language_loss": 0.6918515, + "learning_rate": 6.247342505960818e-09, + "loss": 0.70998061, + "num_input_tokens_seen": 350164050, + "step": 16226, + "time_per_iteration": 2.6768219470977783 + }, + { + "auxiliary_loss_clip": 0.01080773, + "auxiliary_loss_mlp": 0.01038307, + "balance_loss_clip": 1.03198981, + "balance_loss_mlp": 1.02707791, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.8282311501155375, + "language_loss": 0.82610196, + "learning_rate": 6.216621253462894e-09, + "loss": 0.84729278, + "num_input_tokens_seen": 350181350, + "step": 16227, + "time_per_iteration": 3.9615166187286377 + }, + { + "auxiliary_loss_clip": 0.0109611, + "auxiliary_loss_mlp": 0.01024263, + "balance_loss_clip": 1.03377223, + "balance_loss_mlp": 1.01368308, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.6929920458219452, + "language_loss": 0.77320647, + "learning_rate": 6.185975605430549e-09, + "loss": 0.79441023, + "num_input_tokens_seen": 350199765, + "step": 16228, + "time_per_iteration": 2.4817559719085693 + }, + { + "auxiliary_loss_clip": 0.01014502, + "auxiliary_loss_mlp": 0.01001015, + "balance_loss_clip": 1.00404978, + "balance_loss_mlp": 0.99999583, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8403013247826016, + "language_loss": 0.55831468, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57846987, + "num_input_tokens_seen": 350256420, + "step": 16229, + "time_per_iteration": 2.989933490753174 + }, + { + "auxiliary_loss_clip": 0.010852, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.03208971, + "balance_loss_mlp": 1.01971304, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 2.0178938598952985, + "language_loss": 0.74738348, + "learning_rate": 6.124911127407984e-09, + "loss": 0.76854753, + "num_input_tokens_seen": 350276270, + "step": 16230, + "time_per_iteration": 2.5352258682250977 + }, + { + "auxiliary_loss_clip": 0.01069647, + "auxiliary_loss_mlp": 0.01026123, + "balance_loss_clip": 1.03183472, + "balance_loss_mlp": 1.01631212, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 2.394262166303447, + "language_loss": 0.7176764, + "learning_rate": 6.094492299733245e-09, + "loss": 0.73863411, + "num_input_tokens_seen": 350295000, + "step": 16231, + "time_per_iteration": 2.5300753116607666 + }, + { + "auxiliary_loss_clip": 0.01078669, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.0347687, + "balance_loss_mlp": 1.01908994, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 2.1771767432516054, + "language_loss": 0.76688182, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78797269, + "num_input_tokens_seen": 350314980, + "step": 16232, + "time_per_iteration": 2.5630640983581543 + }, + { + "auxiliary_loss_clip": 0.01003766, + "auxiliary_loss_mlp": 0.01004628, + "balance_loss_clip": 1.00620127, + "balance_loss_mlp": 1.00365067, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7413818956058891, + "language_loss": 0.53746665, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55755067, + "num_input_tokens_seen": 350371985, + "step": 16233, + "time_per_iteration": 2.9933323860168457 + }, + { + "auxiliary_loss_clip": 0.01096421, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.03200197, + "balance_loss_mlp": 1.02136016, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.912268076341163, + "language_loss": 0.71407437, + "learning_rate": 6.003689475888807e-09, + "loss": 0.7353633, + "num_input_tokens_seen": 350390590, + "step": 16234, + "time_per_iteration": 2.478775978088379 + }, + { + "auxiliary_loss_clip": 0.01088209, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.03300035, + "balance_loss_mlp": 1.01777112, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.727306965129782, + "language_loss": 0.78757894, + "learning_rate": 5.973573091493156e-09, + "loss": 0.80875671, + "num_input_tokens_seen": 350403770, + "step": 16235, + "time_per_iteration": 2.6821532249450684 + }, + { + "auxiliary_loss_clip": 0.01078424, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.03327012, + "balance_loss_mlp": 1.02114117, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.8825626430582791, + "language_loss": 0.7717545, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79287183, + "num_input_tokens_seen": 350421870, + "step": 16236, + "time_per_iteration": 4.002067565917969 + }, + { + "auxiliary_loss_clip": 0.01083591, + "auxiliary_loss_mlp": 0.01029585, + "balance_loss_clip": 1.03096211, + "balance_loss_mlp": 1.01914275, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.7341098228255536, + "language_loss": 0.75562382, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77675557, + "num_input_tokens_seen": 350440025, + "step": 16237, + "time_per_iteration": 2.514317274093628 + }, + { + "auxiliary_loss_clip": 0.01047243, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.0280695, + "balance_loss_mlp": 1.02024901, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.6701368605132465, + "language_loss": 0.73039103, + "learning_rate": 5.8836776249509e-09, + "loss": 0.75119197, + "num_input_tokens_seen": 350459435, + "step": 16238, + "time_per_iteration": 2.6973049640655518 + }, + { + "auxiliary_loss_clip": 0.01076865, + "auxiliary_loss_mlp": 0.00749388, + "balance_loss_clip": 1.03372192, + "balance_loss_mlp": 1.00021887, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.0897916130667724, + "language_loss": 0.83282238, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85108495, + "num_input_tokens_seen": 350472655, + "step": 16239, + "time_per_iteration": 2.5958876609802246 + }, + { + "auxiliary_loss_clip": 0.01050825, + "auxiliary_loss_mlp": 0.01036615, + "balance_loss_clip": 1.0295012, + "balance_loss_mlp": 1.0225786, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 2.4552324867122413, + "language_loss": 0.60126126, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62213564, + "num_input_tokens_seen": 350488160, + "step": 16240, + "time_per_iteration": 2.6121389865875244 + }, + { + "auxiliary_loss_clip": 0.01055561, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.0313673, + "balance_loss_mlp": 1.01914299, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 2.0338245516780447, + "language_loss": 0.82539678, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84625077, + "num_input_tokens_seen": 350506065, + "step": 16241, + "time_per_iteration": 2.6258420944213867 + }, + { + "auxiliary_loss_clip": 0.01055319, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.03071392, + "balance_loss_mlp": 1.02542102, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 1.7698810710130657, + "language_loss": 0.83145446, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85236526, + "num_input_tokens_seen": 350524495, + "step": 16242, + "time_per_iteration": 2.6098015308380127 + }, + { + "auxiliary_loss_clip": 0.01086637, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.03422165, + "balance_loss_mlp": 1.01885426, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.5019023135244298, + "language_loss": 0.75237262, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77353787, + "num_input_tokens_seen": 350544185, + "step": 16243, + "time_per_iteration": 2.578434944152832 + }, + { + "auxiliary_loss_clip": 0.01082929, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.03130269, + "balance_loss_mlp": 1.02331114, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.557433383810608, + "language_loss": 0.70073295, + "learning_rate": 5.705928383713754e-09, + "loss": 0.7219125, + "num_input_tokens_seen": 350562675, + "step": 16244, + "time_per_iteration": 2.553410530090332 + }, + { + "auxiliary_loss_clip": 0.01078627, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.03582418, + "balance_loss_mlp": 1.01883054, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.953015707841235, + "language_loss": 0.83772039, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85880983, + "num_input_tokens_seen": 350581535, + "step": 16245, + "time_per_iteration": 2.6877100467681885 + }, + { + "auxiliary_loss_clip": 0.0104285, + "auxiliary_loss_mlp": 0.010252, + "balance_loss_clip": 1.02957022, + "balance_loss_mlp": 1.01488829, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.4201638933229008, + "language_loss": 0.78538811, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80606854, + "num_input_tokens_seen": 350601615, + "step": 16246, + "time_per_iteration": 2.742814064025879 + }, + { + "auxiliary_loss_clip": 0.01090557, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.03253245, + "balance_loss_mlp": 1.02185845, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.5109003278594237, + "language_loss": 0.73926407, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76048374, + "num_input_tokens_seen": 350619580, + "step": 16247, + "time_per_iteration": 2.696606397628784 + }, + { + "auxiliary_loss_clip": 0.01035544, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.029405, + "balance_loss_mlp": 1.01924431, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.8060922350818522, + "language_loss": 0.80142665, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82208848, + "num_input_tokens_seen": 350640015, + "step": 16248, + "time_per_iteration": 2.752609968185425 + }, + { + "auxiliary_loss_clip": 0.01036896, + "auxiliary_loss_mlp": 0.01045561, + "balance_loss_clip": 1.03398538, + "balance_loss_mlp": 1.03204918, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 1.8739160904659682, + "language_loss": 0.79250395, + "learning_rate": 5.559883660954278e-09, + "loss": 0.8133285, + "num_input_tokens_seen": 350659155, + "step": 16249, + "time_per_iteration": 2.778151750564575 + }, + { + "auxiliary_loss_clip": 0.01083655, + "auxiliary_loss_mlp": 0.01028864, + "balance_loss_clip": 1.03351521, + "balance_loss_mlp": 1.01800382, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 2.0240068483658504, + "language_loss": 0.66472638, + "learning_rate": 5.530901600093507e-09, + "loss": 0.68585151, + "num_input_tokens_seen": 350676615, + "step": 16250, + "time_per_iteration": 2.5540411472320557 + }, + { + "auxiliary_loss_clip": 0.01022219, + "auxiliary_loss_mlp": 0.0100203, + "balance_loss_clip": 1.00250626, + "balance_loss_mlp": 1.00117195, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7780369961181486, + "language_loss": 0.59847528, + "learning_rate": 5.501995169700846e-09, + "loss": 0.61871779, + "num_input_tokens_seen": 350736805, + "step": 16251, + "time_per_iteration": 3.1369357109069824 + }, + { + "auxiliary_loss_clip": 0.01078775, + "auxiliary_loss_mlp": 0.01028164, + "balance_loss_clip": 1.03048396, + "balance_loss_mlp": 1.01713109, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.9852313654597682, + "language_loss": 0.78525484, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80632424, + "num_input_tokens_seen": 350753600, + "step": 16252, + "time_per_iteration": 2.5273075103759766 + }, + { + "auxiliary_loss_clip": 0.01076877, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.03122509, + "balance_loss_mlp": 1.02309096, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.529926393032907, + "language_loss": 0.64652073, + "learning_rate": 5.444409204701461e-09, + "loss": 0.66763723, + "num_input_tokens_seen": 350771225, + "step": 16253, + "time_per_iteration": 2.5654726028442383 + }, + { + "auxiliary_loss_clip": 0.0109064, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.03582144, + "balance_loss_mlp": 1.01738548, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.2173701876534597, + "language_loss": 0.76460385, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78580773, + "num_input_tokens_seen": 350789100, + "step": 16254, + "time_per_iteration": 2.4983584880828857 + }, + { + "auxiliary_loss_clip": 0.01088785, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.03274083, + "balance_loss_mlp": 1.01907706, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.8661897905065508, + "language_loss": 0.6419512, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66314197, + "num_input_tokens_seen": 350811085, + "step": 16255, + "time_per_iteration": 2.69038462638855 + }, + { + "auxiliary_loss_clip": 0.01069966, + "auxiliary_loss_mlp": 0.00749511, + "balance_loss_clip": 1.03364778, + "balance_loss_mlp": 1.00020576, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.8740492706235516, + "language_loss": 0.75784576, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77604055, + "num_input_tokens_seen": 350831065, + "step": 16256, + "time_per_iteration": 4.23907995223999 + }, + { + "auxiliary_loss_clip": 0.01096041, + "auxiliary_loss_mlp": 0.01032548, + "balance_loss_clip": 1.03471756, + "balance_loss_mlp": 1.02074575, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 2.1079261135503256, + "language_loss": 0.78469729, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80598319, + "num_input_tokens_seen": 350849675, + "step": 16257, + "time_per_iteration": 2.5222675800323486 + }, + { + "auxiliary_loss_clip": 0.01082074, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.03332067, + "balance_loss_mlp": 1.02101231, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 2.1877268436176283, + "language_loss": 0.75476187, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77590418, + "num_input_tokens_seen": 350868955, + "step": 16258, + "time_per_iteration": 2.6738486289978027 + }, + { + "auxiliary_loss_clip": 0.01014004, + "auxiliary_loss_mlp": 0.0099849, + "balance_loss_clip": 1.00397968, + "balance_loss_mlp": 0.99759036, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6943362601263435, + "language_loss": 0.59754586, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61767077, + "num_input_tokens_seen": 350935110, + "step": 16259, + "time_per_iteration": 3.146514654159546 + }, + { + "auxiliary_loss_clip": 0.01080952, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.03427601, + "balance_loss_mlp": 1.01962256, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.8764384924574589, + "language_loss": 0.73604429, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75716436, + "num_input_tokens_seen": 350953220, + "step": 16260, + "time_per_iteration": 2.627868890762329 + }, + { + "auxiliary_loss_clip": 0.01087018, + "auxiliary_loss_mlp": 0.01030158, + "balance_loss_clip": 1.03265524, + "balance_loss_mlp": 1.01875579, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.9915100228032552, + "language_loss": 0.79554725, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81671906, + "num_input_tokens_seen": 350971915, + "step": 16261, + "time_per_iteration": 2.550724744796753 + }, + { + "auxiliary_loss_clip": 0.01088074, + "auxiliary_loss_mlp": 0.01022848, + "balance_loss_clip": 1.03359151, + "balance_loss_mlp": 1.01165986, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.769313453468292, + "language_loss": 0.7422657, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76337498, + "num_input_tokens_seen": 350990470, + "step": 16262, + "time_per_iteration": 4.070280075073242 + }, + { + "auxiliary_loss_clip": 0.01087373, + "auxiliary_loss_mlp": 0.01026603, + "balance_loss_clip": 1.03319764, + "balance_loss_mlp": 1.01474714, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.2862897185919517, + "language_loss": 0.69744325, + "learning_rate": 5.16101757762133e-09, + "loss": 0.71858299, + "num_input_tokens_seen": 351010755, + "step": 16263, + "time_per_iteration": 2.6137020587921143 + }, + { + "auxiliary_loss_clip": 0.01088349, + "auxiliary_loss_mlp": 0.01027479, + "balance_loss_clip": 1.0343188, + "balance_loss_mlp": 1.01675606, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 2.3923472609364964, + "language_loss": 0.66567254, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68683082, + "num_input_tokens_seen": 351029965, + "step": 16264, + "time_per_iteration": 2.5411038398742676 + }, + { + "auxiliary_loss_clip": 0.01051356, + "auxiliary_loss_mlp": 0.0102793, + "balance_loss_clip": 1.03432465, + "balance_loss_mlp": 1.015872, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.975076988086399, + "language_loss": 0.73261189, + "learning_rate": 5.105246951967679e-09, + "loss": 0.7534048, + "num_input_tokens_seen": 351046205, + "step": 16265, + "time_per_iteration": 2.6120526790618896 + }, + { + "auxiliary_loss_clip": 0.01084461, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.0325197, + "balance_loss_mlp": 1.02047276, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.9437282141638075, + "language_loss": 0.68651098, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70766795, + "num_input_tokens_seen": 351065390, + "step": 16266, + "time_per_iteration": 4.105881214141846 + }, + { + "auxiliary_loss_clip": 0.01046127, + "auxiliary_loss_mlp": 0.01024728, + "balance_loss_clip": 1.02940798, + "balance_loss_mlp": 1.01492894, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 4.5104076594608316, + "language_loss": 0.868384, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88909256, + "num_input_tokens_seen": 351084355, + "step": 16267, + "time_per_iteration": 2.665400743484497 + }, + { + "auxiliary_loss_clip": 0.01030239, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.03031278, + "balance_loss_mlp": 1.01427436, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 16.78058400280677, + "language_loss": 0.69989753, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72045815, + "num_input_tokens_seen": 351105870, + "step": 16268, + "time_per_iteration": 2.764448642730713 + }, + { + "auxiliary_loss_clip": 0.01072968, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.03106236, + "balance_loss_mlp": 1.01675832, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.677102675055389, + "language_loss": 0.74162608, + "learning_rate": 4.994613468372711e-09, + "loss": 0.76262873, + "num_input_tokens_seen": 351124760, + "step": 16269, + "time_per_iteration": 2.5938284397125244 + }, + { + "auxiliary_loss_clip": 0.01070577, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.03278041, + "balance_loss_mlp": 1.01786351, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.680671009722423, + "language_loss": 0.7107386, + "learning_rate": 4.967144221869501e-09, + "loss": 0.73174858, + "num_input_tokens_seen": 351142820, + "step": 16270, + "time_per_iteration": 2.6618282794952393 + }, + { + "auxiliary_loss_clip": 0.01100025, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.03553677, + "balance_loss_mlp": 1.02378583, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.6237756056505954, + "language_loss": 0.64078128, + "learning_rate": 4.939750627212191e-09, + "loss": 0.66212821, + "num_input_tokens_seen": 351164805, + "step": 16271, + "time_per_iteration": 2.6503357887268066 + }, + { + "auxiliary_loss_clip": 0.01071269, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.03361881, + "balance_loss_mlp": 1.0177635, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.479556835346018, + "language_loss": 0.70547849, + "learning_rate": 4.912432685439505e-09, + "loss": 0.7264784, + "num_input_tokens_seen": 351187005, + "step": 16272, + "time_per_iteration": 2.698948383331299 + }, + { + "auxiliary_loss_clip": 0.01041318, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.03422821, + "balance_loss_mlp": 1.01954353, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 1.802626749702794, + "language_loss": 0.66924274, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68996096, + "num_input_tokens_seen": 351208450, + "step": 16273, + "time_per_iteration": 2.7133820056915283 + }, + { + "auxiliary_loss_clip": 0.01064393, + "auxiliary_loss_mlp": 0.01021583, + "balance_loss_clip": 1.03152966, + "balance_loss_mlp": 1.01007915, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.6172334614657966, + "language_loss": 0.73768902, + "learning_rate": 4.85802376468869e-09, + "loss": 0.75854874, + "num_input_tokens_seen": 351229585, + "step": 16274, + "time_per_iteration": 2.6461782455444336 + }, + { + "auxiliary_loss_clip": 0.01070712, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.0330205, + "balance_loss_mlp": 1.01746821, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.6349661046322481, + "language_loss": 0.78026903, + "learning_rate": 4.830932787773579e-09, + "loss": 0.80125654, + "num_input_tokens_seen": 351249525, + "step": 16275, + "time_per_iteration": 2.6445987224578857 + }, + { + "auxiliary_loss_clip": 0.01024833, + "auxiliary_loss_mlp": 0.0102619, + "balance_loss_clip": 1.03325939, + "balance_loss_mlp": 1.01437068, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.4798502549342776, + "language_loss": 0.7056579, + "learning_rate": 4.803917467869567e-09, + "loss": 0.7261681, + "num_input_tokens_seen": 351272530, + "step": 16276, + "time_per_iteration": 2.8779313564300537 + }, + { + "auxiliary_loss_clip": 0.01063596, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.0297271, + "balance_loss_mlp": 1.01794446, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 1.868269213015591, + "language_loss": 0.85440588, + "learning_rate": 4.776977806000726e-09, + "loss": 0.87532675, + "num_input_tokens_seen": 351288530, + "step": 16277, + "time_per_iteration": 4.092678070068359 + }, + { + "auxiliary_loss_clip": 0.01076633, + "auxiliary_loss_mlp": 0.01025142, + "balance_loss_clip": 1.03126097, + "balance_loss_mlp": 1.01369786, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 2.1083740541168474, + "language_loss": 0.71178436, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73280209, + "num_input_tokens_seen": 351305890, + "step": 16278, + "time_per_iteration": 2.555551052093506 + }, + { + "auxiliary_loss_clip": 0.01075506, + "auxiliary_loss_mlp": 0.01026519, + "balance_loss_clip": 1.03083742, + "balance_loss_mlp": 1.01493216, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 2.0432812549130017, + "language_loss": 0.84264672, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86366689, + "num_input_tokens_seen": 351325010, + "step": 16279, + "time_per_iteration": 2.5533254146575928 + }, + { + "auxiliary_loss_clip": 0.01084387, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.0310576, + "balance_loss_mlp": 1.01873446, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 21.969955145203876, + "language_loss": 0.79189873, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81304681, + "num_input_tokens_seen": 351343060, + "step": 16280, + "time_per_iteration": 2.556194305419922 + }, + { + "auxiliary_loss_clip": 0.0105317, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.0318346, + "balance_loss_mlp": 1.02306545, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.6321168242211175, + "language_loss": 0.79591846, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81678039, + "num_input_tokens_seen": 351363260, + "step": 16281, + "time_per_iteration": 2.7424890995025635 + }, + { + "auxiliary_loss_clip": 0.0108543, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.03360295, + "balance_loss_mlp": 1.01837659, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.6233232624491563, + "language_loss": 0.801364, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82252014, + "num_input_tokens_seen": 351382610, + "step": 16282, + "time_per_iteration": 2.5994503498077393 + }, + { + "auxiliary_loss_clip": 0.01076999, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.03346598, + "balance_loss_mlp": 1.0243665, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 1.943090049789214, + "language_loss": 0.83412862, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85525179, + "num_input_tokens_seen": 351401075, + "step": 16283, + "time_per_iteration": 2.5978522300720215 + }, + { + "auxiliary_loss_clip": 0.01081112, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.03324044, + "balance_loss_mlp": 1.02270174, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.728203300584127, + "language_loss": 0.7197206, + "learning_rate": 4.590518683360134e-09, + "loss": 0.74086821, + "num_input_tokens_seen": 351419275, + "step": 16284, + "time_per_iteration": 2.5218493938446045 + }, + { + "auxiliary_loss_clip": 0.01084486, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.03373098, + "balance_loss_mlp": 1.02505612, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 2.448741449634803, + "language_loss": 0.64320898, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66440547, + "num_input_tokens_seen": 351437375, + "step": 16285, + "time_per_iteration": 2.505923271179199 + }, + { + "auxiliary_loss_clip": 0.01072475, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.03299856, + "balance_loss_mlp": 1.01765847, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.9864221248811609, + "language_loss": 0.71053094, + "learning_rate": 4.537925628385286e-09, + "loss": 0.73154002, + "num_input_tokens_seen": 351457810, + "step": 16286, + "time_per_iteration": 2.5980701446533203 + }, + { + "auxiliary_loss_clip": 0.01080563, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.03264785, + "balance_loss_mlp": 1.01883459, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.4169258320791351, + "language_loss": 0.58365607, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60475457, + "num_input_tokens_seen": 351478825, + "step": 16287, + "time_per_iteration": 2.62699556350708 + }, + { + "auxiliary_loss_clip": 0.01085803, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.0332129, + "balance_loss_mlp": 1.02322614, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.613046024959523, + "language_loss": 0.81601667, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83721781, + "num_input_tokens_seen": 351498785, + "step": 16288, + "time_per_iteration": 2.5723607540130615 + }, + { + "auxiliary_loss_clip": 0.01069454, + "auxiliary_loss_mlp": 0.00749446, + "balance_loss_clip": 1.03153336, + "balance_loss_mlp": 1.00023234, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.4505111558972936, + "language_loss": 0.71822846, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73641747, + "num_input_tokens_seen": 351520235, + "step": 16289, + "time_per_iteration": 2.592878580093384 + }, + { + "auxiliary_loss_clip": 0.01061274, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.03274393, + "balance_loss_mlp": 1.02207541, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.4433381560063543, + "language_loss": 0.75918633, + "learning_rate": 4.43364754382003e-09, + "loss": 0.78013033, + "num_input_tokens_seen": 351538900, + "step": 16290, + "time_per_iteration": 2.6578686237335205 + }, + { + "auxiliary_loss_clip": 0.01085979, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.03243399, + "balance_loss_mlp": 1.01982391, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.8191959439116347, + "language_loss": 0.67226756, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69343889, + "num_input_tokens_seen": 351558715, + "step": 16291, + "time_per_iteration": 2.5217182636260986 + }, + { + "auxiliary_loss_clip": 0.0110032, + "auxiliary_loss_mlp": 0.00749462, + "balance_loss_clip": 1.03388262, + "balance_loss_mlp": 1.00023413, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.853796275375978, + "language_loss": 0.63124716, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64974499, + "num_input_tokens_seen": 351578450, + "step": 16292, + "time_per_iteration": 2.564283609390259 + }, + { + "auxiliary_loss_clip": 0.01062994, + "auxiliary_loss_mlp": 0.01028825, + "balance_loss_clip": 1.03316557, + "balance_loss_mlp": 1.01807785, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 1.6856638184172412, + "language_loss": 0.73458785, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75550604, + "num_input_tokens_seen": 351597195, + "step": 16293, + "time_per_iteration": 2.6774144172668457 + }, + { + "auxiliary_loss_clip": 0.01088417, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.03286302, + "balance_loss_mlp": 1.01757872, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 6.000913555098181, + "language_loss": 0.84041399, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86158681, + "num_input_tokens_seen": 351617460, + "step": 16294, + "time_per_iteration": 2.708188056945801 + }, + { + "auxiliary_loss_clip": 0.01057909, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.02911663, + "balance_loss_mlp": 1.02270782, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 2.522244002977392, + "language_loss": 0.71820557, + "learning_rate": 4.305002567088767e-09, + "loss": 0.73911583, + "num_input_tokens_seen": 351635900, + "step": 16295, + "time_per_iteration": 4.09872841835022 + }, + { + "auxiliary_loss_clip": 0.01092435, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.03527784, + "balance_loss_mlp": 1.02381277, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.6950729317586244, + "language_loss": 0.80892527, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.83020163, + "num_input_tokens_seen": 351655400, + "step": 16296, + "time_per_iteration": 2.551093339920044 + }, + { + "auxiliary_loss_clip": 0.01067492, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.03143764, + "balance_loss_mlp": 1.02077174, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 2.050180627661422, + "language_loss": 0.75508118, + "learning_rate": 4.254074308266853e-09, + "loss": 0.7760703, + "num_input_tokens_seen": 351675505, + "step": 16297, + "time_per_iteration": 2.6246609687805176 + }, + { + "auxiliary_loss_clip": 0.01084754, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.03180301, + "balance_loss_mlp": 1.02226448, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 6.700457979799126, + "language_loss": 0.780092, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80127525, + "num_input_tokens_seen": 351697920, + "step": 16298, + "time_per_iteration": 2.5801639556884766 + }, + { + "auxiliary_loss_clip": 0.0108188, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.03167737, + "balance_loss_mlp": 1.0143168, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 2.222023566979494, + "language_loss": 0.72652566, + "learning_rate": 4.203448764984019e-09, + "loss": 0.74759197, + "num_input_tokens_seen": 351717615, + "step": 16299, + "time_per_iteration": 2.5881612300872803 + }, + { + "auxiliary_loss_clip": 0.01071221, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.0314455, + "balance_loss_mlp": 1.01525211, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 2.2179158406943, + "language_loss": 0.89438391, + "learning_rate": 4.178249514071419e-09, + "loss": 0.9153592, + "num_input_tokens_seen": 351735260, + "step": 16300, + "time_per_iteration": 2.6066062450408936 + }, + { + "auxiliary_loss_clip": 0.01085608, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.03206229, + "balance_loss_mlp": 1.01965904, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.1187103977365096, + "language_loss": 0.78544724, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80661052, + "num_input_tokens_seen": 351755800, + "step": 16301, + "time_per_iteration": 2.5453357696533203 + }, + { + "auxiliary_loss_clip": 0.01075118, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.03220439, + "balance_loss_mlp": 1.02350616, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.189164226300034, + "language_loss": 0.75528008, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77637988, + "num_input_tokens_seen": 351774790, + "step": 16302, + "time_per_iteration": 4.045789480209351 + }, + { + "auxiliary_loss_clip": 0.01066876, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.0322839, + "balance_loss_mlp": 1.01966774, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.7817808262012678, + "language_loss": 0.79431391, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81529403, + "num_input_tokens_seen": 351792855, + "step": 16303, + "time_per_iteration": 2.547314167022705 + }, + { + "auxiliary_loss_clip": 0.01049754, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.02931869, + "balance_loss_mlp": 1.02095461, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 1.9410574573102177, + "language_loss": 0.83236367, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85319674, + "num_input_tokens_seen": 351811450, + "step": 16304, + "time_per_iteration": 2.632673501968384 + }, + { + "auxiliary_loss_clip": 0.01058917, + "auxiliary_loss_mlp": 0.0102518, + "balance_loss_clip": 1.03167832, + "balance_loss_mlp": 1.01529789, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.7898326284361585, + "language_loss": 0.70425248, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72509348, + "num_input_tokens_seen": 351831960, + "step": 16305, + "time_per_iteration": 2.687596321105957 + }, + { + "auxiliary_loss_clip": 0.01065413, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.03218818, + "balance_loss_mlp": 1.02834225, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 1.8653634671753923, + "language_loss": 0.71865529, + "learning_rate": 4.028643358815032e-09, + "loss": 0.73972797, + "num_input_tokens_seen": 351851585, + "step": 16306, + "time_per_iteration": 4.145172595977783 + }, + { + "auxiliary_loss_clip": 0.01067928, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.03031826, + "balance_loss_mlp": 1.02188182, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.5540515130605814, + "language_loss": 0.737436, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75844002, + "num_input_tokens_seen": 351871085, + "step": 16307, + "time_per_iteration": 2.636228084564209 + }, + { + "auxiliary_loss_clip": 0.01065299, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.03535903, + "balance_loss_mlp": 1.01986003, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.3740765647018023, + "language_loss": 0.74928981, + "learning_rate": 3.979380129822018e-09, + "loss": 0.77023822, + "num_input_tokens_seen": 351891775, + "step": 16308, + "time_per_iteration": 2.6484968662261963 + }, + { + "auxiliary_loss_clip": 0.01002946, + "auxiliary_loss_mlp": 0.0100692, + "balance_loss_clip": 1.0031569, + "balance_loss_mlp": 1.00597274, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.7652841538406032, + "language_loss": 0.57863641, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59873503, + "num_input_tokens_seen": 351946770, + "step": 16309, + "time_per_iteration": 3.00248646736145 + }, + { + "auxiliary_loss_clip": 0.01055008, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03159153, + "balance_loss_mlp": 1.01996183, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 3.641556619082478, + "language_loss": 0.66178536, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68264502, + "num_input_tokens_seen": 351966155, + "step": 16310, + "time_per_iteration": 2.6603119373321533 + }, + { + "auxiliary_loss_clip": 0.01005449, + "auxiliary_loss_mlp": 0.01003173, + "balance_loss_clip": 1.00777566, + "balance_loss_mlp": 1.00224364, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 1.2514369901925773, + "language_loss": 0.54521239, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56529862, + "num_input_tokens_seen": 352031655, + "step": 16311, + "time_per_iteration": 3.179812431335449 + }, + { + "auxiliary_loss_clip": 0.01083271, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.03209376, + "balance_loss_mlp": 1.01618564, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.6528405679367275, + "language_loss": 0.7996195, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82072008, + "num_input_tokens_seen": 352051920, + "step": 16312, + "time_per_iteration": 2.5809028148651123 + }, + { + "auxiliary_loss_clip": 0.01071368, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.03114748, + "balance_loss_mlp": 1.02059364, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.902104685088445, + "language_loss": 0.63390434, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65492535, + "num_input_tokens_seen": 352069315, + "step": 16313, + "time_per_iteration": 2.573540210723877 + }, + { + "auxiliary_loss_clip": 0.01085403, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.03331041, + "balance_loss_mlp": 1.01774716, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 2.504618396293357, + "language_loss": 0.72678703, + "learning_rate": 3.833407015731316e-09, + "loss": 0.74793118, + "num_input_tokens_seen": 352089480, + "step": 16314, + "time_per_iteration": 2.5418050289154053 + }, + { + "auxiliary_loss_clip": 0.00995398, + "auxiliary_loss_mlp": 0.01000985, + "balance_loss_clip": 1.00569892, + "balance_loss_mlp": 1.00015008, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6914757972023531, + "language_loss": 0.51674986, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53671372, + "num_input_tokens_seen": 352150000, + "step": 16315, + "time_per_iteration": 3.173856735229492 + }, + { + "auxiliary_loss_clip": 0.01084409, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.03151369, + "balance_loss_mlp": 1.01786113, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.3354340300875083, + "language_loss": 0.69755745, + "learning_rate": 3.785354859932033e-09, + "loss": 0.7186802, + "num_input_tokens_seen": 352170990, + "step": 16316, + "time_per_iteration": 2.6350183486938477 + }, + { + "auxiliary_loss_clip": 0.01098282, + "auxiliary_loss_mlp": 0.01025853, + "balance_loss_clip": 1.03405309, + "balance_loss_mlp": 1.01490927, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.7290769637467562, + "language_loss": 0.55303079, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57427216, + "num_input_tokens_seen": 352195335, + "step": 16317, + "time_per_iteration": 4.104888677597046 + }, + { + "auxiliary_loss_clip": 0.01033361, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02636933, + "balance_loss_mlp": 1.02325749, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.6031046185589686, + "language_loss": 0.72968316, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75036204, + "num_input_tokens_seen": 352214170, + "step": 16318, + "time_per_iteration": 2.746586561203003 + }, + { + "auxiliary_loss_clip": 0.01073012, + "auxiliary_loss_mlp": 0.01025069, + "balance_loss_clip": 1.03194356, + "balance_loss_mlp": 1.01498413, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.0122964674186443, + "language_loss": 0.82168132, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.8426621, + "num_input_tokens_seen": 352231470, + "step": 16319, + "time_per_iteration": 2.5687832832336426 + }, + { + "auxiliary_loss_clip": 0.01012213, + "auxiliary_loss_mlp": 0.01003559, + "balance_loss_clip": 1.00265718, + "balance_loss_mlp": 1.0023731, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7119439617443649, + "language_loss": 0.53520328, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55536097, + "num_input_tokens_seen": 352291770, + "step": 16320, + "time_per_iteration": 2.9841432571411133 + }, + { + "auxiliary_loss_clip": 0.0104361, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.02939558, + "balance_loss_mlp": 1.02006674, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 1.8879067441058184, + "language_loss": 0.73551172, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75625598, + "num_input_tokens_seen": 352310735, + "step": 16321, + "time_per_iteration": 2.707348346710205 + }, + { + "auxiliary_loss_clip": 0.01074488, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.03506446, + "balance_loss_mlp": 1.01992393, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.5247227746962182, + "language_loss": 0.78358215, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80463481, + "num_input_tokens_seen": 352329545, + "step": 16322, + "time_per_iteration": 2.6119277477264404 + }, + { + "auxiliary_loss_clip": 0.01085011, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.03252017, + "balance_loss_mlp": 1.01912761, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.5098374095594234, + "language_loss": 0.80505061, + "learning_rate": 3.619556806799595e-09, + "loss": 0.82620299, + "num_input_tokens_seen": 352352080, + "step": 16323, + "time_per_iteration": 2.612802505493164 + }, + { + "auxiliary_loss_clip": 0.01099079, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.03454423, + "balance_loss_mlp": 1.02428281, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.0010066922979632, + "language_loss": 0.84464049, + "learning_rate": 3.596174175278799e-09, + "loss": 0.86597997, + "num_input_tokens_seen": 352366455, + "step": 16324, + "time_per_iteration": 2.48930287361145 + }, + { + "auxiliary_loss_clip": 0.01075378, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.03400731, + "balance_loss_mlp": 1.01694882, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.5643501275793057, + "language_loss": 0.74528098, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76631689, + "num_input_tokens_seen": 352386090, + "step": 16325, + "time_per_iteration": 2.7119786739349365 + }, + { + "auxiliary_loss_clip": 0.01045606, + "auxiliary_loss_mlp": 0.01031124, + "balance_loss_clip": 1.03336716, + "balance_loss_mlp": 1.02140236, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.7234666385245412, + "language_loss": 0.76723647, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.7880038, + "num_input_tokens_seen": 352404000, + "step": 16326, + "time_per_iteration": 2.643667697906494 + }, + { + "auxiliary_loss_clip": 0.01061135, + "auxiliary_loss_mlp": 0.01025951, + "balance_loss_clip": 1.0318594, + "balance_loss_mlp": 1.01476943, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.7372537739869471, + "language_loss": 0.67306572, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69393659, + "num_input_tokens_seen": 352423540, + "step": 16327, + "time_per_iteration": 2.6809380054473877 + }, + { + "auxiliary_loss_clip": 0.01089071, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.03298712, + "balance_loss_mlp": 1.02291131, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.5465003049015218, + "language_loss": 0.73785019, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75909078, + "num_input_tokens_seen": 352445530, + "step": 16328, + "time_per_iteration": 2.6439027786254883 + }, + { + "auxiliary_loss_clip": 0.01078397, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.03254652, + "balance_loss_mlp": 1.02403545, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.7362237753001006, + "language_loss": 0.81171513, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.8328613, + "num_input_tokens_seen": 352466325, + "step": 16329, + "time_per_iteration": 2.6203837394714355 + }, + { + "auxiliary_loss_clip": 0.01099193, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.03274083, + "balance_loss_mlp": 1.01671481, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 5.065283846735381, + "language_loss": 0.7584334, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.77971113, + "num_input_tokens_seen": 352485505, + "step": 16330, + "time_per_iteration": 2.5659172534942627 + }, + { + "auxiliary_loss_clip": 0.01105308, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.03527117, + "balance_loss_mlp": 1.01727545, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.8651877312365692, + "language_loss": 0.66450042, + "learning_rate": 3.434615511252126e-09, + "loss": 0.6858623, + "num_input_tokens_seen": 352505360, + "step": 16331, + "time_per_iteration": 2.5682246685028076 + }, + { + "auxiliary_loss_clip": 0.01083629, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.0314703, + "balance_loss_mlp": 1.01777804, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 2.1261243502807146, + "language_loss": 0.73174912, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75287014, + "num_input_tokens_seen": 352524035, + "step": 16332, + "time_per_iteration": 2.55293345451355 + }, + { + "auxiliary_loss_clip": 0.01086065, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.03415704, + "balance_loss_mlp": 1.0194478, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 2.164574243166513, + "language_loss": 0.76369971, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78485763, + "num_input_tokens_seen": 352543210, + "step": 16333, + "time_per_iteration": 2.5668163299560547 + }, + { + "auxiliary_loss_clip": 0.01080305, + "auxiliary_loss_mlp": 0.00749227, + "balance_loss_clip": 1.03331065, + "balance_loss_mlp": 1.00020003, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.2075503257924556, + "language_loss": 0.72917008, + "learning_rate": 3.366511715771958e-09, + "loss": 0.74746537, + "num_input_tokens_seen": 352559770, + "step": 16334, + "time_per_iteration": 2.5307998657226562 + }, + { + "auxiliary_loss_clip": 0.010466, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.03124583, + "balance_loss_mlp": 1.02368546, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.8688552155249225, + "language_loss": 0.78318328, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80399466, + "num_input_tokens_seen": 352577690, + "step": 16335, + "time_per_iteration": 4.1845433712005615 + }, + { + "auxiliary_loss_clip": 0.01071049, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.03142142, + "balance_loss_mlp": 1.02738619, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.1864052102710554, + "language_loss": 0.64304966, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66416717, + "num_input_tokens_seen": 352598850, + "step": 16336, + "time_per_iteration": 2.712066411972046 + }, + { + "auxiliary_loss_clip": 0.01067474, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.03355587, + "balance_loss_mlp": 1.02300012, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.8822289011556355, + "language_loss": 0.73316348, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75419378, + "num_input_tokens_seen": 352616130, + "step": 16337, + "time_per_iteration": 2.6161441802978516 + }, + { + "auxiliary_loss_clip": 0.0108041, + "auxiliary_loss_mlp": 0.01028005, + "balance_loss_clip": 1.03208876, + "balance_loss_mlp": 1.01607192, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.893120499045941, + "language_loss": 0.73128927, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75237334, + "num_input_tokens_seen": 352636885, + "step": 16338, + "time_per_iteration": 2.549140691757202 + }, + { + "auxiliary_loss_clip": 0.01025341, + "auxiliary_loss_mlp": 0.01029059, + "balance_loss_clip": 1.0273397, + "balance_loss_mlp": 1.01772857, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.579801390503414, + "language_loss": 0.81289679, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83344078, + "num_input_tokens_seen": 352657905, + "step": 16339, + "time_per_iteration": 2.790919780731201 + }, + { + "auxiliary_loss_clip": 0.01038526, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.0263555, + "balance_loss_mlp": 1.02347994, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 2.020413887882622, + "language_loss": 0.62180519, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64254326, + "num_input_tokens_seen": 352676320, + "step": 16340, + "time_per_iteration": 2.73380184173584 + }, + { + "auxiliary_loss_clip": 0.01100903, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.03527522, + "balance_loss_mlp": 1.01966739, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.2120340125250744, + "language_loss": 0.85756314, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.87888014, + "num_input_tokens_seen": 352692665, + "step": 16341, + "time_per_iteration": 2.498525857925415 + }, + { + "auxiliary_loss_clip": 0.01068018, + "auxiliary_loss_mlp": 0.01026577, + "balance_loss_clip": 1.03098178, + "balance_loss_mlp": 1.01608634, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.3729090130397021, + "language_loss": 0.66715741, + "learning_rate": 3.188233008645014e-09, + "loss": 0.68810338, + "num_input_tokens_seen": 352716130, + "step": 16342, + "time_per_iteration": 4.146767616271973 + }, + { + "auxiliary_loss_clip": 0.01096828, + "auxiliary_loss_mlp": 0.01025629, + "balance_loss_clip": 1.03244209, + "balance_loss_mlp": 1.01456642, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.681864589644233, + "language_loss": 0.77221084, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79343534, + "num_input_tokens_seen": 352734705, + "step": 16343, + "time_per_iteration": 2.5283448696136475 + }, + { + "auxiliary_loss_clip": 0.01061971, + "auxiliary_loss_mlp": 0.01028029, + "balance_loss_clip": 1.03188705, + "balance_loss_mlp": 1.01849794, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.6197947342106036, + "language_loss": 0.75199819, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.7728982, + "num_input_tokens_seen": 352756225, + "step": 16344, + "time_per_iteration": 2.6466779708862305 + }, + { + "auxiliary_loss_clip": 0.010703, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.03169227, + "balance_loss_mlp": 1.01869082, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 2.0324190572885663, + "language_loss": 0.66498917, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68599343, + "num_input_tokens_seen": 352776210, + "step": 16345, + "time_per_iteration": 2.6137218475341797 + }, + { + "auxiliary_loss_clip": 0.01079709, + "auxiliary_loss_mlp": 0.01025, + "balance_loss_clip": 1.03083944, + "balance_loss_mlp": 1.01524258, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.5021031655554118, + "language_loss": 0.79493558, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.8159827, + "num_input_tokens_seen": 352795455, + "step": 16346, + "time_per_iteration": 4.071993827819824 + }, + { + "auxiliary_loss_clip": 0.01087174, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.03317952, + "balance_loss_mlp": 1.01635623, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 1.8699757468663063, + "language_loss": 0.74991953, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77107286, + "num_input_tokens_seen": 352812895, + "step": 16347, + "time_per_iteration": 2.5265281200408936 + }, + { + "auxiliary_loss_clip": 0.01017805, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.02594912, + "balance_loss_mlp": 1.02335858, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.6806229723911164, + "language_loss": 0.66547972, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68599606, + "num_input_tokens_seen": 352835470, + "step": 16348, + "time_per_iteration": 2.7879014015197754 + }, + { + "auxiliary_loss_clip": 0.0107608, + "auxiliary_loss_mlp": 0.01029495, + "balance_loss_clip": 1.03284049, + "balance_loss_mlp": 1.01847982, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.9674030427988263, + "language_loss": 0.69302601, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71408176, + "num_input_tokens_seen": 352854295, + "step": 16349, + "time_per_iteration": 2.6219828128814697 + }, + { + "auxiliary_loss_clip": 0.01062918, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.03150344, + "balance_loss_mlp": 1.01731873, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 1.9772664121453178, + "language_loss": 0.75719881, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.77810222, + "num_input_tokens_seen": 352869695, + "step": 16350, + "time_per_iteration": 2.6117894649505615 + }, + { + "auxiliary_loss_clip": 0.0106589, + "auxiliary_loss_mlp": 0.01026769, + "balance_loss_clip": 1.03284955, + "balance_loss_mlp": 1.01514602, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 1.9567929357484626, + "language_loss": 0.84255731, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86348385, + "num_input_tokens_seen": 352887430, + "step": 16351, + "time_per_iteration": 2.6609127521514893 + }, + { + "auxiliary_loss_clip": 0.01068641, + "auxiliary_loss_mlp": 0.01024818, + "balance_loss_clip": 1.0344975, + "balance_loss_mlp": 1.01370215, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.7695392212060197, + "language_loss": 0.68817556, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70911014, + "num_input_tokens_seen": 352907555, + "step": 16352, + "time_per_iteration": 2.719282865524292 + }, + { + "auxiliary_loss_clip": 0.01076015, + "auxiliary_loss_mlp": 0.00749248, + "balance_loss_clip": 1.03224039, + "balance_loss_mlp": 1.0002234, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.496360975863498, + "language_loss": 0.66487187, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68312448, + "num_input_tokens_seen": 352928670, + "step": 16353, + "time_per_iteration": 2.5886263847351074 + }, + { + "auxiliary_loss_clip": 0.01069322, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.02906585, + "balance_loss_mlp": 1.01847577, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 2.611479359332533, + "language_loss": 0.74376005, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76474154, + "num_input_tokens_seen": 352948345, + "step": 16354, + "time_per_iteration": 2.602872610092163 + }, + { + "auxiliary_loss_clip": 0.01086121, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.03340507, + "balance_loss_mlp": 1.01584077, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 7.058616743409821, + "language_loss": 0.77530712, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79643238, + "num_input_tokens_seen": 352967250, + "step": 16355, + "time_per_iteration": 2.566819190979004 + }, + { + "auxiliary_loss_clip": 0.01083508, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.03190458, + "balance_loss_mlp": 1.01935661, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 1.9702543427045591, + "language_loss": 0.73698819, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75812572, + "num_input_tokens_seen": 352984725, + "step": 16356, + "time_per_iteration": 2.561318874359131 + }, + { + "auxiliary_loss_clip": 0.010696, + "auxiliary_loss_mlp": 0.0103041, + "balance_loss_clip": 1.03040802, + "balance_loss_mlp": 1.01926422, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.6070943414618413, + "language_loss": 0.7605921, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.78159219, + "num_input_tokens_seen": 353003480, + "step": 16357, + "time_per_iteration": 4.04570198059082 + }, + { + "auxiliary_loss_clip": 0.01076492, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.03371108, + "balance_loss_mlp": 1.01354504, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 2.1224216954520214, + "language_loss": 0.79893899, + "learning_rate": 2.846214118442436e-09, + "loss": 0.81995493, + "num_input_tokens_seen": 353021425, + "step": 16358, + "time_per_iteration": 2.5836527347564697 + }, + { + "auxiliary_loss_clip": 0.01081958, + "auxiliary_loss_mlp": 0.01025968, + "balance_loss_clip": 1.02988422, + "balance_loss_mlp": 1.01558495, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.1175700285398156, + "language_loss": 0.67311382, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69419312, + "num_input_tokens_seen": 353039870, + "step": 16359, + "time_per_iteration": 2.568974494934082 + }, + { + "auxiliary_loss_clip": 0.0109334, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.03247476, + "balance_loss_mlp": 1.01832211, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 1.661717725763134, + "language_loss": 0.69297624, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71418828, + "num_input_tokens_seen": 353059750, + "step": 16360, + "time_per_iteration": 2.5329298973083496 + }, + { + "auxiliary_loss_clip": 0.01088122, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.03426456, + "balance_loss_mlp": 1.01889801, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.8421154594367957, + "language_loss": 0.84229839, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86348253, + "num_input_tokens_seen": 353079940, + "step": 16361, + "time_per_iteration": 2.629915237426758 + }, + { + "auxiliary_loss_clip": 0.01095015, + "auxiliary_loss_mlp": 0.01026373, + "balance_loss_clip": 1.03203893, + "balance_loss_mlp": 1.01599038, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.6016978691153871, + "language_loss": 0.75853097, + "learning_rate": 2.76373855876022e-09, + "loss": 0.77974486, + "num_input_tokens_seen": 353099990, + "step": 16362, + "time_per_iteration": 2.5348987579345703 + }, + { + "auxiliary_loss_clip": 0.01097603, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.0342952, + "balance_loss_mlp": 1.01984966, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.6227477734159554, + "language_loss": 0.71009266, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73137951, + "num_input_tokens_seen": 353118710, + "step": 16363, + "time_per_iteration": 2.4956510066986084 + }, + { + "auxiliary_loss_clip": 0.01070325, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.03060198, + "balance_loss_mlp": 1.01544118, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 1.9872794958792503, + "language_loss": 0.63209188, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65305209, + "num_input_tokens_seen": 353136415, + "step": 16364, + "time_per_iteration": 2.5977020263671875 + }, + { + "auxiliary_loss_clip": 0.01059687, + "auxiliary_loss_mlp": 0.01028214, + "balance_loss_clip": 1.03810811, + "balance_loss_mlp": 1.01840949, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 2.42139293982317, + "language_loss": 0.7530489, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77392793, + "num_input_tokens_seen": 353154650, + "step": 16365, + "time_per_iteration": 2.682375431060791 + }, + { + "auxiliary_loss_clip": 0.01051703, + "auxiliary_loss_mlp": 0.01026207, + "balance_loss_clip": 1.03173518, + "balance_loss_mlp": 1.01513219, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 1.7502203723186591, + "language_loss": 0.75939327, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78017235, + "num_input_tokens_seen": 353174065, + "step": 16366, + "time_per_iteration": 2.6267781257629395 + }, + { + "auxiliary_loss_clip": 0.01092667, + "auxiliary_loss_mlp": 0.01023065, + "balance_loss_clip": 1.0315758, + "balance_loss_mlp": 1.01290894, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.5681652850083447, + "language_loss": 0.77302444, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79418176, + "num_input_tokens_seen": 353193560, + "step": 16367, + "time_per_iteration": 2.5646936893463135 + }, + { + "auxiliary_loss_clip": 0.01074268, + "auxiliary_loss_mlp": 0.01030222, + "balance_loss_clip": 1.03503346, + "balance_loss_mlp": 1.0187304, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.51887499497109, + "language_loss": 0.61610132, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63714623, + "num_input_tokens_seen": 353213525, + "step": 16368, + "time_per_iteration": 2.5982720851898193 + }, + { + "auxiliary_loss_clip": 0.0108093, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.0310775, + "balance_loss_mlp": 1.02149749, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.5287483154964068, + "language_loss": 0.65510386, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67622399, + "num_input_tokens_seen": 353234000, + "step": 16369, + "time_per_iteration": 2.5876870155334473 + }, + { + "auxiliary_loss_clip": 0.01087371, + "auxiliary_loss_mlp": 0.00749353, + "balance_loss_clip": 1.03319955, + "balance_loss_mlp": 1.00022101, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.930828898817869, + "language_loss": 0.6869303, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70529759, + "num_input_tokens_seen": 353254940, + "step": 16370, + "time_per_iteration": 2.596802234649658 + }, + { + "auxiliary_loss_clip": 0.01097388, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.03235674, + "balance_loss_mlp": 1.01971531, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.7968840477009607, + "language_loss": 0.73730099, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75858891, + "num_input_tokens_seen": 353272590, + "step": 16371, + "time_per_iteration": 2.4594643115997314 + }, + { + "auxiliary_loss_clip": 0.01012971, + "auxiliary_loss_mlp": 0.00999535, + "balance_loss_clip": 1.0025717, + "balance_loss_mlp": 0.99860555, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7727522540228245, + "language_loss": 0.65222704, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67235208, + "num_input_tokens_seen": 353334380, + "step": 16372, + "time_per_iteration": 3.092255115509033 + }, + { + "auxiliary_loss_clip": 0.01084421, + "auxiliary_loss_mlp": 0.01029159, + "balance_loss_clip": 1.03247809, + "balance_loss_mlp": 1.01858497, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.6751906515477157, + "language_loss": 0.70754004, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72867584, + "num_input_tokens_seen": 353351640, + "step": 16373, + "time_per_iteration": 2.628270387649536 + }, + { + "auxiliary_loss_clip": 0.01095135, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.03367472, + "balance_loss_mlp": 1.0174526, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.794159207759391, + "language_loss": 0.81449795, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83573127, + "num_input_tokens_seen": 353372555, + "step": 16374, + "time_per_iteration": 2.4978621006011963 + }, + { + "auxiliary_loss_clip": 0.01054269, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.03608894, + "balance_loss_mlp": 1.0219326, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.9681860506949473, + "language_loss": 0.69128841, + "learning_rate": 2.504062005197927e-09, + "loss": 0.7121582, + "num_input_tokens_seen": 353391385, + "step": 16375, + "time_per_iteration": 4.088545322418213 + }, + { + "auxiliary_loss_clip": 0.01068507, + "auxiliary_loss_mlp": 0.01040922, + "balance_loss_clip": 1.02903187, + "balance_loss_mlp": 1.02761281, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.859588170235565, + "language_loss": 0.81133896, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83243322, + "num_input_tokens_seen": 353411630, + "step": 16376, + "time_per_iteration": 2.624206304550171 + }, + { + "auxiliary_loss_clip": 0.01094309, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.03262246, + "balance_loss_mlp": 1.02014673, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 2.2573674266207755, + "language_loss": 0.6240586, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64531147, + "num_input_tokens_seen": 353432895, + "step": 16377, + "time_per_iteration": 2.5484817028045654 + }, + { + "auxiliary_loss_clip": 0.01068345, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.03349602, + "balance_loss_mlp": 1.02270198, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 2.883430306957067, + "language_loss": 0.72787166, + "learning_rate": 2.445954472695133e-09, + "loss": 0.74889803, + "num_input_tokens_seen": 353454195, + "step": 16378, + "time_per_iteration": 2.651569366455078 + }, + { + "auxiliary_loss_clip": 0.01096754, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.03286052, + "balance_loss_mlp": 1.02193832, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.6566267157246033, + "language_loss": 0.70873415, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73002648, + "num_input_tokens_seen": 353475125, + "step": 16379, + "time_per_iteration": 2.5397021770477295 + }, + { + "auxiliary_loss_clip": 0.01066025, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.03331852, + "balance_loss_mlp": 1.02098012, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 1.8952211238951784, + "language_loss": 0.68544078, + "learning_rate": 2.407594853716999e-09, + "loss": 0.70641875, + "num_input_tokens_seen": 353493265, + "step": 16380, + "time_per_iteration": 2.6162068843841553 + }, + { + "auxiliary_loss_clip": 0.01063186, + "auxiliary_loss_mlp": 0.01035059, + "balance_loss_clip": 1.0320437, + "balance_loss_mlp": 1.0237937, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 3.2476933137483406, + "language_loss": 0.79016984, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81115234, + "num_input_tokens_seen": 353511650, + "step": 16381, + "time_per_iteration": 2.5610828399658203 + }, + { + "auxiliary_loss_clip": 0.01085036, + "auxiliary_loss_mlp": 0.0102514, + "balance_loss_clip": 1.033728, + "balance_loss_mlp": 1.01426768, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.4951684704903712, + "language_loss": 0.82382512, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84492689, + "num_input_tokens_seen": 353534035, + "step": 16382, + "time_per_iteration": 4.149328231811523 + }, + { + "auxiliary_loss_clip": 0.01064764, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.02920508, + "balance_loss_mlp": 1.02167439, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.8767351895291124, + "language_loss": 0.74331379, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76429647, + "num_input_tokens_seen": 353549950, + "step": 16383, + "time_per_iteration": 2.588858127593994 + }, + { + "auxiliary_loss_clip": 0.01050548, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.03433633, + "balance_loss_mlp": 1.01941895, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.6473345248295606, + "language_loss": 0.65852433, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.67933142, + "num_input_tokens_seen": 353573745, + "step": 16384, + "time_per_iteration": 2.8755125999450684 + }, + { + "auxiliary_loss_clip": 0.01078737, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.03536212, + "balance_loss_mlp": 1.02277982, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 2.1898132425606462, + "language_loss": 0.70230925, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72344601, + "num_input_tokens_seen": 353595335, + "step": 16385, + "time_per_iteration": 2.758185625076294 + }, + { + "auxiliary_loss_clip": 0.01080269, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.03641605, + "balance_loss_mlp": 1.02003527, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 3.667021225503261, + "language_loss": 0.80795538, + "learning_rate": 2.294333993509978e-09, + "loss": 0.82906771, + "num_input_tokens_seen": 353614270, + "step": 16386, + "time_per_iteration": 4.142850399017334 + }, + { + "auxiliary_loss_clip": 0.01067724, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.03278136, + "balance_loss_mlp": 1.02114725, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 2.071768241590062, + "language_loss": 0.67961872, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.70062315, + "num_input_tokens_seen": 353634900, + "step": 16387, + "time_per_iteration": 2.7710824012756348 + }, + { + "auxiliary_loss_clip": 0.01079086, + "auxiliary_loss_mlp": 0.00749064, + "balance_loss_clip": 1.03114438, + "balance_loss_mlp": 1.00020015, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.6455578830972692, + "language_loss": 0.7399143, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75819582, + "num_input_tokens_seen": 353652890, + "step": 16388, + "time_per_iteration": 2.5463547706604004 + }, + { + "auxiliary_loss_clip": 0.01079244, + "auxiliary_loss_mlp": 0.01027645, + "balance_loss_clip": 1.03028643, + "balance_loss_mlp": 1.01709461, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 1.675969262299436, + "language_loss": 0.82001555, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84108448, + "num_input_tokens_seen": 353671295, + "step": 16389, + "time_per_iteration": 2.537205934524536 + }, + { + "auxiliary_loss_clip": 0.01072975, + "auxiliary_loss_mlp": 0.00749367, + "balance_loss_clip": 1.03128219, + "balance_loss_mlp": 1.00024724, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 2.2003636442809826, + "language_loss": 0.67310393, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.69132733, + "num_input_tokens_seen": 353690560, + "step": 16390, + "time_per_iteration": 2.630298614501953 + }, + { + "auxiliary_loss_clip": 0.01056344, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.03200746, + "balance_loss_mlp": 1.0217154, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.7038160727580633, + "language_loss": 0.77293599, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79383516, + "num_input_tokens_seen": 353710660, + "step": 16391, + "time_per_iteration": 2.6575872898101807 + }, + { + "auxiliary_loss_clip": 0.01053721, + "auxiliary_loss_mlp": 0.00749129, + "balance_loss_clip": 1.03094554, + "balance_loss_mlp": 1.00017428, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 1.807236053915176, + "language_loss": 0.68115121, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.69917977, + "num_input_tokens_seen": 353730440, + "step": 16392, + "time_per_iteration": 2.6109366416931152 + }, + { + "auxiliary_loss_clip": 0.01063457, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.03053653, + "balance_loss_mlp": 1.01784468, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 1.902410744164055, + "language_loss": 0.56704021, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.58797628, + "num_input_tokens_seen": 353748360, + "step": 16393, + "time_per_iteration": 2.6383674144744873 + }, + { + "auxiliary_loss_clip": 0.0105882, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.03040254, + "balance_loss_mlp": 1.0178082, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 2.805432228369653, + "language_loss": 0.7909019, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81179535, + "num_input_tokens_seen": 353760880, + "step": 16394, + "time_per_iteration": 2.6536221504211426 + }, + { + "auxiliary_loss_clip": 0.01083591, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.03250432, + "balance_loss_mlp": 1.02372169, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.4743107456814935, + "language_loss": 0.75961685, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78080648, + "num_input_tokens_seen": 353782255, + "step": 16395, + "time_per_iteration": 2.6878559589385986 + }, + { + "auxiliary_loss_clip": 0.01083015, + "auxiliary_loss_mlp": 0.01028009, + "balance_loss_clip": 1.03258502, + "balance_loss_mlp": 1.0169282, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 2.4687770572138295, + "language_loss": 0.75417483, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77528507, + "num_input_tokens_seen": 353803580, + "step": 16396, + "time_per_iteration": 2.633819103240967 + }, + { + "auxiliary_loss_clip": 0.01056088, + "auxiliary_loss_mlp": 0.0102323, + "balance_loss_clip": 1.03109276, + "balance_loss_mlp": 1.01210201, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.451154753908907, + "language_loss": 0.70952773, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.73032087, + "num_input_tokens_seen": 353824200, + "step": 16397, + "time_per_iteration": 4.144931077957153 + }, + { + "auxiliary_loss_clip": 0.0106976, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.03273916, + "balance_loss_mlp": 1.0207572, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.739458307729772, + "language_loss": 0.71475804, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73576391, + "num_input_tokens_seen": 353843350, + "step": 16398, + "time_per_iteration": 2.592409372329712 + }, + { + "auxiliary_loss_clip": 0.01059382, + "auxiliary_loss_mlp": 0.01024166, + "balance_loss_clip": 1.03037119, + "balance_loss_mlp": 1.01400328, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.3510471096674648, + "language_loss": 0.74078584, + "learning_rate": 2.058291183208771e-09, + "loss": 0.7616213, + "num_input_tokens_seen": 353864520, + "step": 16399, + "time_per_iteration": 2.6538965702056885 + }, + { + "auxiliary_loss_clip": 0.01096562, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.03275633, + "balance_loss_mlp": 1.01662135, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.2583044814146525, + "language_loss": 0.57395399, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.5951975, + "num_input_tokens_seen": 353882240, + "step": 16400, + "time_per_iteration": 2.492227077484131 + }, + { + "auxiliary_loss_clip": 0.01077003, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.03269517, + "balance_loss_mlp": 1.01682067, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 7.6265060321994005, + "language_loss": 0.80214965, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82320625, + "num_input_tokens_seen": 353901590, + "step": 16401, + "time_per_iteration": 2.5787665843963623 + }, + { + "auxiliary_loss_clip": 0.01084954, + "auxiliary_loss_mlp": 0.01032117, + "balance_loss_clip": 1.03323388, + "balance_loss_mlp": 1.01999331, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.621606549560909, + "language_loss": 0.78416359, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80533433, + "num_input_tokens_seen": 353918785, + "step": 16402, + "time_per_iteration": 2.502981662750244 + }, + { + "auxiliary_loss_clip": 0.01088617, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.03472567, + "balance_loss_mlp": 1.02036476, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.5323606181286809, + "language_loss": 0.69850922, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.71970689, + "num_input_tokens_seen": 353940390, + "step": 16403, + "time_per_iteration": 2.5839295387268066 + }, + { + "auxiliary_loss_clip": 0.01077812, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.02880538, + "balance_loss_mlp": 1.02044082, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 2.0190278499165832, + "language_loss": 0.74649048, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.7675786, + "num_input_tokens_seen": 353962180, + "step": 16404, + "time_per_iteration": 2.5431854724884033 + }, + { + "auxiliary_loss_clip": 0.0108227, + "auxiliary_loss_mlp": 0.00749454, + "balance_loss_clip": 1.03072107, + "balance_loss_mlp": 1.00029516, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 1.9522515754914938, + "language_loss": 0.69838464, + "learning_rate": 1.953666699415768e-09, + "loss": 0.71670187, + "num_input_tokens_seen": 353984305, + "step": 16405, + "time_per_iteration": 2.654918909072876 + }, + { + "auxiliary_loss_clip": 0.01075391, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.0349071, + "balance_loss_mlp": 1.02106631, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.6643718737339297, + "language_loss": 0.69944686, + "learning_rate": 1.93649446302846e-09, + "loss": 0.72051084, + "num_input_tokens_seen": 354004495, + "step": 16406, + "time_per_iteration": 2.5914134979248047 + }, + { + "auxiliary_loss_clip": 0.01037039, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.03378046, + "balance_loss_mlp": 1.01975417, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 3.006632825443116, + "language_loss": 0.74103338, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.76171166, + "num_input_tokens_seen": 354015985, + "step": 16407, + "time_per_iteration": 2.732442617416382 + }, + { + "auxiliary_loss_clip": 0.01071487, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.03044891, + "balance_loss_mlp": 1.02051663, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 1.9055958431255857, + "language_loss": 0.77029431, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79131973, + "num_input_tokens_seen": 354033260, + "step": 16408, + "time_per_iteration": 2.564129590988159 + }, + { + "auxiliary_loss_clip": 0.01090129, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.03404188, + "balance_loss_mlp": 1.01693606, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.840061303527374, + "language_loss": 0.68161356, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70280576, + "num_input_tokens_seen": 354052825, + "step": 16409, + "time_per_iteration": 2.53244948387146 + }, + { + "auxiliary_loss_clip": 0.01005091, + "auxiliary_loss_mlp": 0.00999815, + "balance_loss_clip": 1.00469661, + "balance_loss_mlp": 0.99878371, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.802200140730802, + "language_loss": 0.61033529, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63038433, + "num_input_tokens_seen": 354113920, + "step": 16410, + "time_per_iteration": 3.2059884071350098 + }, + { + "auxiliary_loss_clip": 0.01087311, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.0339067, + "balance_loss_mlp": 1.01912415, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 3.527937341541534, + "language_loss": 0.66255748, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68372929, + "num_input_tokens_seen": 354134210, + "step": 16411, + "time_per_iteration": 2.612823963165283 + }, + { + "auxiliary_loss_clip": 0.01022809, + "auxiliary_loss_mlp": 0.01006442, + "balance_loss_clip": 1.00291121, + "balance_loss_mlp": 1.00544035, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7224513812734197, + "language_loss": 0.56233013, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58262265, + "num_input_tokens_seen": 354198010, + "step": 16412, + "time_per_iteration": 3.187441110610962 + }, + { + "auxiliary_loss_clip": 0.01066613, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.03168154, + "balance_loss_mlp": 1.02175844, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 1.883019730049328, + "language_loss": 0.73074138, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75174582, + "num_input_tokens_seen": 354220000, + "step": 16413, + "time_per_iteration": 2.6522111892700195 + }, + { + "auxiliary_loss_clip": 0.01051188, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03190923, + "balance_loss_mlp": 1.01871991, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.4587535419298923, + "language_loss": 0.71383989, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73464847, + "num_input_tokens_seen": 354240910, + "step": 16414, + "time_per_iteration": 4.127901792526245 + }, + { + "auxiliary_loss_clip": 0.01076943, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.03292072, + "balance_loss_mlp": 1.02125788, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.5861029627609724, + "language_loss": 0.70381629, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72490382, + "num_input_tokens_seen": 354259430, + "step": 16415, + "time_per_iteration": 2.531589984893799 + }, + { + "auxiliary_loss_clip": 0.01052984, + "auxiliary_loss_mlp": 0.01026525, + "balance_loss_clip": 1.03062463, + "balance_loss_mlp": 1.01646972, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.4067548422584364, + "language_loss": 0.75280076, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77359587, + "num_input_tokens_seen": 354279490, + "step": 16416, + "time_per_iteration": 2.6509346961975098 + }, + { + "auxiliary_loss_clip": 0.0107172, + "auxiliary_loss_mlp": 0.01026059, + "balance_loss_clip": 1.03352487, + "balance_loss_mlp": 1.01533628, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.2668982119269354, + "language_loss": 0.70688576, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.72786355, + "num_input_tokens_seen": 354295080, + "step": 16417, + "time_per_iteration": 2.5557796955108643 + }, + { + "auxiliary_loss_clip": 0.01075638, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.0347091, + "balance_loss_mlp": 1.02224159, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 2.008123477260715, + "language_loss": 0.70672351, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.727817, + "num_input_tokens_seen": 354314610, + "step": 16418, + "time_per_iteration": 2.670677661895752 + }, + { + "auxiliary_loss_clip": 0.0102304, + "auxiliary_loss_mlp": 0.01001765, + "balance_loss_clip": 1.00324392, + "balance_loss_mlp": 1.00076318, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6582339877220421, + "language_loss": 0.53690511, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55715317, + "num_input_tokens_seen": 354383115, + "step": 16419, + "time_per_iteration": 3.191234588623047 + }, + { + "auxiliary_loss_clip": 0.01074873, + "auxiliary_loss_mlp": 0.01034657, + "balance_loss_clip": 1.0311054, + "balance_loss_mlp": 1.02237868, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.7198897614040785, + "language_loss": 0.78104222, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80213755, + "num_input_tokens_seen": 354403115, + "step": 16420, + "time_per_iteration": 2.6218395233154297 + }, + { + "auxiliary_loss_clip": 0.01061687, + "auxiliary_loss_mlp": 0.01026416, + "balance_loss_clip": 1.03782928, + "balance_loss_mlp": 1.01547241, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 1.6566028535108723, + "language_loss": 0.70767623, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.72855729, + "num_input_tokens_seen": 354424520, + "step": 16421, + "time_per_iteration": 2.650336503982544 + }, + { + "auxiliary_loss_clip": 0.01089008, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.03340316, + "balance_loss_mlp": 1.02158391, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 2.1671447110592412, + "language_loss": 0.82069814, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84192568, + "num_input_tokens_seen": 354444800, + "step": 16422, + "time_per_iteration": 4.218950271606445 + }, + { + "auxiliary_loss_clip": 0.01060639, + "auxiliary_loss_mlp": 0.01022565, + "balance_loss_clip": 1.03166389, + "balance_loss_mlp": 1.0121398, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.7495750853351546, + "language_loss": 0.86080575, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88163781, + "num_input_tokens_seen": 354464590, + "step": 16423, + "time_per_iteration": 2.6534414291381836 + }, + { + "auxiliary_loss_clip": 0.01089096, + "auxiliary_loss_mlp": 0.01024479, + "balance_loss_clip": 1.03409326, + "balance_loss_mlp": 1.01339805, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 2.134860853159361, + "language_loss": 0.70332503, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72446084, + "num_input_tokens_seen": 354484145, + "step": 16424, + "time_per_iteration": 2.5658349990844727 + }, + { + "auxiliary_loss_clip": 0.01086503, + "auxiliary_loss_mlp": 0.0074927, + "balance_loss_clip": 1.03201342, + "balance_loss_mlp": 1.00025439, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 1.96801179332989, + "language_loss": 0.80541599, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82377374, + "num_input_tokens_seen": 354502475, + "step": 16425, + "time_per_iteration": 4.0858237743377686 + }, + { + "auxiliary_loss_clip": 0.01039946, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.02824545, + "balance_loss_mlp": 1.01732993, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 1.736397165318413, + "language_loss": 0.80170566, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82240266, + "num_input_tokens_seen": 354521855, + "step": 16426, + "time_per_iteration": 2.700044870376587 + }, + { + "auxiliary_loss_clip": 0.01087946, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.03513718, + "balance_loss_mlp": 1.02093983, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.830011574848029, + "language_loss": 0.84876347, + "learning_rate": 1.593380599750338e-09, + "loss": 0.86995685, + "num_input_tokens_seen": 354539535, + "step": 16427, + "time_per_iteration": 2.531235933303833 + }, + { + "auxiliary_loss_clip": 0.01095973, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.03391457, + "balance_loss_mlp": 1.01790857, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.914333244616175, + "language_loss": 0.70425057, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72549927, + "num_input_tokens_seen": 354557430, + "step": 16428, + "time_per_iteration": 2.6049206256866455 + }, + { + "auxiliary_loss_clip": 0.01055896, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.03065348, + "balance_loss_mlp": 1.0201416, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 1.8943145384346272, + "language_loss": 0.80170131, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82256627, + "num_input_tokens_seen": 354574735, + "step": 16429, + "time_per_iteration": 2.673318386077881 + }, + { + "auxiliary_loss_clip": 0.01095453, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.03289342, + "balance_loss_mlp": 1.01778424, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.5857508244616807, + "language_loss": 0.61914873, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64038557, + "num_input_tokens_seen": 354597050, + "step": 16430, + "time_per_iteration": 2.6084041595458984 + }, + { + "auxiliary_loss_clip": 0.01097205, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03325009, + "balance_loss_mlp": 1.02332544, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.341483470429869, + "language_loss": 0.73145914, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75277531, + "num_input_tokens_seen": 354619095, + "step": 16431, + "time_per_iteration": 2.5143325328826904 + }, + { + "auxiliary_loss_clip": 0.01097315, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.03446174, + "balance_loss_mlp": 1.02228415, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.6096693565780797, + "language_loss": 0.80647129, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.82777441, + "num_input_tokens_seen": 354633790, + "step": 16432, + "time_per_iteration": 2.4503793716430664 + }, + { + "auxiliary_loss_clip": 0.01081722, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.03137279, + "balance_loss_mlp": 1.01910329, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 2.4901209159292996, + "language_loss": 0.80397093, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82507467, + "num_input_tokens_seen": 354653180, + "step": 16433, + "time_per_iteration": 2.52402400970459 + }, + { + "auxiliary_loss_clip": 0.01093955, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.03341651, + "balance_loss_mlp": 1.01975799, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.503743214538171, + "language_loss": 0.64527392, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.66651952, + "num_input_tokens_seen": 354669900, + "step": 16434, + "time_per_iteration": 2.591610908508301 + }, + { + "auxiliary_loss_clip": 0.01085284, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.03048921, + "balance_loss_mlp": 1.02130055, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.6093530211657452, + "language_loss": 0.69208014, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71326101, + "num_input_tokens_seen": 354693165, + "step": 16435, + "time_per_iteration": 2.6835525035858154 + }, + { + "auxiliary_loss_clip": 0.01038281, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.03219581, + "balance_loss_mlp": 1.02028239, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.7609188190902156, + "language_loss": 0.75516033, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77585876, + "num_input_tokens_seen": 354711915, + "step": 16436, + "time_per_iteration": 2.6195273399353027 + }, + { + "auxiliary_loss_clip": 0.01064772, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.03161597, + "balance_loss_mlp": 1.01791811, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.7710651350841333, + "language_loss": 0.74237078, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76331705, + "num_input_tokens_seen": 354729135, + "step": 16437, + "time_per_iteration": 2.569032669067383 + }, + { + "auxiliary_loss_clip": 0.01060576, + "auxiliary_loss_mlp": 0.01027894, + "balance_loss_clip": 1.03128493, + "balance_loss_mlp": 1.01761794, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 3.6180517693650343, + "language_loss": 0.60457575, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62546051, + "num_input_tokens_seen": 354752530, + "step": 16438, + "time_per_iteration": 4.126100540161133 + }, + { + "auxiliary_loss_clip": 0.01068217, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.03101587, + "balance_loss_mlp": 1.01973915, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 1.8145497070045806, + "language_loss": 0.71945584, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74044919, + "num_input_tokens_seen": 354771135, + "step": 16439, + "time_per_iteration": 2.5522968769073486 + }, + { + "auxiliary_loss_clip": 0.01084108, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.03373086, + "balance_loss_mlp": 1.0200479, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.7449551061488489, + "language_loss": 0.59756845, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.61871898, + "num_input_tokens_seen": 354791800, + "step": 16440, + "time_per_iteration": 2.6744582653045654 + }, + { + "auxiliary_loss_clip": 0.0109666, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.03166568, + "balance_loss_mlp": 1.01956511, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 3.1590261758680227, + "language_loss": 0.75268388, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.77395642, + "num_input_tokens_seen": 354809200, + "step": 16441, + "time_per_iteration": 2.5114264488220215 + }, + { + "auxiliary_loss_clip": 0.01071287, + "auxiliary_loss_mlp": 0.01027878, + "balance_loss_clip": 1.03059971, + "balance_loss_mlp": 1.01702976, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 1.9776553093828313, + "language_loss": 0.67743504, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.69842666, + "num_input_tokens_seen": 354829945, + "step": 16442, + "time_per_iteration": 2.721719264984131 + }, + { + "auxiliary_loss_clip": 0.01084963, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.03231001, + "balance_loss_mlp": 1.01776695, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.205527811534297, + "language_loss": 0.74470603, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.7658447, + "num_input_tokens_seen": 354845055, + "step": 16443, + "time_per_iteration": 2.5811023712158203 + }, + { + "auxiliary_loss_clip": 0.01074081, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.03078794, + "balance_loss_mlp": 1.0157001, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 1.7826381834367626, + "language_loss": 0.73476422, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75577861, + "num_input_tokens_seen": 354864680, + "step": 16444, + "time_per_iteration": 2.5782253742218018 + }, + { + "auxiliary_loss_clip": 0.01047711, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.03349602, + "balance_loss_mlp": 1.02475429, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 1.7534671258552856, + "language_loss": 0.69101179, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71185195, + "num_input_tokens_seen": 354885685, + "step": 16445, + "time_per_iteration": 2.6833441257476807 + }, + { + "auxiliary_loss_clip": 0.0109078, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.03563833, + "balance_loss_mlp": 1.01586628, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 2.8245361980976993, + "language_loss": 0.59864962, + "learning_rate": 1.311740377491155e-09, + "loss": 0.61983031, + "num_input_tokens_seen": 354901505, + "step": 16446, + "time_per_iteration": 2.535245418548584 + }, + { + "auxiliary_loss_clip": 0.01067672, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.03215027, + "balance_loss_mlp": 1.02097368, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.0516099931072174, + "language_loss": 0.71000123, + "learning_rate": 1.297675079582783e-09, + "loss": 0.73099107, + "num_input_tokens_seen": 354920060, + "step": 16447, + "time_per_iteration": 2.5269734859466553 + }, + { + "auxiliary_loss_clip": 0.01094555, + "auxiliary_loss_mlp": 0.00749316, + "balance_loss_clip": 1.03239834, + "balance_loss_mlp": 1.00019419, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.8284043509433558, + "language_loss": 0.83742321, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.8558619, + "num_input_tokens_seen": 354938690, + "step": 16448, + "time_per_iteration": 2.526796579360962 + }, + { + "auxiliary_loss_clip": 0.0108232, + "auxiliary_loss_mlp": 0.01024593, + "balance_loss_clip": 1.03300333, + "balance_loss_mlp": 1.01474071, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 4.184010503792284, + "language_loss": 0.704175, + "learning_rate": 1.26977185727406e-09, + "loss": 0.72524416, + "num_input_tokens_seen": 354956955, + "step": 16449, + "time_per_iteration": 2.5285582542419434 + }, + { + "auxiliary_loss_clip": 0.01086723, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.03244615, + "balance_loss_mlp": 1.01794469, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 2.474122332627365, + "language_loss": 0.7356863, + "learning_rate": 1.25593393393153e-09, + "loss": 0.75684452, + "num_input_tokens_seen": 354976800, + "step": 16450, + "time_per_iteration": 2.6663591861724854 + }, + { + "auxiliary_loss_clip": 0.01097302, + "auxiliary_loss_mlp": 0.01029755, + "balance_loss_clip": 1.03130662, + "balance_loss_mlp": 1.01846588, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 3.0717884949371057, + "language_loss": 0.79309261, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81436312, + "num_input_tokens_seen": 354996625, + "step": 16451, + "time_per_iteration": 2.5185294151306152 + }, + { + "auxiliary_loss_clip": 0.01057282, + "auxiliary_loss_mlp": 0.01034643, + "balance_loss_clip": 1.02963018, + "balance_loss_mlp": 1.02291274, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 3.409871897440398, + "language_loss": 0.70042616, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72134542, + "num_input_tokens_seen": 355014535, + "step": 16452, + "time_per_iteration": 2.701216697692871 + }, + { + "auxiliary_loss_clip": 0.01095319, + "auxiliary_loss_mlp": 0.01023983, + "balance_loss_clip": 1.03447092, + "balance_loss_mlp": 1.01382589, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.621046091657929, + "language_loss": 0.73937792, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76057088, + "num_input_tokens_seen": 355033280, + "step": 16453, + "time_per_iteration": 2.5039539337158203 + }, + { + "auxiliary_loss_clip": 0.01057126, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.03270698, + "balance_loss_mlp": 1.02412868, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.513890021781657, + "language_loss": 0.70126486, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.72217989, + "num_input_tokens_seen": 355053320, + "step": 16454, + "time_per_iteration": 4.16351056098938 + }, + { + "auxiliary_loss_clip": 0.01064597, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.03117466, + "balance_loss_mlp": 1.01947343, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 1.7857857937065449, + "language_loss": 0.75661242, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.77755427, + "num_input_tokens_seen": 355070230, + "step": 16455, + "time_per_iteration": 2.613320827484131 + }, + { + "auxiliary_loss_clip": 0.01068772, + "auxiliary_loss_mlp": 0.01025958, + "balance_loss_clip": 1.03321445, + "balance_loss_mlp": 1.01502037, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.6292192600971391, + "language_loss": 0.65503222, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.6759795, + "num_input_tokens_seen": 355090125, + "step": 16456, + "time_per_iteration": 2.6043901443481445 + }, + { + "auxiliary_loss_clip": 0.01088621, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.03517449, + "balance_loss_mlp": 1.01698089, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 2.093409860453424, + "language_loss": 0.7412498, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76241243, + "num_input_tokens_seen": 355107890, + "step": 16457, + "time_per_iteration": 2.5541186332702637 + }, + { + "auxiliary_loss_clip": 0.01098416, + "auxiliary_loss_mlp": 0.01025886, + "balance_loss_clip": 1.03518581, + "balance_loss_mlp": 1.01530588, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.125560514777447, + "language_loss": 0.69441009, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.71565306, + "num_input_tokens_seen": 355126340, + "step": 16458, + "time_per_iteration": 2.5735068321228027 + }, + { + "auxiliary_loss_clip": 0.01082189, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.03197074, + "balance_loss_mlp": 1.01921558, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.7365793189890077, + "language_loss": 0.79211247, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81323338, + "num_input_tokens_seen": 355144025, + "step": 16459, + "time_per_iteration": 2.5401012897491455 + }, + { + "auxiliary_loss_clip": 0.01071645, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03158188, + "balance_loss_mlp": 1.02117419, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 2.022398939438978, + "language_loss": 0.70936167, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.73039711, + "num_input_tokens_seen": 355163125, + "step": 16460, + "time_per_iteration": 2.610856533050537 + }, + { + "auxiliary_loss_clip": 0.01075683, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.03127718, + "balance_loss_mlp": 1.01560283, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.5989152507513178, + "language_loss": 0.8758229, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89684832, + "num_input_tokens_seen": 355184060, + "step": 16461, + "time_per_iteration": 2.6148438453674316 + }, + { + "auxiliary_loss_clip": 0.01084377, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.03248334, + "balance_loss_mlp": 1.02080822, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 2.1668307257026793, + "language_loss": 0.63263917, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65380555, + "num_input_tokens_seen": 355204505, + "step": 16462, + "time_per_iteration": 2.5588433742523193 + }, + { + "auxiliary_loss_clip": 0.01090874, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.03689063, + "balance_loss_mlp": 1.01946926, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.6116541654454282, + "language_loss": 0.73120052, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75240958, + "num_input_tokens_seen": 355223055, + "step": 16463, + "time_per_iteration": 4.064153671264648 + }, + { + "auxiliary_loss_clip": 0.01086638, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.0336237, + "balance_loss_mlp": 1.01594853, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 1.8978849532187279, + "language_loss": 0.70109075, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72223181, + "num_input_tokens_seen": 355242000, + "step": 16464, + "time_per_iteration": 2.6430230140686035 + }, + { + "auxiliary_loss_clip": 0.01067027, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.03188384, + "balance_loss_mlp": 1.01637018, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 1.9867648949108392, + "language_loss": 0.72738016, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.74832404, + "num_input_tokens_seen": 355260175, + "step": 16465, + "time_per_iteration": 2.656817674636841 + }, + { + "auxiliary_loss_clip": 0.01094352, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.03243899, + "balance_loss_mlp": 1.01949465, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.6437471405296737, + "language_loss": 0.86106217, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88230002, + "num_input_tokens_seen": 355281930, + "step": 16466, + "time_per_iteration": 4.166178464889526 + }, + { + "auxiliary_loss_clip": 0.01057691, + "auxiliary_loss_mlp": 0.01023907, + "balance_loss_clip": 1.03220212, + "balance_loss_mlp": 1.01344669, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.6676581213382842, + "language_loss": 0.71686685, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73768282, + "num_input_tokens_seen": 355301555, + "step": 16467, + "time_per_iteration": 2.5928900241851807 + }, + { + "auxiliary_loss_clip": 0.01063857, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.02864838, + "balance_loss_mlp": 1.02144372, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.619427102144393, + "language_loss": 0.64983201, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67080534, + "num_input_tokens_seen": 355324925, + "step": 16468, + "time_per_iteration": 2.672534227371216 + }, + { + "auxiliary_loss_clip": 0.01068761, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.03317583, + "balance_loss_mlp": 1.0192728, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 1.9564217236262702, + "language_loss": 0.62267768, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.6436677, + "num_input_tokens_seen": 355343875, + "step": 16469, + "time_per_iteration": 2.64542555809021 + }, + { + "auxiliary_loss_clip": 0.01060127, + "auxiliary_loss_mlp": 0.01027275, + "balance_loss_clip": 1.03085065, + "balance_loss_mlp": 1.01593828, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.448494946623033, + "language_loss": 0.70273924, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72361332, + "num_input_tokens_seen": 355358835, + "step": 16470, + "time_per_iteration": 2.565930128097534 + }, + { + "auxiliary_loss_clip": 0.01012648, + "auxiliary_loss_mlp": 0.01004349, + "balance_loss_clip": 1.00297034, + "balance_loss_mlp": 1.00332344, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6992893575240045, + "language_loss": 0.55447793, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57464784, + "num_input_tokens_seen": 355431225, + "step": 16471, + "time_per_iteration": 3.311102867126465 + }, + { + "auxiliary_loss_clip": 0.01077274, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.03421068, + "balance_loss_mlp": 1.01765728, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 1.9958323075819266, + "language_loss": 0.84002101, + "learning_rate": 9.706760407131032e-10, + "loss": 0.86108172, + "num_input_tokens_seen": 355448250, + "step": 16472, + "time_per_iteration": 2.5822689533233643 + }, + { + "auxiliary_loss_clip": 0.01086371, + "auxiliary_loss_mlp": 0.01025666, + "balance_loss_clip": 1.03468418, + "balance_loss_mlp": 1.01493692, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 1.8244858627006746, + "language_loss": 0.85688758, + "learning_rate": 9.585814735431075e-10, + "loss": 0.87800795, + "num_input_tokens_seen": 355467040, + "step": 16473, + "time_per_iteration": 2.6239898204803467 + }, + { + "auxiliary_loss_clip": 0.01094461, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.0319252, + "balance_loss_mlp": 1.01779222, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 2.2045817086908936, + "language_loss": 0.84483206, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86605513, + "num_input_tokens_seen": 355487825, + "step": 16474, + "time_per_iteration": 2.5760042667388916 + }, + { + "auxiliary_loss_clip": 0.01069704, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.02839983, + "balance_loss_mlp": 1.02330518, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.6352442681034862, + "language_loss": 0.7634486, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78447795, + "num_input_tokens_seen": 355507445, + "step": 16475, + "time_per_iteration": 2.6296982765197754 + }, + { + "auxiliary_loss_clip": 0.01051131, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.02827311, + "balance_loss_mlp": 1.02425504, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.507311462218359, + "language_loss": 0.75821006, + "learning_rate": 9.227525969588423e-10, + "loss": 0.77908206, + "num_input_tokens_seen": 355527205, + "step": 16476, + "time_per_iteration": 2.6635079383850098 + }, + { + "auxiliary_loss_clip": 0.01089405, + "auxiliary_loss_mlp": 0.00749489, + "balance_loss_clip": 1.03239429, + "balance_loss_mlp": 1.0002501, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.319561391994334, + "language_loss": 0.67654133, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69493026, + "num_input_tokens_seen": 355544740, + "step": 16477, + "time_per_iteration": 2.5726823806762695 + }, + { + "auxiliary_loss_clip": 0.010813, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.03550422, + "balance_loss_mlp": 1.01745987, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 31.026178646530983, + "language_loss": 0.71775508, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73886245, + "num_input_tokens_seen": 355564385, + "step": 16478, + "time_per_iteration": 3.985135793685913 + }, + { + "auxiliary_loss_clip": 0.01098519, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.03393054, + "balance_loss_mlp": 1.02715874, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.284996601840504, + "language_loss": 0.81120729, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83258593, + "num_input_tokens_seen": 355579260, + "step": 16479, + "time_per_iteration": 2.410487174987793 + }, + { + "auxiliary_loss_clip": 0.01088006, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.03342009, + "balance_loss_mlp": 1.0225476, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.789425135084574, + "language_loss": 0.65939891, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68061018, + "num_input_tokens_seen": 355599790, + "step": 16480, + "time_per_iteration": 2.56136417388916 + }, + { + "auxiliary_loss_clip": 0.01082584, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.03123915, + "balance_loss_mlp": 1.01883268, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.9090336321739028, + "language_loss": 0.72398078, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74509805, + "num_input_tokens_seen": 355620925, + "step": 16481, + "time_per_iteration": 2.8342320919036865 + }, + { + "auxiliary_loss_clip": 0.0108335, + "auxiliary_loss_mlp": 0.01022192, + "balance_loss_clip": 1.03205085, + "balance_loss_mlp": 1.01160014, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 2.220337595262995, + "language_loss": 0.77763265, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79868805, + "num_input_tokens_seen": 355639165, + "step": 16482, + "time_per_iteration": 2.6022183895111084 + }, + { + "auxiliary_loss_clip": 0.01086597, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.03250289, + "balance_loss_mlp": 1.01673532, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 2.026611848940476, + "language_loss": 0.74875963, + "learning_rate": 8.418050878944427e-10, + "loss": 0.76990056, + "num_input_tokens_seen": 355657320, + "step": 16483, + "time_per_iteration": 2.509256362915039 + }, + { + "auxiliary_loss_clip": 0.01012713, + "auxiliary_loss_mlp": 0.01000405, + "balance_loss_clip": 1.00268435, + "balance_loss_mlp": 0.99950522, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6768039172572019, + "language_loss": 0.53641772, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55654883, + "num_input_tokens_seen": 355726370, + "step": 16484, + "time_per_iteration": 3.243687629699707 + }, + { + "auxiliary_loss_clip": 0.01092567, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.03236938, + "balance_loss_mlp": 1.0153091, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.7447528719289014, + "language_loss": 0.81977665, + "learning_rate": 8.19359496165184e-10, + "loss": 0.84096014, + "num_input_tokens_seen": 355745840, + "step": 16485, + "time_per_iteration": 2.4822771549224854 + }, + { + "auxiliary_loss_clip": 0.01050138, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.0289427, + "balance_loss_mlp": 1.02060962, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 2.0143336688226916, + "language_loss": 0.8136763, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83450437, + "num_input_tokens_seen": 355763385, + "step": 16486, + "time_per_iteration": 2.5751118659973145 + }, + { + "auxiliary_loss_clip": 0.01088919, + "auxiliary_loss_mlp": 0.01025328, + "balance_loss_clip": 1.03453159, + "balance_loss_mlp": 1.0152843, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.5209088755202558, + "language_loss": 0.66028047, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68142295, + "num_input_tokens_seen": 355786075, + "step": 16487, + "time_per_iteration": 2.6469004154205322 + }, + { + "auxiliary_loss_clip": 0.01082686, + "auxiliary_loss_mlp": 0.00749367, + "balance_loss_clip": 1.03191161, + "balance_loss_mlp": 1.00031948, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.7540768038656165, + "language_loss": 0.76512933, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78344989, + "num_input_tokens_seen": 355806295, + "step": 16488, + "time_per_iteration": 2.610628366470337 + }, + { + "auxiliary_loss_clip": 0.01069993, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.03338003, + "balance_loss_mlp": 1.01726758, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 2.7201319690957844, + "language_loss": 0.68552786, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70651424, + "num_input_tokens_seen": 355825730, + "step": 16489, + "time_per_iteration": 2.6028029918670654 + }, + { + "auxiliary_loss_clip": 0.00984597, + "auxiliary_loss_mlp": 0.01004273, + "balance_loss_clip": 1.00583172, + "balance_loss_mlp": 1.00327766, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6099337473030333, + "language_loss": 0.52545297, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54534167, + "num_input_tokens_seen": 355891545, + "step": 16490, + "time_per_iteration": 3.261777877807617 + }, + { + "auxiliary_loss_clip": 0.01068367, + "auxiliary_loss_mlp": 0.01038222, + "balance_loss_clip": 1.03321147, + "balance_loss_mlp": 1.02575862, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 1.6182440126590711, + "language_loss": 0.75715137, + "learning_rate": 7.538421534734052e-10, + "loss": 0.77821726, + "num_input_tokens_seen": 355909920, + "step": 16491, + "time_per_iteration": 2.594810724258423 + }, + { + "auxiliary_loss_clip": 0.01057202, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03543782, + "balance_loss_mlp": 1.0189383, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 3.0391769469128715, + "language_loss": 0.70127606, + "learning_rate": 7.431879346191383e-10, + "loss": 0.7221595, + "num_input_tokens_seen": 355923130, + "step": 16492, + "time_per_iteration": 2.6255788803100586 + }, + { + "auxiliary_loss_clip": 0.01059497, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.03031659, + "balance_loss_mlp": 1.01929915, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 2.169294716293776, + "language_loss": 0.68215346, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70306361, + "num_input_tokens_seen": 355941960, + "step": 16493, + "time_per_iteration": 2.653824806213379 + }, + { + "auxiliary_loss_clip": 0.01072566, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.03317952, + "balance_loss_mlp": 1.02043617, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.7552132518599872, + "language_loss": 0.71232778, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73337173, + "num_input_tokens_seen": 355961640, + "step": 16494, + "time_per_iteration": 2.6218888759613037 + }, + { + "auxiliary_loss_clip": 0.01086326, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.03307581, + "balance_loss_mlp": 1.0197283, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.299276478551422, + "language_loss": 0.68214822, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70332843, + "num_input_tokens_seen": 355977980, + "step": 16495, + "time_per_iteration": 4.07647967338562 + }, + { + "auxiliary_loss_clip": 0.01003962, + "auxiliary_loss_mlp": 0.0100227, + "balance_loss_clip": 1.00385356, + "balance_loss_mlp": 1.00142312, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7217394181116746, + "language_loss": 0.53442407, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55448639, + "num_input_tokens_seen": 356042900, + "step": 16496, + "time_per_iteration": 3.281053304672241 + }, + { + "auxiliary_loss_clip": 0.01071129, + "auxiliary_loss_mlp": 0.00749727, + "balance_loss_clip": 1.03144002, + "balance_loss_mlp": 1.00021362, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 2.0588081897544646, + "language_loss": 0.71332878, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73153734, + "num_input_tokens_seen": 356063000, + "step": 16497, + "time_per_iteration": 2.6373987197875977 + }, + { + "auxiliary_loss_clip": 0.01070341, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.03352869, + "balance_loss_mlp": 1.02047133, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 2.4879048607091017, + "language_loss": 0.82096261, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84199286, + "num_input_tokens_seen": 356078130, + "step": 16498, + "time_per_iteration": 2.651324510574341 + }, + { + "auxiliary_loss_clip": 0.01065021, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.03539145, + "balance_loss_mlp": 1.02548504, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.784405387526345, + "language_loss": 0.68353176, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70455062, + "num_input_tokens_seen": 356101655, + "step": 16499, + "time_per_iteration": 2.6786367893218994 + }, + { + "auxiliary_loss_clip": 0.01098457, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.03445876, + "balance_loss_mlp": 1.01572418, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 2.1061195396889127, + "language_loss": 0.82319897, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84445024, + "num_input_tokens_seen": 356121425, + "step": 16500, + "time_per_iteration": 2.510624647140503 + }, + { + "auxiliary_loss_clip": 0.01066096, + "auxiliary_loss_mlp": 0.01027514, + "balance_loss_clip": 1.03159535, + "balance_loss_mlp": 1.01592088, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.6785065194192523, + "language_loss": 0.81838655, + "learning_rate": 6.507115533036511e-10, + "loss": 0.83932269, + "num_input_tokens_seen": 356140710, + "step": 16501, + "time_per_iteration": 2.5984325408935547 + }, + { + "auxiliary_loss_clip": 0.01086747, + "auxiliary_loss_mlp": 0.01026032, + "balance_loss_clip": 1.03260875, + "balance_loss_mlp": 1.01488006, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 2.331830009590158, + "language_loss": 0.76642251, + "learning_rate": 6.408154723420711e-10, + "loss": 0.78755033, + "num_input_tokens_seen": 356159835, + "step": 16502, + "time_per_iteration": 2.580413579940796 + }, + { + "auxiliary_loss_clip": 0.0107347, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03230023, + "balance_loss_mlp": 1.01722217, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 6.31724337525984, + "language_loss": 0.71440995, + "learning_rate": 6.309952072811597e-10, + "loss": 0.73543692, + "num_input_tokens_seen": 356177555, + "step": 16503, + "time_per_iteration": 4.068079948425293 + }, + { + "auxiliary_loss_clip": 0.01012733, + "auxiliary_loss_mlp": 0.01008527, + "balance_loss_clip": 1.00310636, + "balance_loss_mlp": 1.00728738, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6390420042839627, + "language_loss": 0.55109996, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57131255, + "num_input_tokens_seen": 356244975, + "step": 16504, + "time_per_iteration": 3.1772525310516357 + }, + { + "auxiliary_loss_clip": 0.01063391, + "auxiliary_loss_mlp": 0.01023478, + "balance_loss_clip": 1.03134608, + "balance_loss_mlp": 1.01325536, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.9524731973063207, + "language_loss": 0.69514191, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71601063, + "num_input_tokens_seen": 356262605, + "step": 16505, + "time_per_iteration": 2.624199867248535 + }, + { + "auxiliary_loss_clip": 0.01061057, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.03128171, + "balance_loss_mlp": 1.01845217, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 2.079514274221254, + "language_loss": 0.65974975, + "learning_rate": 6.019893112119146e-10, + "loss": 0.68067086, + "num_input_tokens_seen": 356278935, + "step": 16506, + "time_per_iteration": 4.155103921890259 + }, + { + "auxiliary_loss_clip": 0.01033977, + "auxiliary_loss_mlp": 0.01026689, + "balance_loss_clip": 1.03136492, + "balance_loss_mlp": 1.0152564, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 1.833683854495791, + "language_loss": 0.62912363, + "learning_rate": 5.924723134487219e-10, + "loss": 0.64973027, + "num_input_tokens_seen": 356295675, + "step": 16507, + "time_per_iteration": 2.735111951828003 + }, + { + "auxiliary_loss_clip": 0.01097903, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.03375673, + "balance_loss_mlp": 1.02000093, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.3189220666609756, + "language_loss": 0.72356158, + "learning_rate": 5.830311334193983e-10, + "loss": 0.74485373, + "num_input_tokens_seen": 356312885, + "step": 16508, + "time_per_iteration": 2.577747106552124 + }, + { + "auxiliary_loss_clip": 0.0109653, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.03266335, + "balance_loss_mlp": 1.01688862, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.539353784653664, + "language_loss": 0.69977164, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72102177, + "num_input_tokens_seen": 356334070, + "step": 16509, + "time_per_iteration": 2.5209097862243652 + }, + { + "auxiliary_loss_clip": 0.01085672, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.03158712, + "balance_loss_mlp": 1.02026844, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.6885459280332016, + "language_loss": 0.68614388, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70731187, + "num_input_tokens_seen": 356359410, + "step": 16510, + "time_per_iteration": 2.9439003467559814 + }, + { + "auxiliary_loss_clip": 0.01062731, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.0328517, + "balance_loss_mlp": 1.01928663, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 3.679964386866404, + "language_loss": 0.80538476, + "learning_rate": 5.551625032997886e-10, + "loss": 0.82631981, + "num_input_tokens_seen": 356378345, + "step": 16511, + "time_per_iteration": 2.6372721195220947 + }, + { + "auxiliary_loss_clip": 0.01051441, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.02987981, + "balance_loss_mlp": 1.02038884, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.8995505946169136, + "language_loss": 0.91514051, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93596458, + "num_input_tokens_seen": 356397345, + "step": 16512, + "time_per_iteration": 2.6658530235290527 + }, + { + "auxiliary_loss_clip": 0.00992542, + "auxiliary_loss_mlp": 0.01003405, + "balance_loss_clip": 1.00295186, + "balance_loss_mlp": 1.00247478, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.6972419278917891, + "language_loss": 0.55259883, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57255828, + "num_input_tokens_seen": 356459160, + "step": 16513, + "time_per_iteration": 3.23760724067688 + }, + { + "auxiliary_loss_clip": 0.01068536, + "auxiliary_loss_mlp": 0.01027433, + "balance_loss_clip": 1.03143215, + "balance_loss_mlp": 1.01629901, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.3479270526436193, + "language_loss": 0.64674556, + "learning_rate": 5.279762455006054e-10, + "loss": 0.66770524, + "num_input_tokens_seen": 356486405, + "step": 16514, + "time_per_iteration": 2.902783155441284 + }, + { + "auxiliary_loss_clip": 0.01062701, + "auxiliary_loss_mlp": 0.01027923, + "balance_loss_clip": 1.03016663, + "balance_loss_mlp": 1.01576972, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 2.085602300968374, + "language_loss": 0.73220712, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75311339, + "num_input_tokens_seen": 356502905, + "step": 16515, + "time_per_iteration": 2.594106435775757 + }, + { + "auxiliary_loss_clip": 0.01049834, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.0300293, + "balance_loss_mlp": 1.02276039, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.5518148885291214, + "language_loss": 0.77340263, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79424948, + "num_input_tokens_seen": 356523830, + "step": 16516, + "time_per_iteration": 2.665739059448242 + }, + { + "auxiliary_loss_clip": 0.01065801, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.03274119, + "balance_loss_mlp": 1.01684141, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.529656290039931, + "language_loss": 0.78065032, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80157566, + "num_input_tokens_seen": 356543965, + "step": 16517, + "time_per_iteration": 2.6628897190093994 + }, + { + "auxiliary_loss_clip": 0.01086361, + "auxiliary_loss_mlp": 0.01036491, + "balance_loss_clip": 1.03530586, + "balance_loss_mlp": 1.02411103, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 3.110328451241328, + "language_loss": 0.67215019, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69337875, + "num_input_tokens_seen": 356561530, + "step": 16518, + "time_per_iteration": 4.1040565967559814 + }, + { + "auxiliary_loss_clip": 0.010043, + "auxiliary_loss_mlp": 0.01001152, + "balance_loss_clip": 1.00689566, + "balance_loss_mlp": 1.00021613, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.732263415193913, + "language_loss": 0.53445041, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55450499, + "num_input_tokens_seen": 356616845, + "step": 16519, + "time_per_iteration": 3.009605646133423 + }, + { + "auxiliary_loss_clip": 0.01048232, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.02863204, + "balance_loss_mlp": 1.02153373, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.6874424954786975, + "language_loss": 0.60228884, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62310803, + "num_input_tokens_seen": 356633560, + "step": 16520, + "time_per_iteration": 2.601407766342163 + }, + { + "auxiliary_loss_clip": 0.01063053, + "auxiliary_loss_mlp": 0.01027165, + "balance_loss_clip": 1.03066969, + "balance_loss_mlp": 1.01615596, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 2.9552795393341023, + "language_loss": 0.61936474, + "learning_rate": 4.671953657853223e-10, + "loss": 0.6402669, + "num_input_tokens_seen": 356657600, + "step": 16521, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.01075975, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.03584456, + "balance_loss_mlp": 1.02011621, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 1.7161992689696248, + "language_loss": 0.74375969, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76483893, + "num_input_tokens_seen": 356675880, + "step": 16522, + "time_per_iteration": 2.6130058765411377 + }, + { + "auxiliary_loss_clip": 0.01060113, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.03104043, + "balance_loss_mlp": 1.01987255, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.6819035517957674, + "language_loss": 0.7306242, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75153041, + "num_input_tokens_seen": 356696000, + "step": 16523, + "time_per_iteration": 2.6457483768463135 + }, + { + "auxiliary_loss_clip": 0.01071186, + "auxiliary_loss_mlp": 0.00749286, + "balance_loss_clip": 1.02977943, + "balance_loss_mlp": 1.00019598, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 2.339699168379076, + "language_loss": 0.71569097, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73389566, + "num_input_tokens_seen": 356716845, + "step": 16524, + "time_per_iteration": 2.6310973167419434 + }, + { + "auxiliary_loss_clip": 0.01064067, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_clip": 1.03231764, + "balance_loss_mlp": 1.01544845, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 1.801855669608302, + "language_loss": 0.79448438, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81538236, + "num_input_tokens_seen": 356732100, + "step": 16525, + "time_per_iteration": 2.6289963722229004 + }, + { + "auxiliary_loss_clip": 0.01052158, + "auxiliary_loss_mlp": 0.01024022, + "balance_loss_clip": 1.03219271, + "balance_loss_mlp": 1.01329899, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 1.8041296318253304, + "language_loss": 0.74485326, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.76561511, + "num_input_tokens_seen": 356751480, + "step": 16526, + "time_per_iteration": 2.6205711364746094 + }, + { + "auxiliary_loss_clip": 0.01093277, + "auxiliary_loss_mlp": 0.00749319, + "balance_loss_clip": 1.03199863, + "balance_loss_mlp": 1.00021505, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 1.4808171629848386, + "language_loss": 0.72437686, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74280286, + "num_input_tokens_seen": 356772650, + "step": 16527, + "time_per_iteration": 2.6281259059906006 + }, + { + "auxiliary_loss_clip": 0.01076864, + "auxiliary_loss_mlp": 0.01025134, + "balance_loss_clip": 1.03378725, + "balance_loss_mlp": 1.0142982, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.161583056225011, + "language_loss": 0.75979996, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78081989, + "num_input_tokens_seen": 356788510, + "step": 16528, + "time_per_iteration": 2.7187464237213135 + }, + { + "auxiliary_loss_clip": 0.01075757, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.03009939, + "balance_loss_mlp": 1.01687264, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 2.1111688658762993, + "language_loss": 0.67515278, + "learning_rate": 4.022808578922898e-10, + "loss": 0.69620126, + "num_input_tokens_seen": 356809115, + "step": 16529, + "time_per_iteration": 2.587883472442627 + }, + { + "auxiliary_loss_clip": 0.01092612, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.03593576, + "balance_loss_mlp": 1.01988709, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 3.328092086012569, + "language_loss": 0.65248793, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67374057, + "num_input_tokens_seen": 356826410, + "step": 16530, + "time_per_iteration": 2.559152126312256 + }, + { + "auxiliary_loss_clip": 0.0107977, + "auxiliary_loss_mlp": 0.010255, + "balance_loss_clip": 1.03241467, + "balance_loss_mlp": 1.01514685, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 2.541746319398807, + "language_loss": 0.71311951, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73417222, + "num_input_tokens_seen": 356844990, + "step": 16531, + "time_per_iteration": 2.55077862739563 + }, + { + "auxiliary_loss_clip": 0.01080095, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.03239894, + "balance_loss_mlp": 1.01801288, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.586565293498651, + "language_loss": 0.74116307, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76226199, + "num_input_tokens_seen": 356866530, + "step": 16532, + "time_per_iteration": 2.585630416870117 + }, + { + "auxiliary_loss_clip": 0.01040138, + "auxiliary_loss_mlp": 0.01030789, + "balance_loss_clip": 1.02887058, + "balance_loss_mlp": 1.02091885, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.5679404784967155, + "language_loss": 0.70373452, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72444373, + "num_input_tokens_seen": 356884660, + "step": 16533, + "time_per_iteration": 2.616689682006836 + }, + { + "auxiliary_loss_clip": 0.01086762, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.03401327, + "balance_loss_mlp": 1.01794612, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 2.375625384066092, + "language_loss": 0.83655643, + "learning_rate": 3.641735912007782e-10, + "loss": 0.85772109, + "num_input_tokens_seen": 356900895, + "step": 16534, + "time_per_iteration": 2.5684218406677246 + }, + { + "auxiliary_loss_clip": 0.01055007, + "auxiliary_loss_mlp": 0.01025769, + "balance_loss_clip": 1.02903795, + "balance_loss_mlp": 1.01540995, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.4227043898265774, + "language_loss": 0.65775537, + "learning_rate": 3.567796158934211e-10, + "loss": 0.67856312, + "num_input_tokens_seen": 356920985, + "step": 16535, + "time_per_iteration": 2.7073235511779785 + }, + { + "auxiliary_loss_clip": 0.01060873, + "auxiliary_loss_mlp": 0.01024047, + "balance_loss_clip": 1.0348264, + "balance_loss_mlp": 1.01450992, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.64899154125582, + "language_loss": 0.64888996, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66973913, + "num_input_tokens_seen": 356939800, + "step": 16536, + "time_per_iteration": 4.1557159423828125 + }, + { + "auxiliary_loss_clip": 0.01045644, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.02808285, + "balance_loss_mlp": 1.0183903, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.6761037582505207, + "language_loss": 0.78780615, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.8085739, + "num_input_tokens_seen": 356957780, + "step": 16537, + "time_per_iteration": 2.6156044006347656 + }, + { + "auxiliary_loss_clip": 0.01092056, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.03424263, + "balance_loss_mlp": 1.02033257, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.8242290694631156, + "language_loss": 0.68503577, + "learning_rate": 3.35052651107004e-10, + "loss": 0.7062732, + "num_input_tokens_seen": 356979185, + "step": 16538, + "time_per_iteration": 2.5248773097991943 + }, + { + "auxiliary_loss_clip": 0.01051669, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.02742529, + "balance_loss_mlp": 1.02269101, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.8546090598176725, + "language_loss": 0.75302917, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.77388632, + "num_input_tokens_seen": 356997735, + "step": 16539, + "time_per_iteration": 2.657763719558716 + }, + { + "auxiliary_loss_clip": 0.01048039, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.03241754, + "balance_loss_mlp": 1.0205543, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 2.0523485083317463, + "language_loss": 0.70734525, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72814429, + "num_input_tokens_seen": 357015660, + "step": 16540, + "time_per_iteration": 2.6895952224731445 + }, + { + "auxiliary_loss_clip": 0.0108318, + "auxiliary_loss_mlp": 0.01026118, + "balance_loss_clip": 1.03281939, + "balance_loss_mlp": 1.01641989, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.8459304169154334, + "language_loss": 0.75796318, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77905607, + "num_input_tokens_seen": 357034800, + "step": 16541, + "time_per_iteration": 2.589966058731079 + }, + { + "auxiliary_loss_clip": 0.01068073, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.03121459, + "balance_loss_mlp": 1.02116609, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 2.3412469034078023, + "language_loss": 0.76658219, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78758693, + "num_input_tokens_seen": 357053785, + "step": 16542, + "time_per_iteration": 2.632830858230591 + }, + { + "auxiliary_loss_clip": 0.01088262, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.03322387, + "balance_loss_mlp": 1.01634932, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.2388100719224595, + "language_loss": 0.7469914, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76815629, + "num_input_tokens_seen": 357072025, + "step": 16543, + "time_per_iteration": 4.069934368133545 + }, + { + "auxiliary_loss_clip": 0.01079832, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.03174734, + "balance_loss_mlp": 1.01717067, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.5625518821264515, + "language_loss": 0.81935966, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84044909, + "num_input_tokens_seen": 357086960, + "step": 16544, + "time_per_iteration": 2.518193483352661 + }, + { + "auxiliary_loss_clip": 0.01096681, + "auxiliary_loss_mlp": 0.01025668, + "balance_loss_clip": 1.03349829, + "balance_loss_mlp": 1.01488519, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 1.7417389940651296, + "language_loss": 0.78480679, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80603027, + "num_input_tokens_seen": 357105095, + "step": 16545, + "time_per_iteration": 2.501546859741211 + }, + { + "auxiliary_loss_clip": 0.01063799, + "auxiliary_loss_mlp": 0.01029828, + "balance_loss_clip": 1.03205252, + "balance_loss_mlp": 1.01861048, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.6881999500852556, + "language_loss": 0.72222364, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74315989, + "num_input_tokens_seen": 357125065, + "step": 16546, + "time_per_iteration": 4.239944934844971 + }, + { + "auxiliary_loss_clip": 0.01081177, + "auxiliary_loss_mlp": 0.01032165, + "balance_loss_clip": 1.03144324, + "balance_loss_mlp": 1.0215795, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 1.6792459844920353, + "language_loss": 0.77304912, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79418254, + "num_input_tokens_seen": 357141600, + "step": 16547, + "time_per_iteration": 2.53190541267395 + }, + { + "auxiliary_loss_clip": 0.01073126, + "auxiliary_loss_mlp": 0.01028811, + "balance_loss_clip": 1.03033769, + "balance_loss_mlp": 1.01826715, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.0302896351569966, + "language_loss": 0.70193779, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72295713, + "num_input_tokens_seen": 357157880, + "step": 16548, + "time_per_iteration": 2.574390172958374 + }, + { + "auxiliary_loss_clip": 0.01080518, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.031865, + "balance_loss_mlp": 1.01776123, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.9277702947631716, + "language_loss": 0.75121665, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77231026, + "num_input_tokens_seen": 357176705, + "step": 16549, + "time_per_iteration": 2.504263401031494 + }, + { + "auxiliary_loss_clip": 0.01068925, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.03329492, + "balance_loss_mlp": 1.02195454, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.7606289551156824, + "language_loss": 0.74601245, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.76703978, + "num_input_tokens_seen": 357197630, + "step": 16550, + "time_per_iteration": 2.6814029216766357 + }, + { + "auxiliary_loss_clip": 0.0104248, + "auxiliary_loss_mlp": 0.00749313, + "balance_loss_clip": 1.02775741, + "balance_loss_mlp": 1.00020015, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.6504622042575618, + "language_loss": 0.78246522, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80038321, + "num_input_tokens_seen": 357215445, + "step": 16551, + "time_per_iteration": 2.6682093143463135 + }, + { + "auxiliary_loss_clip": 0.01079508, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.03204644, + "balance_loss_mlp": 1.02060139, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.765380329993547, + "language_loss": 0.66645253, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68754917, + "num_input_tokens_seen": 357234285, + "step": 16552, + "time_per_iteration": 2.557605028152466 + }, + { + "auxiliary_loss_clip": 0.01098034, + "auxiliary_loss_mlp": 0.01023567, + "balance_loss_clip": 1.03359318, + "balance_loss_mlp": 1.0126946, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 2.0873062875455966, + "language_loss": 0.81461728, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83583325, + "num_input_tokens_seen": 357257565, + "step": 16553, + "time_per_iteration": 2.579073905944824 + }, + { + "auxiliary_loss_clip": 0.01013678, + "auxiliary_loss_mlp": 0.01001916, + "balance_loss_clip": 1.0040139, + "balance_loss_mlp": 1.00100982, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.7180936401353638, + "language_loss": 0.57288003, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59303606, + "num_input_tokens_seen": 357320205, + "step": 16554, + "time_per_iteration": 3.2057740688323975 + }, + { + "auxiliary_loss_clip": 0.01079943, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.03353167, + "balance_loss_mlp": 1.0222497, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.5999967204686418, + "language_loss": 0.77251786, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79364681, + "num_input_tokens_seen": 357340695, + "step": 16555, + "time_per_iteration": 2.555824041366577 + }, + { + "auxiliary_loss_clip": 0.01061765, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.03021908, + "balance_loss_mlp": 1.02087259, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 2.157866817488015, + "language_loss": 0.86278659, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88372761, + "num_input_tokens_seen": 357357505, + "step": 16556, + "time_per_iteration": 2.678834915161133 + }, + { + "auxiliary_loss_clip": 0.01051595, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.03186476, + "balance_loss_mlp": 1.01919103, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 3.0825336247934083, + "language_loss": 0.72778058, + "learning_rate": 2.132967729762125e-10, + "loss": 0.74860048, + "num_input_tokens_seen": 357375395, + "step": 16557, + "time_per_iteration": 2.667351007461548 + }, + { + "auxiliary_loss_clip": 0.01084547, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.03304362, + "balance_loss_mlp": 1.02228332, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 1.7868751107717717, + "language_loss": 0.76217926, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78334755, + "num_input_tokens_seen": 357397375, + "step": 16558, + "time_per_iteration": 4.118633508682251 + }, + { + "auxiliary_loss_clip": 0.01068899, + "auxiliary_loss_mlp": 0.01026343, + "balance_loss_clip": 1.02956986, + "balance_loss_mlp": 1.01495278, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 1.7833911188066573, + "language_loss": 0.63147455, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65242696, + "num_input_tokens_seen": 357418880, + "step": 16559, + "time_per_iteration": 2.633338451385498 + }, + { + "auxiliary_loss_clip": 0.01080452, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.03136253, + "balance_loss_mlp": 1.01499152, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 1.785287180380938, + "language_loss": 0.73956233, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76062787, + "num_input_tokens_seen": 357438310, + "step": 16560, + "time_per_iteration": 2.554521322250366 + }, + { + "auxiliary_loss_clip": 0.0103591, + "auxiliary_loss_mlp": 0.01026469, + "balance_loss_clip": 1.03022647, + "balance_loss_mlp": 1.01540637, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.8460753853284522, + "language_loss": 0.79248351, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.81310731, + "num_input_tokens_seen": 357457155, + "step": 16561, + "time_per_iteration": 2.6862308979034424 + }, + { + "auxiliary_loss_clip": 0.01094572, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.03393602, + "balance_loss_mlp": 1.0206809, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 2.2180410487885878, + "language_loss": 0.65650278, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67775786, + "num_input_tokens_seen": 357468060, + "step": 16562, + "time_per_iteration": 2.5079779624938965 + }, + { + "auxiliary_loss_clip": 0.01072922, + "auxiliary_loss_mlp": 0.00749398, + "balance_loss_clip": 1.03421462, + "balance_loss_mlp": 1.00030959, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 2.4439598476140234, + "language_loss": 0.64620095, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66442418, + "num_input_tokens_seen": 357489665, + "step": 16563, + "time_per_iteration": 2.6351873874664307 + }, + { + "auxiliary_loss_clip": 0.01072662, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.03174901, + "balance_loss_mlp": 1.01631451, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.5238494199264658, + "language_loss": 0.64454579, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66554725, + "num_input_tokens_seen": 357511975, + "step": 16564, + "time_per_iteration": 2.6593313217163086 + }, + { + "auxiliary_loss_clip": 0.0107279, + "auxiliary_loss_mlp": 0.00749207, + "balance_loss_clip": 1.03222251, + "balance_loss_mlp": 1.00023794, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 1.9887363519382462, + "language_loss": 0.74322599, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.76144594, + "num_input_tokens_seen": 357529345, + "step": 16565, + "time_per_iteration": 2.590322494506836 + }, + { + "auxiliary_loss_clip": 0.01075219, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.03027153, + "balance_loss_mlp": 1.02001667, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 3.1026165703903934, + "language_loss": 0.79082185, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81187642, + "num_input_tokens_seen": 357547615, + "step": 16566, + "time_per_iteration": 2.6250832080841064 + }, + { + "auxiliary_loss_clip": 0.01044878, + "auxiliary_loss_mlp": 0.00749329, + "balance_loss_clip": 1.03034258, + "balance_loss_mlp": 1.00029349, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.7273322152321136, + "language_loss": 0.71003276, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72797483, + "num_input_tokens_seen": 357567380, + "step": 16567, + "time_per_iteration": 2.7026796340942383 + }, + { + "auxiliary_loss_clip": 0.01083037, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.03176177, + "balance_loss_mlp": 1.01849675, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 1.8857481252901958, + "language_loss": 0.78683513, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.80796784, + "num_input_tokens_seen": 357586435, + "step": 16568, + "time_per_iteration": 2.5856070518493652 + }, + { + "auxiliary_loss_clip": 0.01082256, + "auxiliary_loss_mlp": 0.01024444, + "balance_loss_clip": 1.03245831, + "balance_loss_mlp": 1.01473427, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.6526639820239921, + "language_loss": 0.81986725, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.84093428, + "num_input_tokens_seen": 357604720, + "step": 16569, + "time_per_iteration": 2.5663363933563232 + }, + { + "auxiliary_loss_clip": 0.0105446, + "auxiliary_loss_mlp": 0.00749336, + "balance_loss_clip": 1.03145146, + "balance_loss_mlp": 1.00027275, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 1.8111168054488462, + "language_loss": 0.70320177, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72123975, + "num_input_tokens_seen": 357622345, + "step": 16570, + "time_per_iteration": 2.6336607933044434 + }, + { + "auxiliary_loss_clip": 0.01067239, + "auxiliary_loss_mlp": 0.01024074, + "balance_loss_clip": 1.03397727, + "balance_loss_mlp": 1.01304114, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.8996545238757985, + "language_loss": 0.7528246, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77373773, + "num_input_tokens_seen": 357642710, + "step": 16571, + "time_per_iteration": 2.6814656257629395 + }, + { + "auxiliary_loss_clip": 0.01074884, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.03180993, + "balance_loss_mlp": 1.02129793, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 2.321076334090005, + "language_loss": 0.79664397, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81771755, + "num_input_tokens_seen": 357659870, + "step": 16572, + "time_per_iteration": 2.555016040802002 + }, + { + "auxiliary_loss_clip": 0.01074056, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.0341711, + "balance_loss_mlp": 1.02008367, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.8065446675592638, + "language_loss": 0.69892514, + "learning_rate": 1.3199841727074e-10, + "loss": 0.71996975, + "num_input_tokens_seen": 357677075, + "step": 16573, + "time_per_iteration": 2.636780023574829 + }, + { + "auxiliary_loss_clip": 0.01074048, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.03218579, + "balance_loss_mlp": 1.02179897, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 2.0917348235198503, + "language_loss": 0.63607067, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65714443, + "num_input_tokens_seen": 357696715, + "step": 16574, + "time_per_iteration": 2.6056623458862305 + }, + { + "auxiliary_loss_clip": 0.010678, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.03513169, + "balance_loss_mlp": 1.02010906, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 2.0947327746600553, + "language_loss": 0.7615698, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78257084, + "num_input_tokens_seen": 357712345, + "step": 16575, + "time_per_iteration": 4.163519382476807 + }, + { + "auxiliary_loss_clip": 0.0107439, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03346336, + "balance_loss_mlp": 1.0191561, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 12.06505480549783, + "language_loss": 0.70530504, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.7263546, + "num_input_tokens_seen": 357731815, + "step": 16576, + "time_per_iteration": 2.5770938396453857 + }, + { + "auxiliary_loss_clip": 0.01083402, + "auxiliary_loss_mlp": 0.01026032, + "balance_loss_clip": 1.03211379, + "balance_loss_mlp": 1.01523185, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.6655568818927091, + "language_loss": 0.72087765, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74197197, + "num_input_tokens_seen": 357751640, + "step": 16577, + "time_per_iteration": 2.5601720809936523 + }, + { + "auxiliary_loss_clip": 0.01072497, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.03275299, + "balance_loss_mlp": 1.02179563, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 3.0590008955392607, + "language_loss": 0.7830019, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80404913, + "num_input_tokens_seen": 357769850, + "step": 16578, + "time_per_iteration": 2.525466203689575 + }, + { + "auxiliary_loss_clip": 0.01056463, + "auxiliary_loss_mlp": 0.00749277, + "balance_loss_clip": 1.03389537, + "balance_loss_mlp": 1.00017953, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.5922705018063583, + "language_loss": 0.76162279, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.77968019, + "num_input_tokens_seen": 357789550, + "step": 16579, + "time_per_iteration": 2.645961046218872 + }, + { + "auxiliary_loss_clip": 0.01074645, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.03370702, + "balance_loss_mlp": 1.02006114, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 2.4273619899679217, + "language_loss": 0.69699502, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71806669, + "num_input_tokens_seen": 357809525, + "step": 16580, + "time_per_iteration": 2.732248067855835 + }, + { + "auxiliary_loss_clip": 0.01042845, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.02904153, + "balance_loss_mlp": 1.01899993, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 2.4350308037564896, + "language_loss": 0.79803127, + "learning_rate": 9.862937031113184e-11, + "loss": 0.81875885, + "num_input_tokens_seen": 357829795, + "step": 16581, + "time_per_iteration": 2.672668695449829 + }, + { + "auxiliary_loss_clip": 0.01066493, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.03192341, + "balance_loss_mlp": 1.01603389, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.6996729083875979, + "language_loss": 0.80436087, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82528424, + "num_input_tokens_seen": 357851655, + "step": 16582, + "time_per_iteration": 2.6725013256073 + }, + { + "auxiliary_loss_clip": 0.01081046, + "auxiliary_loss_mlp": 0.01025153, + "balance_loss_clip": 1.03100669, + "balance_loss_mlp": 1.01468623, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 3.0286879518014165, + "language_loss": 0.60620046, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62726235, + "num_input_tokens_seen": 357871205, + "step": 16583, + "time_per_iteration": 4.10666036605835 + }, + { + "auxiliary_loss_clip": 0.01076497, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.03330898, + "balance_loss_mlp": 1.02240515, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.7558229517487391, + "language_loss": 0.77981651, + "learning_rate": 8.736727507452357e-11, + "loss": 0.80091405, + "num_input_tokens_seen": 357892145, + "step": 16584, + "time_per_iteration": 2.6052029132843018 + }, + { + "auxiliary_loss_clip": 0.01069829, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.030599, + "balance_loss_mlp": 1.01901531, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.5038414224721655, + "language_loss": 0.69589055, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71687829, + "num_input_tokens_seen": 357911205, + "step": 16585, + "time_per_iteration": 2.594876766204834 + }, + { + "auxiliary_loss_clip": 0.01096244, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.03398371, + "balance_loss_mlp": 1.01790977, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.5489085032588763, + "language_loss": 0.81564713, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83690244, + "num_input_tokens_seen": 357928190, + "step": 16586, + "time_per_iteration": 4.104680776596069 + }, + { + "auxiliary_loss_clip": 0.01070436, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.02987278, + "balance_loss_mlp": 1.02066278, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.62757847874175, + "language_loss": 0.7796551, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80066854, + "num_input_tokens_seen": 357946985, + "step": 16587, + "time_per_iteration": 2.6511175632476807 + }, + { + "auxiliary_loss_clip": 0.01069687, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.03318024, + "balance_loss_mlp": 1.02379799, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 1.7152966566304615, + "language_loss": 0.7266711, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74771559, + "num_input_tokens_seen": 357966720, + "step": 16588, + "time_per_iteration": 2.5708365440368652 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.03364623, + "balance_loss_mlp": 1.01564169, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.561708086218109, + "language_loss": 0.8276217, + "learning_rate": 7.011385585031781e-11, + "loss": 0.84889221, + "num_input_tokens_seen": 357981375, + "step": 16589, + "time_per_iteration": 2.5143120288848877 + }, + { + "auxiliary_loss_clip": 0.01088057, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.03233027, + "balance_loss_mlp": 1.02268541, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 2.29576756430647, + "language_loss": 0.705365, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72659957, + "num_input_tokens_seen": 358000290, + "step": 16590, + "time_per_iteration": 2.648700475692749 + }, + { + "auxiliary_loss_clip": 0.01073185, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.03228974, + "balance_loss_mlp": 1.01362252, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 1.657640408345688, + "language_loss": 0.63545215, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65643615, + "num_input_tokens_seen": 358022075, + "step": 16591, + "time_per_iteration": 2.687116861343384 + }, + { + "auxiliary_loss_clip": 0.01072839, + "auxiliary_loss_mlp": 0.01028469, + "balance_loss_clip": 1.0301888, + "balance_loss_mlp": 1.01808619, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 5.5338926459476765, + "language_loss": 0.73098481, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75199795, + "num_input_tokens_seen": 358043940, + "step": 16592, + "time_per_iteration": 2.714167356491089 + }, + { + "auxiliary_loss_clip": 0.01074594, + "auxiliary_loss_mlp": 0.01026525, + "balance_loss_clip": 1.03189838, + "balance_loss_mlp": 1.01514602, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.8157626226691188, + "language_loss": 0.84903854, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87004972, + "num_input_tokens_seen": 358062720, + "step": 16593, + "time_per_iteration": 2.5972118377685547 + }, + { + "auxiliary_loss_clip": 0.01093389, + "auxiliary_loss_mlp": 0.00749212, + "balance_loss_clip": 1.03296816, + "balance_loss_mlp": 1.00025189, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.6684876008054021, + "language_loss": 0.69420445, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71263051, + "num_input_tokens_seen": 358081560, + "step": 16594, + "time_per_iteration": 2.537583351135254 + }, + { + "auxiliary_loss_clip": 0.01056529, + "auxiliary_loss_mlp": 0.01023984, + "balance_loss_clip": 1.03392589, + "balance_loss_mlp": 1.01280797, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.2306614962660567, + "language_loss": 0.72318769, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.74399281, + "num_input_tokens_seen": 358099065, + "step": 16595, + "time_per_iteration": 2.651080369949341 + }, + { + "auxiliary_loss_clip": 0.01001254, + "auxiliary_loss_mlp": 0.01003942, + "balance_loss_clip": 1.0029546, + "balance_loss_mlp": 1.00302446, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.8147278897398839, + "language_loss": 0.60327023, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62332225, + "num_input_tokens_seen": 358156095, + "step": 16596, + "time_per_iteration": 2.994185447692871 + }, + { + "auxiliary_loss_clip": 0.01085156, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.03300905, + "balance_loss_mlp": 1.01722336, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 2.142420559176354, + "language_loss": 0.7745918, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79572296, + "num_input_tokens_seen": 358175230, + "step": 16597, + "time_per_iteration": 2.686643362045288 + }, + { + "auxiliary_loss_clip": 0.01084281, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.03141046, + "balance_loss_mlp": 1.01752687, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 1.9520183204085393, + "language_loss": 0.82460129, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84573269, + "num_input_tokens_seen": 358197075, + "step": 16598, + "time_per_iteration": 4.260613679885864 + }, + { + "auxiliary_loss_clip": 0.01070186, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.03283501, + "balance_loss_mlp": 1.02457345, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 1.9495660409440456, + "language_loss": 0.64198899, + "learning_rate": 4.129484715709175e-11, + "loss": 0.66305614, + "num_input_tokens_seen": 358215925, + "step": 16599, + "time_per_iteration": 2.725517988204956 + }, + { + "auxiliary_loss_clip": 0.01004089, + "auxiliary_loss_mlp": 0.01002493, + "balance_loss_clip": 1.0042274, + "balance_loss_mlp": 1.0014739, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8539902606153325, + "language_loss": 0.62270308, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64276886, + "num_input_tokens_seen": 358269035, + "step": 16600, + "time_per_iteration": 3.0295612812042236 + }, + { + "auxiliary_loss_clip": 0.01073097, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.03249586, + "balance_loss_mlp": 1.01942849, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 5.609533211354091, + "language_loss": 0.78454506, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80556536, + "num_input_tokens_seen": 358287680, + "step": 16601, + "time_per_iteration": 2.610806465148926 + }, + { + "auxiliary_loss_clip": 0.01064335, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.03201544, + "balance_loss_mlp": 1.02057528, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 1.8906857286931495, + "language_loss": 0.8250134, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84597003, + "num_input_tokens_seen": 358304080, + "step": 16602, + "time_per_iteration": 2.6575348377227783 + }, + { + "auxiliary_loss_clip": 0.01057096, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.03335392, + "balance_loss_mlp": 1.02209187, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 1.9480410531732524, + "language_loss": 0.62472403, + "learning_rate": 3.189071962883538e-11, + "loss": 0.6456244, + "num_input_tokens_seen": 358323670, + "step": 16603, + "time_per_iteration": 2.6274759769439697 + }, + { + "auxiliary_loss_clip": 0.01070827, + "auxiliary_loss_mlp": 0.01026112, + "balance_loss_clip": 1.03035188, + "balance_loss_mlp": 1.01496577, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 1.8223246143247958, + "language_loss": 0.7116127, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73258209, + "num_input_tokens_seen": 358341980, + "step": 16604, + "time_per_iteration": 2.691532850265503 + }, + { + "auxiliary_loss_clip": 0.01095949, + "auxiliary_loss_mlp": 0.01025154, + "balance_loss_clip": 1.03261495, + "balance_loss_mlp": 1.01399064, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.772420372732247, + "language_loss": 0.64437264, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66558373, + "num_input_tokens_seen": 358360400, + "step": 16605, + "time_per_iteration": 2.5158498287200928 + }, + { + "auxiliary_loss_clip": 0.01059275, + "auxiliary_loss_mlp": 0.01027397, + "balance_loss_clip": 1.02968001, + "balance_loss_mlp": 1.01710343, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.6196228774655859, + "language_loss": 0.71318173, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73404843, + "num_input_tokens_seen": 358378990, + "step": 16606, + "time_per_iteration": 2.5606772899627686 + }, + { + "auxiliary_loss_clip": 0.0108695, + "auxiliary_loss_mlp": 0.00749364, + "balance_loss_clip": 1.03364968, + "balance_loss_mlp": 1.00026309, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 1.8416776806830317, + "language_loss": 0.81748927, + "learning_rate": 2.370001590090709e-11, + "loss": 0.83585244, + "num_input_tokens_seen": 358395970, + "step": 16607, + "time_per_iteration": 2.572934627532959 + }, + { + "auxiliary_loss_clip": 0.01062675, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.03003299, + "balance_loss_mlp": 1.01798713, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.6077617699259166, + "language_loss": 0.67188013, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69280332, + "num_input_tokens_seen": 358417355, + "step": 16608, + "time_per_iteration": 2.679924488067627 + }, + { + "auxiliary_loss_clip": 0.01049536, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.03351927, + "balance_loss_mlp": 1.01701808, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 1.80742573755667, + "language_loss": 0.8059963, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.82677686, + "num_input_tokens_seen": 358434345, + "step": 16609, + "time_per_iteration": 2.620262861251831 + }, + { + "auxiliary_loss_clip": 0.01075073, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.03128457, + "balance_loss_mlp": 1.01834416, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.5300260576513478, + "language_loss": 0.62728512, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.64832819, + "num_input_tokens_seen": 358452870, + "step": 16610, + "time_per_iteration": 2.6722397804260254 + }, + { + "auxiliary_loss_clip": 0.01077089, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.03157747, + "balance_loss_mlp": 1.02122796, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 2.102782960362646, + "language_loss": 0.67789912, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69898915, + "num_input_tokens_seen": 358472210, + "step": 16611, + "time_per_iteration": 2.5468244552612305 + }, + { + "auxiliary_loss_clip": 0.01051364, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.03117156, + "balance_loss_mlp": 1.02356315, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.5016738177479212, + "language_loss": 0.6977334, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71860045, + "num_input_tokens_seen": 358493840, + "step": 16612, + "time_per_iteration": 2.682013750076294 + }, + { + "auxiliary_loss_clip": 0.01071249, + "auxiliary_loss_mlp": 0.01026769, + "balance_loss_clip": 1.03329515, + "balance_loss_mlp": 1.01665962, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.7006336649145242, + "language_loss": 0.73920625, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76018643, + "num_input_tokens_seen": 358515060, + "step": 16613, + "time_per_iteration": 2.7073628902435303 + }, + { + "auxiliary_loss_clip": 0.01057539, + "auxiliary_loss_mlp": 0.00749656, + "balance_loss_clip": 1.03073001, + "balance_loss_mlp": 1.00021815, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 2.3118099225482607, + "language_loss": 0.73406547, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75213742, + "num_input_tokens_seen": 358528200, + "step": 16614, + "time_per_iteration": 2.5756747722625732 + }, + { + "auxiliary_loss_clip": 0.01080018, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.03212738, + "balance_loss_mlp": 1.02095544, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 1.7982628006793668, + "language_loss": 0.73141319, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75252771, + "num_input_tokens_seen": 358548360, + "step": 16615, + "time_per_iteration": 4.052059650421143 + }, + { + "auxiliary_loss_clip": 0.01100156, + "auxiliary_loss_mlp": 0.00749305, + "balance_loss_clip": 1.03479052, + "balance_loss_mlp": 1.00021625, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 2.0486193003122843, + "language_loss": 0.77374655, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79224116, + "num_input_tokens_seen": 358566270, + "step": 16616, + "time_per_iteration": 2.560713529586792 + }, + { + "auxiliary_loss_clip": 0.01070056, + "auxiliary_loss_mlp": 0.0102659, + "balance_loss_clip": 1.03348935, + "balance_loss_mlp": 1.01594496, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 1.8186965701819404, + "language_loss": 0.82837915, + "learning_rate": 8.532016508855378e-12, + "loss": 0.84934562, + "num_input_tokens_seen": 358584710, + "step": 16617, + "time_per_iteration": 2.577199697494507 + }, + { + "auxiliary_loss_clip": 0.01074442, + "auxiliary_loss_mlp": 0.01024578, + "balance_loss_clip": 1.03134251, + "balance_loss_mlp": 1.01460648, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.9823761678681675, + "language_loss": 0.78389513, + "learning_rate": 7.43233506206309e-12, + "loss": 0.80488539, + "num_input_tokens_seen": 358606750, + "step": 16618, + "time_per_iteration": 2.6224164962768555 + }, + { + "auxiliary_loss_clip": 0.01094387, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.03206098, + "balance_loss_mlp": 1.01670229, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.5843072485409548, + "language_loss": 0.74820483, + "learning_rate": 6.408493534060255e-12, + "loss": 0.76942015, + "num_input_tokens_seen": 358624675, + "step": 16619, + "time_per_iteration": 2.5264461040496826 + }, + { + "auxiliary_loss_clip": 0.01081125, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.03092623, + "balance_loss_mlp": 1.01670599, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 1.9547570441673576, + "language_loss": 0.86601746, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88709176, + "num_input_tokens_seen": 358640715, + "step": 16620, + "time_per_iteration": 2.52394962310791 + }, + { + "auxiliary_loss_clip": 0.01055485, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.02868974, + "balance_loss_mlp": 1.01544428, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 1.888382038771388, + "language_loss": 0.7274884, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74829847, + "num_input_tokens_seen": 358659630, + "step": 16621, + "time_per_iteration": 2.7216224670410156 + }, + { + "auxiliary_loss_clip": 0.00994102, + "auxiliary_loss_mlp": 0.0100104, + "balance_loss_clip": 1.00483036, + "balance_loss_mlp": 1.00009823, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.7323890198810151, + "language_loss": 0.56523263, + "learning_rate": 3.79200883515729e-12, + "loss": 0.5851841, + "num_input_tokens_seen": 358727840, + "step": 16622, + "time_per_iteration": 4.8529322147369385 + }, + { + "auxiliary_loss_clip": 0.01053788, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.02976584, + "balance_loss_mlp": 1.0180347, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 2.79199573245598, + "language_loss": 0.70827675, + "learning_rate": 3.071527340914315e-12, + "loss": 0.72911024, + "num_input_tokens_seen": 358744125, + "step": 16623, + "time_per_iteration": 2.6425535678863525 + }, + { + "auxiliary_loss_clip": 0.0105323, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.03297544, + "balance_loss_mlp": 1.0188899, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 2.023746685906121, + "language_loss": 0.74410868, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76494598, + "num_input_tokens_seen": 358761420, + "step": 16624, + "time_per_iteration": 2.617309093475342 + }, + { + "auxiliary_loss_clip": 0.01060427, + "auxiliary_loss_mlp": 0.01028756, + "balance_loss_clip": 1.03339958, + "balance_loss_mlp": 1.01744866, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.5465379437704931, + "language_loss": 0.73751271, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.75840455, + "num_input_tokens_seen": 358782600, + "step": 16625, + "time_per_iteration": 2.705508232116699 + }, + { + "auxiliary_loss_clip": 0.01084805, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.03343832, + "balance_loss_mlp": 1.01852751, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.940256139140892, + "language_loss": 0.76702201, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.78815818, + "num_input_tokens_seen": 358801220, + "step": 16626, + "time_per_iteration": 4.1383161544799805 + }, + { + "auxiliary_loss_clip": 0.01095878, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.03459001, + "balance_loss_mlp": 1.02108026, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 2.198993171563383, + "language_loss": 0.82334936, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84462249, + "num_input_tokens_seen": 358819190, + "step": 16627, + "time_per_iteration": 2.5415196418762207 + }, + { + "auxiliary_loss_clip": 0.01081634, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.03373384, + "balance_loss_mlp": 1.02150905, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 3.163857554727078, + "language_loss": 0.70779634, + "learning_rate": 6.067215747584952e-13, + "loss": 0.72894651, + "num_input_tokens_seen": 358839850, + "step": 16628, + "time_per_iteration": 2.7379980087280273 + }, + { + "auxiliary_loss_clip": 0.01085733, + "auxiliary_loss_mlp": 0.01025758, + "balance_loss_clip": 1.03145623, + "balance_loss_mlp": 1.0153569, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.540709359218704, + "language_loss": 0.75199741, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.7731123, + "num_input_tokens_seen": 358859805, + "step": 16629, + "time_per_iteration": 2.5522866249084473 + }, + { + "auxiliary_loss_clip": 0.01077551, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.03299451, + "balance_loss_mlp": 1.02052248, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.704020888509457, + "language_loss": 0.60067493, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62176758, + "num_input_tokens_seen": 358877900, + "step": 16630, + "time_per_iteration": 2.530578374862671 + }, + { + "auxiliary_loss_clip": 0.01050249, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.03062809, + "balance_loss_mlp": 1.01609755, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 1.8253725141606487, + "language_loss": 0.60114855, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62192142, + "num_input_tokens_seen": 358897285, + "step": 16631, + "time_per_iteration": 2.654735803604126 + }, + { + "auxiliary_loss_clip": 0.01032112, + "auxiliary_loss_mlp": 0.00749361, + "balance_loss_clip": 1.03033471, + "balance_loss_mlp": 1.00023532, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.975073393630984, + "language_loss": 0.72807407, + "learning_rate": 0.0, + "loss": 0.74588883, + "num_input_tokens_seen": 358911570, + "step": 16632, + "time_per_iteration": 2.635075092315674 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3992169073237033e+18, + "train_loss": 0.7690299102653959, + "train_runtime": 47912.6079, + "train_samples_per_second": 13.886, + "train_steps_per_second": 0.347 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/training_args.bin b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..20fdb0de7610db9be028a9b5b0c1e7a948b3c026 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_down_router_final_0.001_competesmoev6/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aff4a80bb503f844b840910d3da0c7571d86c216bc9bc2e21a93a03c9a33c9c +size 7992